diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 153c630452..ff40b354d8 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -2,6 +2,9 @@ name: continuous build on: [push, pull_request] +permissions: + contents: read # to fetch code (actions/checkout) + jobs: build: runs-on: ${{ matrix.os }} @@ -34,7 +37,7 @@ jobs: - name: Install Dependencies run: | if [ "$RUNNER_OS" == "Linux" ]; then - sudo apt-get install -y gfortran cmake ccache + sudo apt-get install -y gfortran cmake ccache libtinfo5 elif [ "$RUNNER_OS" == "macOS" ]; then # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. brew reinstall gcc @@ -150,6 +153,7 @@ jobs: matrix: msystem: [MINGW64, MINGW32, CLANG64] idx: [int32, int64] + build-type: [Release] include: - msystem: MINGW64 idx: int32 @@ -173,6 +177,11 @@ jobs: idx64-flags: -DBINARY=64 -DINTERFACE64=1 target-prefix: mingw-w64-clang-x86_64 c-lapack-flags: -DC_LAPACK=ON + - msystem: MINGW64 + idx: int32 + target-prefix: mingw-w64-x86_64 + fc-pkg: mingw-w64-x86_64-gcc-fortran + build-type: None exclude: - msystem: MINGW32 idx: int64 @@ -215,11 +224,11 @@ jobs: path: C:/msys64/home/runneradmin/.ccache # We include the commit sha in the cache key, as new cache entries are # only created if there is no existing entry for the key yet. - key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ github.ref }}-${{ github.sha }} + key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }} # Restore a matching ccache cache entry. Prefer same branch. 
restore-keys: | - ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ github.ref }} - ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }} + ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }} + ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }} - name: Configure ccache # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota. @@ -235,7 +244,8 @@ jobs: - name: Configure OpenBLAS run: | mkdir build && cd build - cmake -DBUILD_SHARED_LIBS=ON \ + cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} \ + -DBUILD_SHARED_LIBS=ON \ -DBUILD_STATIC_LIBS=ON \ -DDYNAMIC_ARCH=ON \ -DUSE_THREAD=ON \ @@ -257,3 +267,54 @@ jobs: - name: Run tests timeout-minutes: 60 run: cd build && ctest + + + cross_build: + runs-on: ubuntu-22.04 + + strategy: + fail-fast: false + matrix: + include: + - target: mips64el + triple: mips64el-linux-gnuabi64 + opts: DYNAMIC_ARCH=1 TARGET=GENERIC + - target: riscv64 + triple: riscv64-linux-gnu + opts: TARGET=RISCV64_GENERIC + - target: mipsel + triple: mipsel-linux-gnu + opts: TARGET=MIPS1004K + - target: alpha + triple: alpha-linux-gnu + opts: TARGET=EV4 + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Install Dependencies + run: | + sudo apt-get install -y ccache gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-${{ matrix.target }}-cross + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.target }} + + - name: Configure ccache + run: | + # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB). 
+ test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + + - name: Build OpenBLAS + run: | + make -j$(nproc) HOSTCC="ccache gcc" CC="ccache ${{ matrix.triple }}-gcc" FC="ccache ${{ matrix.triple }}-gfortran" ARCH=${{ matrix.target }} ${{ matrix.opts }} diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml new file mode 100644 index 0000000000..de7c0c0f30 --- /dev/null +++ b/.github/workflows/mips64.yml @@ -0,0 +1,117 @@ +name: mips64 qemu test + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +jobs: + TEST: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - target: MIPS64_GENERIC + triple: mips64el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=MIPS64_GENERIC + - target: SICORTEX + triple: mips64el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=SICORTEX + - target: I6400 + triple: mipsisa64r6el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=I6400 + - target: P6600 + triple: mipsisa64r6el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=P6600 + - target: I6500 + triple: mipsisa64r6el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=I6500 + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: install build deps + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ + gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross + + - name: checkout qemu + uses: actions/checkout@v3 + with: + repository: qemu/qemu + path: qemu + ref: 79dfa177ae348bb5ab5f97c0915359b13d6186e2 + + - name: build qemu + run: | + cd qemu + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mips64el-linux-user --disable-system + make -j$(nproc) + make install + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ 
github.sha }} + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.target }} + + - name: Configure ccache + run: | + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: build OpenBLAS + run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) + + - name: test + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH + qemu-mips64el ./utest/openblas_utest + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat2 < ./ctest/sin2 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat2 < ./ctest/din2 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat2 < ./ctest/cin2 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat2 < ./ctest/zin2 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat3 < ./ctest/sin3 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat3 < ./ctest/din3 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat3 < ./ctest/cin3 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat3 < ./ctest/zin3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat1 + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=1 
OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index 29ec96f73c..96063565d1 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -17,6 +17,10 @@ on: # it only makes sense to test if this file has been changed name: Nightly-Homebrew-Build + +permissions: + contents: read # to fetch code (actions/checkout) + jobs: build-OpenBLAS-with-Homebrew: runs-on: macos-latest @@ -28,6 +32,8 @@ jobs: HOMEBREW_NO_AUTO_UPDATE: "ON" HOMEBREW_NO_BOTTLE_SOURCE_FALLBACK: "ON" HOMEBREW_NO_INSTALL_CLEANUP: "ON" + 
HOMEBREW_NO_INSTALLED_DEPENDENTS_CHECK: "ON" + HOMEBREW_NO_INSTALL_FROM_API: "ON" steps: - name: Random delay for cron job diff --git a/.travis.yml b/.travis.yml index 531377456c..a9fc94f5a7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,7 +30,7 @@ matrix: before_script: &common-before - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" script: - - travis_wait 20 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + - travis_wait 50 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE - make -C utest $COMMON_FLAGS $BTYPE @@ -104,7 +104,7 @@ matrix: - sudo apt-get update - sudo apt-get install gcc-9 gfortran-9 -y script: - - travis_wait 20 make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 + - travis_wait 50 make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE - make -C utest $COMMON_FLAGS $BTYPE @@ -121,7 +121,7 @@ matrix: - sudo apt-get update - sudo apt-get install gcc-9 gfortran-9 -y script: - - travis_wait 20 make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 + - travis_wait 50 make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE - make -C utest $COMMON_FLAGS $BTYPE @@ -285,6 +285,25 @@ matrix: - gfortran script: - travis_wait 45 make && make lapack-test + env: + - TARGET_BOX=NEOVERSE_N1 + + - &test-neon1-gcc8 + os: linux + arch: arm64 + dist: focal + group: edge + virt: lxd + compiler: gcc + addons: + apt: + packages: + - gcc-8 + - gfortran-8 + script: + - travis_wait 45 make QUIET_MAKE=1 CC=gcc-8 FC=gfortran-8 DYNAMIC_ARCH=1 + env: + - TARGET_BOX=NEOVERSE_N1-GCC8 # whitelist branches: diff --git a/CMakeLists.txt b/CMakeLists.txt index 11e2a922ee..458a594907 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) 
set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 21) +set(OpenBLAS_PATCH_VERSION 22) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") @@ -36,6 +36,8 @@ option(USE_LOCKING "Use locks even in single-threaded builds to make them callab option(USE_PERL "Use the older PERL scripts for build preparation instead of universal shell scripts" OFF) +option(NO_WARMUP "Do not run a benchmark on each startup just to find the best location for the memory buffer" ON) + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) else() @@ -212,10 +214,10 @@ if(NOT NO_LAPACKE) add_library(LAPACKE OBJECT ${LAPACKE_SOURCES}) list(APPEND TARGET_OBJS "$") endif() -if(BUILD_RELAPACK) - add_library(RELAPACK OBJECT ${RELA_SOURCES}) - list(APPEND TARGET_OBJS "$") -endif() +#if(BUILD_RELAPACK) +# add_library(RELAPACK OBJECT ${RELA_SOURCES}) +# list(APPEND TARGET_OBJS "$") +#endif() set(OpenBLAS_LIBS "") if(BUILD_STATIC_LIBS) add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) @@ -236,7 +238,7 @@ endif() set_target_properties(${OpenBLAS_LIBS} PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME}) # Android needs to explicitly link against libm -if(ANDROID) +if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|DragonFly|Darwin") if(BUILD_STATIC_LIBS) target_link_libraries(${OpenBLAS_LIBNAME}_static m) endif() @@ -396,7 +398,7 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") endif() - if (NOT DEFINED USE_PERL) + if (NOT USE_PERL) add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" 
"${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 1714d90c80..f5e9dda918 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -211,4 +211,5 @@ In chronological order: * PLCT Lab, Institute of Software Chinese Academy of Sciences * [2022-03] Support RISC-V Vector Intrinisc 1.0 version. - \ No newline at end of file +* Pablo Romero + * [2022-08] Fix building from sources for QNX \ No newline at end of file diff --git a/GotoBLAS_01Readme.txt b/GotoBLAS_01Readme.txt index 8635ceb88d..0f05ececbb 100644 --- a/GotoBLAS_01Readme.txt +++ b/GotoBLAS_01Readme.txt @@ -80,7 +80,7 @@ SUN Fujitsu -4. Suported precision +4. Supported precision Now x86/x86_64 version support 80bit FP precision in addition to normal double presicion and single precision. Currently only diff --git a/Makefile b/Makefile index 967ab1bb68..144b3400db 100644 --- a/Makefile +++ b/Makefile @@ -110,6 +110,10 @@ ifeq ($(OSNAME), Darwin) @echo "\"make PREFIX=/your_installation_path/ install\"." @echo @echo "(or set PREFIX in Makefile.rule and run make install." + @echo + @echo "Note that any flags passed to make during build should also be passed to make install" + @echo "to circumvent any install errors." + @echo @echo "If you want to move the .dylib to a new location later, make sure you change" @echo "the internal name of the dylib with:" @echo @@ -118,8 +122,11 @@ endif @echo @echo "To install the library, you can run \"make PREFIX=/path/to/your/installation install\"." @echo + @echo "Note that any flags passed to make during build should also be passed to make install" + @echo "to circumvent any install errors." 
+ @echo -shared : +shared : libs netlib $(RELA) ifneq ($(NO_SHARED), 1) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) @$(MAKE) -C exports so @@ -143,7 +150,7 @@ ifeq ($(OSNAME), CYGWIN_NT) endif endif -tests : +tests : libs netlib $(RELA) shared ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) touch $(LIBNAME) ifndef NO_FBLAS @@ -271,7 +278,11 @@ prof_lapack : lapack_prebuild lapack_prebuild : ifeq ($(NO_LAPACK), $(filter 0,$(NO_LAPACK))) -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc +ifeq ($(F_COMPILER), GFORTRAN) + -@echo "override FFLAGS = $(LAPACK_FFLAGS) -fno-tree-vectorize" >> $(NETLIB_LAPACK_DIR)/make.inc +else -@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc +endif -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc diff --git a/Makefile.L3 b/Makefile.L3 new file mode 100644 index 0000000000..76586d826b --- /dev/null +++ b/Makefile.L3 @@ -0,0 +1,5089 @@ +USE_GEMM3M = 0 +OS := $(shell uname) + +ifeq ($(ARCH), x86) +USE_GEMM3M = 1 +endif + +ifeq ($(ARCH), x86_64) +USE_GEMM3M = 1 +endif + +ifeq ($(ARCH), x86_64) +USE_DIRECT_SGEMM = 1 +endif + +ifeq ($(ARCH), ia64) +USE_GEMM3M = 1 +endif + +ifeq ($(ARCH), arm) +USE_TRMM = 1 +endif + +ifeq ($(ARCH), arm64) +USE_TRMM = 1 +endif + +ifeq ($(ARCH), riscv64) +USE_TRMM = 1 +endif + +ifneq ($(DYNAMIC_ARCH), 1) +ifeq ($(TARGET), GENERIC) +USE_TRMM = 1 +endif +endif + +ifeq ($(CORE), HASWELL) +USE_TRMM = 1 +endif + +ifeq ($(CORE), SKYLAKEX) +USE_TRMM = 1 +endif + +ifeq ($(CORE), COOPERLAKE) +USE_TRMM = 1 +endif + +ifeq ($(CORE), SAPPHIRERAPIDS) +USE_TRMM = 1 +endif + +ifeq ($(CORE), ZEN) +USE_TRMM = 1 +endif + +ifeq ($(CORE), POWER8) +ifeq ($(BINARY64),1) +USE_TRMM = 1 +endif +endif + +ifeq ($(CORE), POWER9) +USE_TRMM = 1 +endif + +ifeq ($(CORE), POWER10) +USE_TRMM = 1 +endif + +ifeq ($(ARCH), 
zarch) +USE_TRMM = 1 +endif + +ifeq ($(CORE), Z14) +USE_TRMM = 1 +endif + +ifdef USE_DIRECT_SGEMM +ifndef SGEMMDIRECTKERNEL +SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c +SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c +endif +endif + +ifeq ($(BUILD_BFLOAT16), 1) +ifndef SBGEMMKERNEL +SBGEMM_BETA = ../generic/gemm_beta.c +SBGEMMKERNEL = ../generic/gemmkernel_2x2.c +SBGEMMINCOPY = ../generic/gemm_ncopy_2.c +SBGEMMITCOPY = ../generic/gemm_tcopy_2.c +SBGEMMONCOPY = ../generic/gemm_ncopy_2.c +SBGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) +SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) +SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) +SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +SBKERNELOBJS += \ + sbgemm_kernel$(TSUFFIX).$(SUFFIX) \ + $(SBGEMMINCOPYOBJ) $(SBGEMMITCOPYOBJ) \ + $(SBGEMMONCOPYOBJ) $(SBGEMMOTCOPYOBJ) +endif + +ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" +SKERNELOBJS += \ + sgemm_kernel$(TSUFFIX).$(SUFFIX) \ + sgemm_beta$(TSUFFIX).$(SUFFIX) \ + $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ + $(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) + +ifdef USE_DIRECT_SGEMM +SKERNELOBJS += \ + sgemm_direct$(TSUFFIX).$(SUFFIX) \ + sgemm_direct_performant$(TSUFFIX).$(SUFFIX) +endif +endif + +ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" +DKERNELOBJS += \ + dgemm_beta$(TSUFFIX).$(SUFFIX) \ + dgemm_kernel$(TSUFFIX).$(SUFFIX) \ + $(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \ + $(DGEMMONCOPYOBJ) $(DGEMMOTCOPYOBJ) +endif + +QKERNELOBJS += \ + qgemm_kernel$(TSUFFIX).$(SUFFIX) \ + $(QGEMMINCOPYOBJ) $(QGEMMITCOPYOBJ) \ + $(QGEMMONCOPYOBJ) $(QGEMMOTCOPYOBJ) + +ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" +CKERNELOBJS += \ + cgemm_kernel_n$(TSUFFIX).$(SUFFIX) cgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ + cgemm_kernel_l$(TSUFFIX).$(SUFFIX) cgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ + $(CGEMMINCOPYOBJ) $(CGEMMITCOPYOBJ) \ + $(CGEMMONCOPYOBJ) $(CGEMMOTCOPYOBJ) +endif + +ifeq ($(BUILD_COMPLEX16),1) +ZKERNELOBJS += 
\ + zgemm_kernel_n$(TSUFFIX).$(SUFFIX) zgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ + zgemm_kernel_l$(TSUFFIX).$(SUFFIX) zgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ + $(ZGEMMINCOPYOBJ) $(ZGEMMITCOPYOBJ) \ + $(ZGEMMONCOPYOBJ) $(ZGEMMOTCOPYOBJ) +endif + +XKERNELOBJS += \ + xgemm_kernel_n$(TSUFFIX).$(SUFFIX) xgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ + xgemm_kernel_l$(TSUFFIX).$(SUFFIX) xgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ + $(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \ + $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) + +ifeq ($(BUILD_BFLOAT16),1) +SBBLASOBJS += $(SBKERNELOBJS) +endif +SBLASOBJS += $(SKERNELOBJS) +DBLASOBJS += $(DKERNELOBJS) +QBLASOBJS += $(QKERNELOBJS) +CBLASOBJS += $(CKERNELOBJS) +ZBLASOBJS += $(ZKERNELOBJS) +XBLASOBJS += $(XKERNELOBJS) + +ifeq ($(BUILD_BFLOAT16),1) +SBBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX) +endif + +ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" +SBLASOBJS += \ + sgemm_beta$(TSUFFIX).$(SUFFIX) \ + strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + strmm_kernel_RN$(TSUFFIX).$(SUFFIX) strmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + strsm_kernel_LN$(TSUFFIX).$(SUFFIX) strsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + strsm_kernel_RN$(TSUFFIX).$(SUFFIX) strsm_kernel_RT$(TSUFFIX).$(SUFFIX) +endif + +ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" +DBLASOBJS += \ + dgemm_beta$(TSUFFIX).$(SUFFIX) \ + dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) +endif + +QBLASOBJS += \ + qgemm_beta$(TSUFFIX).$(SUFFIX) \ + qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) 
qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) + +ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" +CBLASOBJS += \ + ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ + ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) +endif +ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" +CBLASOBJS += \ + cgemm_beta$(TSUFFIX).$(SUFFIX) \ + ctrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + ctrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ + ctrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(BUILD_COMPLEX16),1) +ZBLASOBJS += \ + zgemm_beta$(TSUFFIX).$(SUFFIX) \ + ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ + ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + ztrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + ztrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ + ztrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX) +endif + +XBLASOBJS += \ + xgemm_beta$(TSUFFIX).$(SUFFIX) \ + xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ + xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + xtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + xtrsm_kernel_LR$(TSUFFIX).$(SUFFIX) 
xtrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ + xtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) + +ifeq ($(USE_GEMM3M), 1) + +CBLASOBJS += cgemm3m_kernel$(TSUFFIX).$(SUFFIX) +ZBLASOBJS += zgemm3m_kernel$(TSUFFIX).$(SUFFIX) +XBLASOBJS += xgemm3m_kernel$(TSUFFIX).$(SUFFIX) + +endif + +ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" +SBLASOBJS += \ + strmm_iunucopy$(TSUFFIX).$(SUFFIX) strmm_iunncopy$(TSUFFIX).$(SUFFIX) \ + strmm_ilnucopy$(TSUFFIX).$(SUFFIX) strmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + strmm_iutucopy$(TSUFFIX).$(SUFFIX) strmm_iutncopy$(TSUFFIX).$(SUFFIX) \ + strmm_iltucopy$(TSUFFIX).$(SUFFIX) strmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + strmm_ounucopy$(TSUFFIX).$(SUFFIX) strmm_ounncopy$(TSUFFIX).$(SUFFIX) \ + strmm_olnucopy$(TSUFFIX).$(SUFFIX) strmm_olnncopy$(TSUFFIX).$(SUFFIX) \ + strmm_outucopy$(TSUFFIX).$(SUFFIX) strmm_outncopy$(TSUFFIX).$(SUFFIX) \ + strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + strsm_iunucopy$(TSUFFIX).$(SUFFIX) strsm_iunncopy$(TSUFFIX).$(SUFFIX) \ + strsm_ilnucopy$(TSUFFIX).$(SUFFIX) strsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + strsm_iutucopy$(TSUFFIX).$(SUFFIX) strsm_iutncopy$(TSUFFIX).$(SUFFIX) \ + strsm_iltucopy$(TSUFFIX).$(SUFFIX) strsm_iltncopy$(TSUFFIX).$(SUFFIX) \ + strsm_ounucopy$(TSUFFIX).$(SUFFIX) strsm_ounncopy$(TSUFFIX).$(SUFFIX) \ + strsm_olnucopy$(TSUFFIX).$(SUFFIX) strsm_olnncopy$(TSUFFIX).$(SUFFIX) \ + strsm_outucopy$(TSUFFIX).$(SUFFIX) strsm_outncopy$(TSUFFIX).$(SUFFIX) \ + strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX) \ + ssymm_iutcopy$(TSUFFIX).$(SUFFIX) ssymm_iltcopy$(TSUFFIX).$(SUFFIX) \ + ssymm_outcopy$(TSUFFIX).$(SUFFIX) ssymm_oltcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(BUILD_DOUBLE),1) +DBLASOBJS += \ + dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ + dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + 
dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ + dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) dtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ + dtrmm_olnucopy$(TSUFFIX).$(SUFFIX) dtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ + dtrmm_outucopy$(TSUFFIX).$(SUFFIX) dtrmm_outncopy$(TSUFFIX).$(SUFFIX) \ + dtrmm_oltucopy$(TSUFFIX).$(SUFFIX) dtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) dtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_olnucopy$(TSUFFIX).$(SUFFIX) dtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_outucopy$(TSUFFIX).$(SUFFIX) dtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_oltucopy$(TSUFFIX).$(SUFFIX) dtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ + dsymm_iutcopy$(TSUFFIX).$(SUFFIX) dsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ + dsymm_outcopy$(TSUFFIX).$(SUFFIX) dsymm_oltcopy$(TSUFFIX).$(SUFFIX) +endif + +QBLASOBJS += \ + qtrmm_iunucopy$(TSUFFIX).$(SUFFIX) qtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ + qtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) qtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + qtrmm_iutucopy$(TSUFFIX).$(SUFFIX) qtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ + qtrmm_iltucopy$(TSUFFIX).$(SUFFIX) qtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + qtrmm_ounucopy$(TSUFFIX).$(SUFFIX) qtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ + qtrmm_olnucopy$(TSUFFIX).$(SUFFIX) qtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ + qtrmm_outucopy$(TSUFFIX).$(SUFFIX) qtrmm_outncopy$(TSUFFIX).$(SUFFIX) \ + qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_iunucopy$(TSUFFIX).$(SUFFIX) qtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) qtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_iutucopy$(TSUFFIX).$(SUFFIX) 
qtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_iltucopy$(TSUFFIX).$(SUFFIX) qtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_ounucopy$(TSUFFIX).$(SUFFIX) qtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_olnucopy$(TSUFFIX).$(SUFFIX) qtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_outucopy$(TSUFFIX).$(SUFFIX) qtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ + qsymm_iutcopy$(TSUFFIX).$(SUFFIX) qsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ + qsymm_outcopy$(TSUFFIX).$(SUFFIX) qsymm_oltcopy$(TSUFFIX).$(SUFFIX) + +ifeq ($(BUILD_COMPLEX),1) +CBLASOBJS += \ + ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ + ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ + ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) ctrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ + ctrmm_olnucopy$(TSUFFIX).$(SUFFIX) ctrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ + ctrmm_outucopy$(TSUFFIX).$(SUFFIX) ctrmm_outncopy$(TSUFFIX).$(SUFFIX) \ + ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + csymm_iutcopy$(TSUFFIX).$(SUFFIX) csymm_iltcopy$(TSUFFIX).$(SUFFIX) \ + csymm_outcopy$(TSUFFIX).$(SUFFIX) csymm_oltcopy$(TSUFFIX).$(SUFFIX) \ + chemm_iutcopy$(TSUFFIX).$(SUFFIX) chemm_iltcopy$(TSUFFIX).$(SUFFIX) \ + chemm_outcopy$(TSUFFIX).$(SUFFIX) chemm_oltcopy$(TSUFFIX).$(SUFFIX) +endif +ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" +CBLASOBJS += \ + ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) ctrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_olnucopy$(TSUFFIX).$(SUFFIX) ctrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ + 
ctrsm_outucopy$(TSUFFIX).$(SUFFIX) ctrsm_outncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(BUILD_COMPLEX16),1) +ZBLASOBJS += \ + ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ + ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ + ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) ztrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ + ztrmm_olnucopy$(TSUFFIX).$(SUFFIX) ztrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ + ztrmm_outucopy$(TSUFFIX).$(SUFFIX) ztrmm_outncopy$(TSUFFIX).$(SUFFIX) \ + ztrmm_oltucopy$(TSUFFIX).$(SUFFIX) ztrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) ztrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_olnucopy$(TSUFFIX).$(SUFFIX) ztrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_outucopy$(TSUFFIX).$(SUFFIX) ztrsm_outncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_oltucopy$(TSUFFIX).$(SUFFIX) ztrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ + zsymm_iutcopy$(TSUFFIX).$(SUFFIX) zsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ + zsymm_outcopy$(TSUFFIX).$(SUFFIX) zsymm_oltcopy$(TSUFFIX).$(SUFFIX) \ + zhemm_iutcopy$(TSUFFIX).$(SUFFIX) zhemm_iltcopy$(TSUFFIX).$(SUFFIX) \ + zhemm_outcopy$(TSUFFIX).$(SUFFIX) zhemm_oltcopy$(TSUFFIX).$(SUFFIX) +endif + +XBLASOBJS += \ + xtrmm_iunucopy$(TSUFFIX).$(SUFFIX) xtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ + xtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) xtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + xtrmm_iutucopy$(TSUFFIX).$(SUFFIX) xtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ + xtrmm_iltucopy$(TSUFFIX).$(SUFFIX) xtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + 
xtrmm_ounucopy$(TSUFFIX).$(SUFFIX) xtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ + xtrmm_olnucopy$(TSUFFIX).$(SUFFIX) xtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ + xtrmm_outucopy$(TSUFFIX).$(SUFFIX) xtrmm_outncopy$(TSUFFIX).$(SUFFIX) \ + xtrmm_oltucopy$(TSUFFIX).$(SUFFIX) xtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_iunucopy$(TSUFFIX).$(SUFFIX) xtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) xtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_iutucopy$(TSUFFIX).$(SUFFIX) xtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_iltucopy$(TSUFFIX).$(SUFFIX) xtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_ounucopy$(TSUFFIX).$(SUFFIX) xtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_olnucopy$(TSUFFIX).$(SUFFIX) xtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_outucopy$(TSUFFIX).$(SUFFIX) xtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_oltucopy$(TSUFFIX).$(SUFFIX) xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ + xsymm_iutcopy$(TSUFFIX).$(SUFFIX) xsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ + xsymm_outcopy$(TSUFFIX).$(SUFFIX) xsymm_oltcopy$(TSUFFIX).$(SUFFIX) \ + xhemm_iutcopy$(TSUFFIX).$(SUFFIX) xhemm_iltcopy$(TSUFFIX).$(SUFFIX) \ + xhemm_outcopy$(TSUFFIX).$(SUFFIX) xhemm_oltcopy$(TSUFFIX).$(SUFFIX) + +ifeq ($(USE_GEMM3M), 1) + +ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" +CBLASOBJS += \ + cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ + cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ + cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \ + cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \ + cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \ + cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \ + csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ + csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ + csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ + csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) 
csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ + csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ + csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \ + chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ + chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ + chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ + chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ + chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ + chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(BUILD_COMPLEX16),1) +ZBLASOBJS += \ + zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ + zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ + zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \ + zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \ + zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \ + zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \ + zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ + zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ + zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ + zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ + zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ + zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \ + zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ + zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ + zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ + zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ + zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ + 
zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) +endif + +XBLASOBJS += \ + xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ + xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ + xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \ + xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \ + xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \ + xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \ + xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ + xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ + xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ + xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ + xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ + xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \ + xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ + xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ + xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ + xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ + xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ + xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) + +endif + +###### BLAS small matrix optimization ##### +ifeq ($(SMALL_MATRIX_OPT), 1) + +ifeq ($(BUILD_BFLOAT16),1) +SBBLASOBJS += \ + sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) 
sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) +endif + +SBLASOBJS += \ + sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) + +DBLASOBJS += \ + dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) + +CBLASOBJS += \ + cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \ + 
cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) + +ZBLASOBJS += \ + zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) + +endif + 
+###### BLAS extensions ##### + +ifeq ($(BUILD_SINGLE),1) +SBLASOBJS += \ + somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + simatcopy_k_cn$(TSUFFIX).$(SUFFIX) simatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + simatcopy_k_ct$(TSUFFIX).$(SUFFIX) simatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + sgeadd_k$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(BUILD_DOUBLE),1) +DBLASOBJS += \ + domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + dgeadd_k$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(BUILD_COMPLEX),1) +CBLASOBJS += \ + comatcopy_k_cn$(TSUFFIX).$(SUFFIX) comatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ + comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ + cimatcopy_k_cn$(TSUFFIX).$(SUFFIX) cimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + cimatcopy_k_ct$(TSUFFIX).$(SUFFIX) cimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ + cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ + cgeadd_k$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(BUILD_COMPLEX16),1) +ZBLASOBJS += \ + zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ + zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ + zimatcopy_k_cn$(TSUFFIX).$(SUFFIX) zimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + zimatcopy_k_ct$(TSUFFIX).$(SUFFIX) zimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) 
zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ + zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ + zgeadd_k$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(BUILD_BFLOAT16), 1) +SBGEMMINCOPYOBJ_P = $(SBGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SBGEMMITCOPYOBJ_P = $(SBGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SBGEMMONCOPYOBJ_P = $(SBGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SBGEMMOTCOPYOBJ_P = $(SBGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +endif + +SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SGEMMOTCOPYOBJ_P = $(SGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +DGEMMINCOPYOBJ_P = $(DGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +DGEMMITCOPYOBJ_P = $(DGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +DGEMMONCOPYOBJ_P = $(DGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +DGEMMOTCOPYOBJ_P = $(DGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +QGEMMINCOPYOBJ_P = $(QGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +QGEMMITCOPYOBJ_P = $(QGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +QGEMMONCOPYOBJ_P = $(QGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +QGEMMOTCOPYOBJ_P = $(QGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +CGEMMINCOPYOBJ_P = $(CGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +CGEMMITCOPYOBJ_P = $(CGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +CGEMMONCOPYOBJ_P = $(CGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +CGEMMOTCOPYOBJ_P = $(CGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +ZGEMMINCOPYOBJ_P = $(ZGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +ZGEMMITCOPYOBJ_P = $(ZGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +ZGEMMONCOPYOBJ_P = $(ZGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +ZGEMMOTCOPYOBJ_P = $(ZGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +XGEMMINCOPYOBJ_P = $(XGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) + +ifeq ($(BUILD_BFLOAT16),1) +$(KDIR)sbgemm_beta$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(SBGEMM_BETA) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ +endif + +$(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)qgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMM_BETA) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)cgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_BETA) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@ + +$(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@ + +$(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ + +ifeq ($(ARCH), E2K) +USE_TRMM = 1 +endif + + +ifeq ($(BUILD_BFLOAT16), 1) + +$(KDIR)$(SBGEMMONCOPYOBJ) : $(KERNELDIR)/$(SBGEMMONCOPY) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(SBGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SBGEMMOTCOPY) + +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmotcopy.s + m4 sbgemmotcopy.s > sbgemmotcopy_nomacros.s + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmotcopy_nomacros.s -o $@ + rm sbgemmotcopy.s sbgemmotcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ +endif + +ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) + +$(KDIR)$(SBGEMMINCOPYOBJ) : $(KERNELDIR)/$(SBGEMMINCOPY) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(SBGEMMITCOPYOBJ) : $(KERNELDIR)/$(SBGEMMITCOPY) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmitcopy.s + m4 sbgemmitcopy.s > sbgemmitcopy_nomacros.s + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmitcopy_nomacros.s -o $@ + rm sbgemmitcopy.s sbgemmitcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ +endif + +endif +endif + 
+$(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s + m4 sgemmotcopy.s > sgemmotcopy_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ + rm sgemmotcopy.s sgemmotcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif + + +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) + +$(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s + m4 sgemmitcopy.s > sgemmitcopy_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ + rm sgemmitcopy.s sgemmitcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif + +endif + +$(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_ncopy.s + m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@ + rm dgemm_ncopy.s dgemm_ncopy_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +endif + +$(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +$(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_itcopy.s + m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@ + rm dgemm_itcopy.s dgemm_itcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +endif + +endif + +ifdef 
EXPRECISION + +$(KDIR)$(QGEMMONCOPYOBJ) : $(KERNELDIR)/$(QGEMMONCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(QGEMMOTCOPYOBJ) : $(KERNELDIR)/$(QGEMMOTCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(QGEMM_UNROLL_M), $(QGEMM_UNROLL_N)) + +$(KDIR)$(QGEMMINCOPYOBJ) : $(KERNELDIR)/$(QGEMMINCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(QGEMMITCOPYOBJ) : $(KERNELDIR)/$(QGEMMITCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +endif + +endif + +$(KDIR)$(CGEMMONCOPYOBJ) : $(KERNELDIR)/$(CGEMMONCOPY) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(CGEMMOTCOPYOBJ) : $(KERNELDIR)/$(CGEMMOTCOPY) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) + +$(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -S $< -o - > cgemm_itcopy.s + m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@ + rm cgemm_itcopy.s cgemm_itcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif + +endif + +$(KDIR)$(ZGEMMONCOPYOBJ) : $(KERNELDIR)/$(ZGEMMONCOPY) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(ZGEMMOTCOPYOBJ) : $(KERNELDIR)/$(ZGEMMOTCOPY) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) + +$(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > zgemm_itcopy.s + m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@ + rm zgemm_itcopy.s zgemm_itcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +endif + +endif + 
+ifdef EXPRECISION + +$(KDIR)$(XGEMMONCOPYOBJ) : $(KERNELDIR)/$(XGEMMONCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(XGEMMOTCOPYOBJ) : $(KERNELDIR)/$(XGEMMOTCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(XGEMM_UNROLL_M), $(XGEMM_UNROLL_N)) + +$(KDIR)$(XGEMMINCOPYOBJ) : $(KERNELDIR)/$(XGEMMINCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(XGEMMITCOPYOBJ) : $(KERNELDIR)/$(XGEMMITCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +endif + +endif + +$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemm_kernel$(TSUFFIX).s + m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@ + rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif + +ifdef USE_DIRECT_SGEMM +$(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif + +ifeq ($(BUILD_BFLOAT16), 1) + +$(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemm_kernel$(TSUFFIX).s + m4 sbgemm_kernel$(TSUFFIX).s > sbgemm_kernel$(TSUFFIX)_nomacros.s + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemm_kernel$(TSUFFIX)_nomacros.s -o $@ + rm sbgemm_kernel$(TSUFFIX).s sbgemm_kernel$(TSUFFIX)_nomacros.s +else + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ +endif +endif + +$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s + m4 
dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@ + rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +endif + +$(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNN $< -o - > cgemm_kernel_n.s + m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@ + rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ +endif + +$(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCN $< -o - > cgemm_kernel_l.s + m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@ + rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ +endif + +$(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s + m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ + rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ +endif + +$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCC $< -o - > cgemm_kernel_b.s + m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@ + rm 
cgemm_kernel_b.s cgemm_kernel_b_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ +endif + +$(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zgemm_kernel_n.s + m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ + rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s +else ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ +endif + +$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCN $< -o - > zgemm_kernel_l.s + m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ + rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s +else ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ +endif + +$(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNC $< -o - > zgemm_kernel_r.s + m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ + rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s +else ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ +endif + +$(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCC $< -o - > zgemm_kernel_b.s + m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC 
zgemm_kernel_b_nomacros.s -o $@ + rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s +else ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ +endif + +$(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)xgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ + + +ifdef USE_TRMM +$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > strmmkernel_ln.s + m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@ + rm strmmkernel_ln.s strmmkernel_ln_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ +endif + +$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > strmmkernel_lt.s + m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@ + rm strmmkernel_lt.s strmmkernel_lt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ +endif + +$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > 
strmmkernel_rn.s + m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@ + rm strmmkernel_rn.s strmmkernel_rn_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ +endif + +$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s + m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ + rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ +endif + +$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > dtrmm_kernel_ln.s + m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@ + rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ +endif + +$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > dtrmm_kernel_lt.s + m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@ + rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ +endif + +$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > dtrmm_kernel_rn.s + m4 dtrmm_kernel_rn.s > 
dtrmm_kernel_rn_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@ + rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ +endif + +$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > dtrmm_kernel_rt.s + m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@ + rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ +endif + +$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_ln.s + m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@ + rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif + +$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) + 
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_lt.s + m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@ + rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif + +$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lr.s + m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@ + rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ +endif + +$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lc.s + m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@ + rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ +endif + +$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rn.s + m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@ + rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif + 
+$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rt.s + m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@ + rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif + +$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_rr.s + m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@ + rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ +endif + +$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_RC.s + m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@ + rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +endif + +$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_ln.s + m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ + rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s +else ifeq 
($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif + +$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_lt.s + m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ + rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s +else ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif + +$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lr.s + m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ + rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s +else ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ +endif + +$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lc.s + m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ + rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s +else ifeq 
($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ +endif + +$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rn.s + m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ + rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s +else ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif + +$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rt.s + m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ + rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s +else ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif + +$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rr.s + m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ + rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s +else 
ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ +endif + +$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rc.s + m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ + rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s +else ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +endif + +else +$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s + m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ + rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ +endif + +$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT 
-UTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif + +$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif +$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ +endif +$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ +endif +$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT 
-UTRANSA -UCONJ -DNN $< -o $@ +endif +$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif +$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ +endif +$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +endif +endif + + + + +$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + 
+$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)cgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)zgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM3MKERNEL) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)xgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM3MKERNEL) + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)strsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LN) $(STRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)strsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LT) $(STRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)strsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RN) $(STRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)strsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(STRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o - > dtrsm_kernel_lt.s + m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ + rm 
dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s +else + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ +endif + +$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RT) $(DTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LN) $(QTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LT) $(QTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RN) $(QTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RT) $(QTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -DCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -DCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) + 
$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -DCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -DCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -DCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -DCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -DCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT 
-DCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -DCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -DCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -DCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@ + + +ifdef STRMMUNCOPY_M +$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else +$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + 
+$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef STRMMLNCOPY_M +$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else +$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif + +ifdef STRMMUTCOPY_M +$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else +$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef STRMMLTCOPY_M +$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M) + $(CC) -c $(CFLAGS) 
$(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else +$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif + +$(KDIR)strmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +ifdef DTRMMUNCOPY_M +$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(DTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else +$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef DTRMMLNCOPY_M +$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else +$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif + +ifdef DTRMMUTCOPY_M +$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else +$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + 
+$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef DTRMMLTCOPY_M +$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else +$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif + +$(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE 
-UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) 
$(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +ifdef CTRMMUNCOPY_M +$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else +$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : 
generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef CTRMMLNCOPY_M +$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else +$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef CTRMMUTCOPY_M +$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else +$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef CTRMMLTCOPY_M +$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER 
-UUNIT $< -o $@ +else +$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +$(KDIR)ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +ifdef ZTRMMUNCOPY_M +$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) 
$(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else +$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef ZTRMMLNCOPY_M +$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else +$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif + +ifdef ZTRMMUTCOPY_M +$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else +$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : 
generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef ZTRMMLTCOPY_M +$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else +$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif + +$(KDIR)ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< 
-o $@ + +$(KDIR)ztrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) 
-DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ssymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)ssymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ + +ifdef SSYMMUCOPY_M +$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ +else +$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) 
$(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ +endif + +ifdef SSYMMLCOPY_M +$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +else +$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +endif + +$(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ + +ifdef DSYMMUCOPY_M +$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ +else +$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ +endif + +ifdef DSYMMLCOPY_M +$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +else +$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +endif + +$(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)qsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)qsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) 
$(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)qsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)csymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)csymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ + +ifdef CSYMMUCOPY_M +$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +else +$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +endif + +ifdef CSYMMLCOPY_M +$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +else +$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +endif + +$(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)zsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ + +ifdef ZSYMMUCOPY_M +$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +else +$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) 
$(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +endif + +ifdef ZSYMMLCOPY_M +$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +else +$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +endif + +$(KDIR)xsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)xsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)xsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)xsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)chemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ + +$(KDIR)chemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ + +ifdef CHEMMUTCOPY_M +$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +else +$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +endif + +ifdef CHEMMLTCOPY_M +$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMLTCOPY_M) + $(CC) -c 
$(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +else +$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +endif + +$(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ + +$(KDIR)zhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ + +ifdef ZHEMMUTCOPY_M +$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +else +$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +endif + +ifdef ZHEMMLTCOPY_M +$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +else +$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +endif + +$(KDIR)xhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ + +$(KDIR)xhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ + +$(KDIR)xhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ + +$(KDIR)xhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) 
-c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ + +$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c + $(CC) 
$(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) -c 
-DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) -c 
-DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : 
generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + 
+$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) 
$(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) 
: generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ 
+ +$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c 
-DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +ifdef TRSMCOPYUN_M +$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else +$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : 
generic/trsm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef TRSMCOPYLN_M +$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else +$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif + +ifdef TRSMCOPYUT_M +$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else +$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef TRSMCOPYLT_M +$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER 
-DUNIT $< -o $@ + +$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else +$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif + +$(KDIR)strsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE 
-UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +ifdef TRSMCOPYUN_M +$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else +$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef TRSMCOPYLN_M +$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else +$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif + +ifdef TRSMCOPYUT_M +$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else +$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c 
$(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef TRSMCOPYLT_M +$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else +$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif + +$(KDIR)dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_outncopy$(TSUFFIX).$(SUFFIX) : 
generic/trsm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + 
+$(KDIR)qtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +ifdef ZTRSMCOPYUN_M +$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else +$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE 
-DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef ZTRSMCOPYLN_M +$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else +$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif + +ifdef ZTRSMCOPYUT_M +$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else +$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef ZTRSMCOPYLT_M +$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else +$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif + +$(KDIR)ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +ifdef 
ZTRSMCOPYUN_M +$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else +$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef ZTRSMCOPYLN_M +$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else +$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif + +ifdef ZTRSMCOPYUT_M +$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else +$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) 
-DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef ZTRSMCOPYLT_M +$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else +$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif + +$(KDIR)ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_outncopy$(TSUFFIX).$(SUFFIX) : 
generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + 
+$(KDIR)xtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + + +$(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +ifeq ($(BUILD_BFLOAT16),1) +$(KDIR)sbgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ +endif + +$(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)qgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMM_BETA) + $(CC) $(PFLAGS) -c -DXDOUBLE 
-UCOMPLEX $< -o $@ + +$(KDIR)cgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMM_BETA) + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@ + +$(KDIR)zgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@ + +$(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ + + +ifeq ($(BUILD_BFLOAT16), 1) +$(SBGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMONCOPY) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(SBGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMOTCOPY) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) +$(SBGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMINCOPY) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(SBGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMITCOPY) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +endif +endif + +$(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(SGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMOTCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) + +$(SGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMINCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +endif + +$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(DGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMOTCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +$(DGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMINCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(DGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMITCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +endif + +ifdef EXPRECISION + +$(QGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMONCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + 
+$(QGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMOTCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(QGEMM_UNROLL_M), $(QGEMM_UNROLL_N)) + +$(QGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMINCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(QGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMITCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +endif + +endif + +$(CGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMONCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(CGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMOTCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) + +$(CGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMINCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(CGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMITCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +endif + +$(ZGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMONCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(ZGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMOTCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) + +$(ZGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMINCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(ZGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMITCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +endif + +ifdef EXPRECISION + +$(XGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMONCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(XGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMOTCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(XGEMM_UNROLL_M), $(XGEMM_UNROLL_N)) + +$(XGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMINCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(XGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMITCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +endif + +endif + + +ifeq ($(BUILD_BFLOAT16), 1) +$(KDIR)sbgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ +endif + 
+$(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)qgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)cgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(PFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s + m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ + rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ +endif + +$(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ + +$(KDIR)zgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)zgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)zgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)zgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ + +$(KDIR)xgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ + 
+$(KDIR)xgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
+	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $@
+
+$(KDIR)xgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
+	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $@
+
+$(KDIR)xgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
+	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@
+
+# Profiled TRMM kernels: LEFT/TRANSA (plus CONJ for complex) select
+# the side/transpose variants from the shared GEMM kernel source.
+$(KDIR)strmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
+
+$(KDIR)strmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
+
+$(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
+
+$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
+ifeq ($(OS), AIX)
+# Fix: m4 must read the file generated on the previous line
+# ("strmm_kernel_rt.s"; the old argument "strmmkernel_rn.s" never
+# exists, so the AIX profiled build of this target failed). Both
+# compiler invocations now use $(PFLAGS), consistent with the other
+# profiled rules (was $(CFLAGS)).
+	$(CC) $(PFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s
+	m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
+	rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
+else
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+endif
+
+$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
+
+$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
+
+$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
+
+$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+
+$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
+
+$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
+
+$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
+
+$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+
+$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
+
+$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
+
+$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
+
+$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
+
+# Profiled double-/extended-precision complex TRMM kernels, the GEMM3M
+# kernel objects, and the first TRSM kernel objects (UPPER plus
+# LN/LT/RN/RT select the triangular-solve variant).
+$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
+
+$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
+
+$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
+
+$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
+
+$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
+
+$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
+
+$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
+
+$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
+
+$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
+
+$(KDIR)cgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL)
+	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
+
+$(KDIR)zgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM3MKERNEL)
+	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
+
+$(KDIR)xgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM3MKERNEL)
+	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@
+
+$(KDIR)strsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LN) $(STRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@
+
+$(KDIR)strsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LT) $(STRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@
+
+$(KDIR)strsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RN) $(STRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@
+
+$(KDIR)strsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(STRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@
+
+$(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@
+
+# Remaining profiled TRSM kernel objects, followed by the TRMM/SYMM
+# packing ("copy") helpers built from the generic templates: i*/o*
+# pick the inner (UNROLL_M) vs outer (UNROLL_N) template width, and
+# LOWER/UNIT select the triangle and unit-diagonal variants.
+$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
+
+$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@
+
+$(KDIR)dtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RT) $(DTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@
+
+$(KDIR)qtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LN) $(QTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@
+
+$(KDIR)qtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LT) $(QTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@
+
+$(KDIR)qtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RN) $(QTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@
+
+$(KDIR)qtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RT) $(QTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@
+
+$(KDIR)ctrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@
+
+$(KDIR)ctrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@
+
+$(KDIR)ctrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -DCONJ $< -o $@
+
+$(KDIR)ctrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -DCONJ $< -o $@
+
+$(KDIR)ctrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@
+
+$(KDIR)ctrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@
+
+$(KDIR)ctrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -DCONJ $< -o $@
+
+$(KDIR)ctrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -DCONJ $< -o $@
+
+$(KDIR)ztrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@
+
+$(KDIR)ztrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
+
+$(KDIR)ztrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -DCONJ $< -o $@
+
+$(KDIR)ztrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -DCONJ $< -o $@
+
+$(KDIR)ztrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@
+
+$(KDIR)ztrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@
+
+$(KDIR)ztrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -DCONJ $< -o $@
+
+$(KDIR)ztrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -DCONJ $< -o $@
+
+$(KDIR)xtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@
+
+$(KDIR)xtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@
+
+$(KDIR)xtrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -DCONJ $< -o $@
+
+$(KDIR)xtrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -DCONJ $< -o $@
+
+$(KDIR)xtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@
+
+$(KDIR)xtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@
+
+$(KDIR)xtrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -DCONJ $< -o $@
+
+$(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND)
+	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@
+
+
+$(KDIR)strmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)strmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)strmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)strmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)strmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)strmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)strmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)strmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)strmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)strmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)strmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)strmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)strmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)strmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)strmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)strmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)dtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)dtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)dtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)dtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)dtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)dtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)dtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)dtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)qtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)qtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)qtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)qtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)qtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)qtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)qtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)qtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)qtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)qtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)qtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)qtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)qtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)qtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)qtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)qtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)ctrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)ctrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)ctrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)ctrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)ctrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)ctrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)ctrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)ctrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)ztrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)ztrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)ztrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)ztrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)ztrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)ztrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)ztrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)ztrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)xtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)xtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)xtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)xtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)xtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)xtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)xtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)xtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)xtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)xtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)xtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)xtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)xtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
+
+$(KDIR)xtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
+
+$(KDIR)xtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
+
+$(KDIR)xtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
+
+$(KDIR)ssymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@
+
+$(KDIR)ssymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@
+
+$(KDIR)ssymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c
+	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
+
+$(KDIR)ssymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)dsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)dsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)dsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)dsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)qsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)qsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)qsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)qsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)csymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)csymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)csymm_iutcopy$(TSUFFIX).$(PSUFFIX) : 
generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)csymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)zsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)zsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)zsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)zsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)xsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)xsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)xsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)xsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)chemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ + +$(KDIR)chemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c + 
$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ + +$(KDIR)chemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ + +$(KDIR)chemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ + +$(KDIR)zhemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ + +$(KDIR)zhemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ + +$(KDIR)zhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ + +$(KDIR)zhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ + +$(KDIR)xhemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ + +$(KDIR)xhemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ + +$(KDIR)xhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ + +$(KDIR)xhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ + +$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) -c -UDOUBLE 
-DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) -c -DDOUBLE 
-DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) -c -DXDOUBLE 
-DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) 
$(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : 
generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY 
$< -o $@ + +$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) 
$(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + 
+$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) 
$(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : 
generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)strsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER 
-DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) 
$(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : 
generic/trsm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + 
+$(KDIR)qtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) 
-DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : 
generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + 
+$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) 
-DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : 
generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + + +##### BLAS extensions ###### + +ifndef DOMATCOPY_CN +DOMATCOPY_CN = ../arm/omatcopy_cn.c +endif + +$(KDIR)domatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_CN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef DOMATCOPY_RN +DOMATCOPY_RN = ../arm/omatcopy_rn.c +endif + +$(KDIR)domatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_RN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ + +ifndef DOMATCOPY_CT +DOMATCOPY_CT = ../arm/omatcopy_ct.c +endif + +$(KDIR)domatcopy_k_ct$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(DOMATCOPY_CT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef DOMATCOPY_RT +DOMATCOPY_RT = ../arm/omatcopy_rt.c +endif + +$(KDIR)domatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_RT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ + +ifndef DIMATCOPY_CN +DIMATCOPY_CN = ../generic/imatcopy_cn.c +endif + +$(KDIR)dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_CN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef DIMATCOPY_RN +DIMATCOPY_RN = ../generic/imatcopy_rn.c +endif + +$(KDIR)dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_RN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ + +ifndef DIMATCOPY_CT +DIMATCOPY_CT = ../generic/imatcopy_ct.c +endif + +$(KDIR)dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_CT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef DIMATCOPY_RT +DIMATCOPY_RT = ../generic/imatcopy_rt.c +endif + +$(KDIR)dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_RT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ + +ifndef SOMATCOPY_CN +SOMATCOPY_CN = ../arm/omatcopy_cn.c +endif + +$(KDIR)somatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_CN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef SOMATCOPY_RN +SOMATCOPY_RN = ../arm/omatcopy_rn.c +endif + +$(KDIR)somatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_RN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ + +ifndef SOMATCOPY_CT +SOMATCOPY_CT = ../arm/omatcopy_ct.c +endif + +$(KDIR)somatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_CT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef SOMATCOPY_RT +SOMATCOPY_RT = ../arm/omatcopy_rt.c +endif + +$(KDIR)somatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_RT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ + +ifndef SIMATCOPY_CN +SIMATCOPY_CN = ../generic/imatcopy_cn.c +endif + 
+$(KDIR)simatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_CN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef SIMATCOPY_RN +SIMATCOPY_RN = ../generic/imatcopy_rn.c +endif + +$(KDIR)simatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_RN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ + +ifndef SIMATCOPY_CT +SIMATCOPY_CT = ../generic/imatcopy_ct.c +endif + +$(KDIR)simatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_CT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef SIMATCOPY_RT +SIMATCOPY_RT = ../generic/imatcopy_rt.c +endif + +$(KDIR)simatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_RT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ + + +ifndef COMATCOPY_CN +COMATCOPY_CN = ../arm/zomatcopy_cn.c +endif + +$(KDIR)comatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef COMATCOPY_RN +COMATCOPY_RN = ../arm/zomatcopy_rn.c +endif + +$(KDIR)comatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ + +ifndef COMATCOPY_CT +COMATCOPY_CT = ../arm/zomatcopy_ct.c +endif + +$(KDIR)comatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef COMATCOPY_RT +COMATCOPY_RT = ../arm/zomatcopy_rt.c +endif + +$(KDIR)comatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ + +ifndef COMATCOPY_CNC +COMATCOPY_CNC = ../arm/zomatcopy_cnc.c +endif + +$(KDIR)comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CNC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef COMATCOPY_RNC +COMATCOPY_RNC = ../arm/zomatcopy_rnc.c +endif + +$(KDIR)comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RNC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + 
+ifndef COMATCOPY_CTC +COMATCOPY_CTC = ../arm/zomatcopy_ctc.c +endif + +$(KDIR)comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CTC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef COMATCOPY_RTC +COMATCOPY_RTC = ../arm/zomatcopy_rtc.c +endif + +$(KDIR)comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RTC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + +ifndef CIMATCOPY_CN +CIMATCOPY_CN = ../generic/zimatcopy_cn.c +endif + +$(KDIR)cimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef CIMATCOPY_RN +CIMATCOPY_RN = ../generic/zimatcopy_rn.c +endif + +$(KDIR)cimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ + +ifndef CIMATCOPY_CT +CIMATCOPY_CT = ../generic/zimatcopy_ct.c +endif + +$(KDIR)cimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef CIMATCOPY_RT +CIMATCOPY_RT = ../generic/zimatcopy_rt.c +endif + +$(KDIR)cimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ + +ifndef CIMATCOPY_CNC +CIMATCOPY_CNC = ../generic/zimatcopy_cnc.c +endif + +$(KDIR)cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CNC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef CIMATCOPY_RNC +CIMATCOPY_RNC = ../generic/zimatcopy_rnc.c +endif + +$(KDIR)cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RNC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + +ifndef CIMATCOPY_CTC +CIMATCOPY_CTC = ../generic/zimatcopy_ctc.c +endif + +$(KDIR)cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CTC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef CIMATCOPY_RTC +CIMATCOPY_RTC = ../generic/zimatcopy_rtc.c +endif + 
+$(KDIR)cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RTC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + + + +ifndef ZOMATCOPY_CN +ZOMATCOPY_CN = ../arm/zomatcopy_cn.c +endif + +$(KDIR)zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef ZOMATCOPY_RN +ZOMATCOPY_RN = ../arm/zomatcopy_rn.c +endif + +$(KDIR)zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ + +ifndef ZOMATCOPY_CT +ZOMATCOPY_CT = ../arm/zomatcopy_ct.c +endif + +$(KDIR)zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef ZOMATCOPY_RT +ZOMATCOPY_RT = ../arm/zomatcopy_rt.c +endif + +$(KDIR)zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ + +ifndef ZOMATCOPY_CNC +ZOMATCOPY_CNC = ../arm/zomatcopy_cnc.c +endif + +$(KDIR)zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CNC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef ZOMATCOPY_RNC +ZOMATCOPY_RNC = ../arm/zomatcopy_rnc.c +endif + +$(KDIR)zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RNC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + +ifndef ZOMATCOPY_CTC +ZOMATCOPY_CTC = ../arm/zomatcopy_ctc.c +endif + +$(KDIR)zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CTC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef ZOMATCOPY_RTC +ZOMATCOPY_RTC = ../arm/zomatcopy_rtc.c +endif + +$(KDIR)zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RTC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + +ifndef ZIMATCOPY_CN +ZIMATCOPY_CN = ../generic/zimatcopy_cn.c +endif + +$(KDIR)zimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CN) + $(CC) $(CFLAGS) -c -DDOUBLE 
-DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef ZIMATCOPY_RN +ZIMATCOPY_RN = ../generic/zimatcopy_rn.c +endif + +$(KDIR)zimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ + +ifndef ZIMATCOPY_CT +ZIMATCOPY_CT = ../generic/zimatcopy_ct.c +endif + +$(KDIR)zimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef ZIMATCOPY_RT +ZIMATCOPY_RT = ../generic/zimatcopy_rt.c +endif + +$(KDIR)zimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ + +ifndef ZIMATCOPY_CNC +ZIMATCOPY_CNC = ../generic/zimatcopy_cnc.c +endif + +$(KDIR)zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CNC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef ZIMATCOPY_RNC +ZIMATCOPY_RNC = ../generic/zimatcopy_rnc.c +endif + +$(KDIR)zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RNC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + +ifndef ZIMATCOPY_CTC +ZIMATCOPY_CTC = ../generic/zimatcopy_ctc.c +endif + +$(KDIR)zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CTC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef ZIMATCOPY_RTC +ZIMATCOPY_RTC = ../generic/zimatcopy_rtc.c +endif + +$(KDIR)zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RTC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + + +ifndef SGEADD_K +SGEADD_K = ../generic/geadd.c +endif + +$(KDIR)sgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEADD_K) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef DGEADD_K +DGEADD_K = ../generic/geadd.c +endif + +$(KDIR)dgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEADD_K) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef CGEADD_K +CGEADD_K = ../generic/zgeadd.c +endif + +$(KDIR)cgeadd_k$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(CGEADD_K) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM $< -o $@ + +ifndef ZGEADD_K +ZGEADD_K = ../generic/zgeadd.c +endif + +$(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@ + + + +###### BLAS small matrix optimization ##### + +ifndef DGEMM_SMALL_M_PERMIT +DGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c +endif + +ifndef DGEMM_SMALL_K_NN +DGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef DGEMM_SMALL_K_NT +DGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef DGEMM_SMALL_K_TN +DGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef DGEMM_SMALL_K_TT +DGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +ifndef DGEMM_SMALL_K_B0_NN +DGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef DGEMM_SMALL_K_B0_NT +DGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef DGEMM_SMALL_K_B0_TN +DGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef DGEMM_SMALL_K_B0_TT +DGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c 
-DDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ + +ifndef SGEMM_SMALL_M_PERMIT +SGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c +endif + +ifndef SGEMM_SMALL_K_NN +SGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef SGEMM_SMALL_K_NT +SGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef SGEMM_SMALL_K_TN +SGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef SGEMM_SMALL_K_TT +SGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +ifndef SGEMM_SMALL_K_B0_NN +SGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef SGEMM_SMALL_K_B0_NT +SGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef SGEMM_SMALL_K_B0_TN +SGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef SGEMM_SMALL_K_B0_TT +SGEMM_SMALL_K_B0_TT = 
../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + + +ifeq ($(BUILD_BFLOAT16), 1) +ifndef SBGEMM_SMALL_M_PERMIT +SBGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c +endif + +ifndef SBGEMM_SMALL_K_NN +SBGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef SBGEMM_SMALL_K_NT +SBGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef SBGEMM_SMALL_K_TN +SBGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef SBGEMM_SMALL_K_TT +SBGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +ifndef SBGEMM_SMALL_K_B0_NN +SBGEMM_SMALL_K_B0_NN = 
../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef SBGEMM_SMALL_K_B0_NT +SBGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef SBGEMM_SMALL_K_B0_TN +SBGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef SBGEMM_SMALL_K_B0_TT +SBGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ +endif + +ifndef CGEMM_SMALL_M_PERMIT +CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c +endif + +ifndef CGEMM_SMALL_K_NN +CGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c +endif + +ifndef CGEMM_SMALL_K_NT +CGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c +endif + +ifndef CGEMM_SMALL_K_TN +CGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c +endif + +ifndef CGEMM_SMALL_K_TT +CGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c +endif + +$(KDIR)cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@ + +$(KDIR)cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $@ + +$(KDIR)cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $@ + +$(KDIR)cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $@ + +$(KDIR)cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $@ + +$(KDIR)cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@ + +$(KDIR)cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $@ + +$(KDIR)cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@ + +$(KDIR)cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $@ + +$(KDIR)cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $@ + +$(KDIR)cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@ + +$(KDIR)cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $@ + +$(KDIR)cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $@ + +$(KDIR)cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ + +ifndef CGEMM_SMALL_K_B0_NN +CGEMM_SMALL_K_B0_NN = 
../generic/zgemm_small_matrix_kernel_nn.c +endif + +ifndef CGEMM_SMALL_K_B0_NT +CGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c +endif + +ifndef CGEMM_SMALL_K_B0_TN +CGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c +endif + +ifndef CGEMM_SMALL_K_B0_TT +CGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c +endif + +$(KDIR)cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c 
-UDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@ + +ifndef ZGEMM_SMALL_M_PERMIT +ZGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c +endif + +ifndef ZGEMM_SMALL_K_NN +ZGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c +endif + +ifndef ZGEMM_SMALL_K_NT +ZGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c +endif + +ifndef ZGEMM_SMALL_K_TN +ZGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c +endif + +ifndef ZGEMM_SMALL_K_TT +ZGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c +endif + +$(KDIR)zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@ + + +$(KDIR)zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $@ + +$(KDIR)zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $@ + +$(KDIR)zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $@ + 
+$(KDIR)zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $@ + +$(KDIR)zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@ + +$(KDIR)zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $@ + +$(KDIR)zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@ + +$(KDIR)zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $@ + +$(KDIR)zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $@ + +$(KDIR)zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@ + +$(KDIR)zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $@ + +$(KDIR)zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $@ + +$(KDIR)zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ + +ifndef ZGEMM_SMALL_K_B0_NN +ZGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c +endif + +ifndef ZGEMM_SMALL_K_B0_NT +ZGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c +endif + +ifndef ZGEMM_SMALL_K_B0_TN +ZGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c 
+endif + +ifndef ZGEMM_SMALL_K_B0_TT +ZGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c +endif + +$(KDIR)zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@ diff --git a/Makefile.alpha b/Makefile.alpha index bd4f4d58be..97e4d757e7 100644 --- a/Makefile.alpha +++ b/Makefile.alpha @@ -1,42 +1,24 @@ -CPP = $(CC) -E -RANLIB = ranlib - -ifeq ($(LIBSUBARCH), EV4) -LIBNAME = $(LIBPREFIX)_ev4.a -LIBNAME_P = $(LIBPREFIX)_ev4_p.a -endif - -ifeq ($(LIBSUBARCH), EV5) -LIBNAME = $(LIBPREFIX)_ev5.a -LIBNAME_P = $(LIBPREFIX)_ev5_p.a -endif - -ifeq ($(LIBSUBARCH), EV6) -LIBNAME = $(LIBPREFIX)_ev6.a -LIBNAME_P = $(LIBPREFIX)_ev6_p.a -endif - ifneq ($(COMPILER), NATIVE) # GCC User -ifeq ($(LIBSUBARCH), EV4) -OPTION += -DEV4 -mcpu=ev4 +ifeq ($(CORE), EV4) +CCOMMON_OPT += -mcpu=ev4 endif -ifeq ($(LIBSUBARCH), EV5) -OPTION += -DEV5 -mcpu=ev5 +ifeq ($(CORE), EV5) +CCOMMON_OPT += -mcpu=ev5 endif -ifeq ($(LIBSUBARCH), EV6) -OPTION += -DEV6 -mcpu=ev6 +ifeq ($(CORE), EV6) +CCOMMON_OPT += -mcpu=ev6 endif else # Compaq Compiler User -ifeq ($(LIBSUBARCH), EV4) -OPTION += -DEV4 -tune ev4 -arch ev4 +ifeq ($(CORE), EV4) +CCOMMON_OPT += -tune ev4 -arch ev4 endif -ifeq ($(LIBSUBARCH), EV5) -OPTION += -DEV5 -tune ev5 -arch ev5 +ifeq ($(CORE), EV5) +CCOMMON_OPT += -tune ev5 -arch ev5 endif -ifeq ($(LIBSUBARCH), EV6) -OPTION += -DEV6 -tune ev6 -arch ev6 +ifeq ($(CORE), EV6) +CCOMMON_OPT += -tune ev6 -arch ev6 endif endif diff --git a/Makefile.arm64 b/Makefile.arm64 index 4efa552860..064e84cbb4 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -89,17 +89,17 @@ endif endif # Use a72 tunings because Neoverse-V1 is 
only available -# in GCC>=9.4 +# in GCC>=10.4 ifeq ($(CORE), NEOVERSEV1) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) -ifeq ($(GCCVERSIONGTEQ9), 1) -ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) -CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 +ifeq ($(GCCVERSIONGTEQ10), 1) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11))) +CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-v1 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 endif else -CCOMMON_OPT += -march=armv8.4-a -mtune=native +CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.4-a -mtune=native endif @@ -119,17 +119,21 @@ endif endif # Use a72 tunings because Neoverse-N2 is only available -# in GCC>=9.4 +# in GCC>=10.4 ifeq ($(CORE), NEOVERSEN2) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) -ifeq ($(GCCVERSIONGTEQ9), 1) -ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) +ifeq ($(GCCVERSIONGTEQ10), 1) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11))) +ifneq ($(OSNAME), Darwin) CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 +else +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +endif ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 endif else -CCOMMON_OPT += -march=armv8.5-a -mtune=native +CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.5-a -mtune=native endif diff --git a/Makefile.install b/Makefile.install index 28727de37a..87b5bc8701 100644 --- a/Makefile.install +++ b/Makefile.install @@ -14,6 +14,11 @@ OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig PKG_EXTRALIB := $(EXTRALIB) +ifeq ($(INTERFACE64),1) + SUFFIX64=64 +endif +PKGFILE="$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE)$(SUFFIX64).pc" + 
ifeq ($(USE_OPENMP), 1) ifeq ($(C_COMPILER), PGI) PKG_EXTRALIB += -lomp @@ -150,13 +155,19 @@ endif endif #Generating openblas.pc - @echo Generating $(LIBSONAMEBASE).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" - @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" +ifeq ($(INTERFACE64),1) + SUFFIX64=64 +endif + PKGFILE="$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE)$(SUFFIX64).pc" + + @echo Generating $(LIBSONAMEBASE)$(SUFFIX64).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" + @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(PKGFILE)" + @echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)" + @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)" + @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)" + @echo 'version='$(VERSION) >> "$(PKGFILE)" + @echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)" + @cat openblas.pc.in >> "$(PKGFILE)" #Generating OpenBLASConfig.cmake diff --git a/Makefile.mips b/Makefile.mips index 05ea9c679d..ecc04585a6 100644 --- a/Makefile.mips +++ 
b/Makefile.mips @@ -1,3 +1,4 @@ +MSA_FLAGS = -mmsa -mfp64 -mload-store-pairs ifdef BINARY64 else endif diff --git a/Makefile.mips64 b/Makefile.mips64 index 05ea9c679d..ecc04585a6 100644 --- a/Makefile.mips64 +++ b/Makefile.mips64 @@ -1,3 +1,4 @@ +MSA_FLAGS = -mmsa -mfp64 -mload-store-pairs ifdef BINARY64 else endif diff --git a/Makefile.prebuild b/Makefile.prebuild index c59e9049c4..0be4f12741 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -60,9 +60,9 @@ all: getarch_2nd ./getarch_2nd 1 >> $(TARGET_CONF) $(TARGET_CONF): c_check$(SCRIPTSUFFIX) f_check$(SCRIPTSUFFIX) getarch - ./c_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) $(CFLAGS) + ./c_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(CC)" $(TARGET_FLAGS) $(CFLAGS) ifneq ($(ONLY_CBLAS), 1) - ./f_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS) + ./f_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(FC)" $(TARGET_FLAGS) else #When we only build CBLAS, we set NOFORTRAN=2 echo "NOFORTRAN=2" >> $(TARGET_MAKE) @@ -77,8 +77,8 @@ endif getarch : getarch.c cpuid.S dummy $(CPUIDEMU) - avx512=$$(./c_check$(SCRIPTSUFFIX) - - $(CC) $(TARGET_FLAGS) $(CFLAGS) | grep NO_AVX512); \ - rv64gv=$$(./c_check$(SCRIPTSUFFIX) - - $(CC) $(TARGET_FLAGS) $(CFLAGS) | grep NO_RV64GV); \ + avx512=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" $(TARGET_FLAGS) $(CFLAGS) | grep NO_AVX512); \ + rv64gv=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" $(TARGET_FLAGS) $(CFLAGS) | grep NO_RV64GV); \ $(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) $${avx512:+-D$${avx512}} $${rv64gv:+-D$${rv64gv}} -o $(@F) getarch.c cpuid.S $(CPUIDEMU) getarch_2nd : getarch_2nd.c $(TARGET_CONF) dummy diff --git a/Makefile.rule b/Makefile.rule index 5f787a9c54..2859d9edcb 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.21 +VERSION = 0.3.22 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library @@ -131,6 +131,9 @@ BUILD_LAPACK_DEPRECATED = 1 # Build RecursiveLAPACK on top of LAPACK # BUILD_RELAPACK = 1 +# Have RecursiveLAPACK actually replace standard LAPACK routines instead of +# just adding its equivalents with a RELAPACK_ prefix +# RELAPACK_REPLACE = 1 # If you want to use the legacy threaded Level 3 implementation. # USE_SIMPLE_THREADED_LEVEL3 = 1 @@ -207,7 +210,7 @@ NO_AFFINITY = 1 # to the user space. If bigphysarea is enabled, it will use it. # DEVICEDRIVER_ALLOCATION = 1 -# If you need to synchronize FP CSR between threads (for x86/x86_64 only). +# If you need to synchronize FP CSR between threads (for x86/x86_64 and aarch64 only). # CONSISTENT_FPCSR = 1 # If any gemm argument m, n or k is less or equal this threshold, gemm will be execute diff --git a/Makefile.system b/Makefile.system index bada954c1f..343b94bb36 100644 --- a/Makefile.system +++ b/Makefile.system @@ -9,6 +9,10 @@ ifndef TOPDIR TOPDIR = . endif +ifndef RELAPACK_REPLACE +RELAPACK_REPLACE=0 +endif + # we need to use the host system's architecture for getarch compile options even especially when cross-compiling HOSTARCH := $(shell uname -m) ifeq ($(HOSTARCH), amd64) @@ -280,8 +284,10 @@ GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS) # Generating Makefile.conf and config.h DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) +endif + ifndef TARGET_CORE -include $(TOPDIR)/Makefile.conf +-include $(TOPDIR)/Makefile.conf else HAVE_NEON= HAVE_VFP= @@ -302,7 +308,6 @@ HAVE_FMA3= include $(TOPDIR)/Makefile_kernel.conf endif -endif ifndef NUM_PARALLEL NUM_PARALLEL = 1 @@ -415,7 +420,7 @@ ifeq ($(OSNAME), AIX) EXTRALIB += -lm endif -ifeq ($(OSNAME), FreeBSD) +ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD 
DragonFly)) ifeq ($(ARCH), $(filter $(ARCH),arm arm64)) EXTRALIB += -lm endif @@ -660,8 +665,10 @@ DYNAMIC_CORE += CORTEXA57 DYNAMIC_CORE += CORTEXA72 DYNAMIC_CORE += CORTEXA73 DYNAMIC_CORE += NEOVERSEN1 +ifneq ($(NO_SVE), 1) DYNAMIC_CORE += NEOVERSEV1 DYNAMIC_CORE += NEOVERSEN2 +endif DYNAMIC_CORE += CORTEXA55 DYNAMIC_CORE += FALKOR DYNAMIC_CORE += THUNDERX @@ -677,7 +684,12 @@ endif endif ifeq ($(ARCH), mips64) -DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 +DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 MIPS64_GENERIC +ifdef DYNAMIC_LIST +override DYNAMIC_CORE = MIPS64_GENERIC $(DYNAMIC_LIST) +XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_MIPS64_GENERIC +XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) +endif endif ifeq ($(ARCH), loongarch64) @@ -818,13 +830,32 @@ endif ifeq ($(ARCH), riscv64) NO_BINARY_MODE = 1 BINARY_DEFINED = 1 +ifdef INTERFACE64 +ifneq ($(INTERFACE64), 0) +ifeq ($(F_COMPILER), GFORTRAN) +FCOMMON_OPT += -fdefault-integer-8 +endif +ifeq ($(F_COMPILER), FLANG) +FCOMMON_OPT += -i8 +endif +endif +endif endif ifeq ($(ARCH), loongarch64) NO_BINARY_MODE = 1 BINARY_DEFINED = 1 +ifdef INTERFACE64 +ifneq ($(INTERFACE64), 0) +ifeq ($(F_COMPILER), GFORTRAN) +FCOMMON_OPT += -fdefault-integer-8 +endif +ifeq ($(F_COMPILER), FLANG) +FCOMMON_OPT += -i8 +endif +endif +endif endif - # # C Compiler dependent settings @@ -856,6 +887,11 @@ CCOMMON_OPT += -mabi=32 BINARY_DEFINED = 1 endif +ifneq (, $(filter $(CORE), MIPS64_GENERIC)) +CCOMMON_OPT += -DNO_MSA +FCOMMON_OPT += -DNO_MSA +endif + ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) CCOMMON_OPT += -march=loongson3a FCOMMON_OPT += -march=loongson3a @@ -932,16 +968,19 @@ endif endif ifdef BINARY64 ifeq ($(ARCH), x86_64) +ifeq (,$(findstring tp,$(CFLAGS))) ifneq ($(NEWPGI2),1) CCOMMON_OPT += -tp p7-64 else CCOMMON_OPT += -tp px endif +endif ifneq ($(NEWPGI),1) CCOMMON_OPT += -D__MMX__ -Mnollvm endif else ifeq ($(ARCH), power) +ifeq (,$(findstring tp,$(CFLAGS))) ifeq ($(CORE), POWER8) CCOMMON_OPT += -tp pwr8 endif @@ 
-950,14 +989,17 @@ CCOMMON_OPT += -tp pwr9 endif endif endif +endif else ifneq ($(NEWPGI2),1) +ifeq (,$(findstring tp,$(CFLAGS))) CCOMMON_OPT += -tp p7 else CCOMMON_OPT += -tp px endif endif endif +endif ifeq ($(C_COMPILER), PATHSCALE) ifdef BINARY64 @@ -1349,6 +1391,10 @@ ifeq ($(NO_AVX512), 1) CCOMMON_OPT += -DNO_AVX512 endif +ifeq ($(NO_SVE), 1) +CCOMMON_OPT += -DNO_SVE +endif + ifdef SMP CCOMMON_OPT += -DSMP_SERVER diff --git a/Makefile.x86_64 b/Makefile.x86_64 index f14a8a8ffe..7ab331b1f0 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -130,6 +130,28 @@ endif endif endif +ifeq ($(CORE), ZEN) +ifdef HAVE_AVX512VL +ifndef NO_AVX512 +CCOMMON_OPT += -march=skylake-avx512 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=skylake-avx512 +endif +ifeq ($(OSNAME), CYGWIN_NT) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT += -fno-asynchronous-unwind-tables +endif +ifeq ($(OSNAME), WINNT) +ifeq ($(C_COMPILER), GCC) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT += -fno-asynchronous-unwind-tables +endif +endif +endif +endif +endif + + ifdef HAVE_AVX2 ifndef NO_AVX2 ifeq ($(C_COMPILER), GCC) @@ -143,6 +165,7 @@ ifeq ($(C_COMPILER), CLANG) CCOMMON_OPT += -mavx2 endif endif +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) ifeq ($(F_COMPILER), GFORTRAN) # AVX2 support was added in 4.7.0 GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4) @@ -159,6 +182,7 @@ endif endif endif endif +endif endif diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000000..6c0731597f --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,20 @@ +# Security Policy + +## Supported Versions + +It is generally recommended to use the latest release as this project +does not maintain multiple stable branches and providing packages e.g. +for Linux distributions is outside our scope. 
In particular, versions +before 0.3.18 can be assumed to carry the out-of-bounds-read error in +the LAPACK ?LARRV family of functions that was the subject of +CVE-2021-4048 + +## Reporting a Vulnerability + +If you suspect that you have found a vulnerability - a defect that could +be abused to compromise the security of a user's code or systems - please +do not use the normal github issue tracker (except perhaps to post a general +warning if you deem that necessary). Instead, please contact the project +maintainers through the email addresses given in their github user profiles. +Defects found in the "lapack-netlib" subtree should ideally be reported to +the maintainers of the reference implementation of LAPACK, lapack@icl.utk.edu diff --git a/TargetList.txt b/TargetList.txt index d17caf4808..deef758195 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -65,6 +65,7 @@ MIPS1004K MIPS24K 4.MIPS64 CPU: +MIPS64_GENERIC SICORTEX LOONGSON3A LOONGSON3B @@ -128,3 +129,7 @@ LOONGSON2K1000 12. Elbrus E2000: E2K +13. 
Alpha +EV4 +EV5 +EV6 diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 1102bf0f5f..16b9da4f59 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -141,7 +141,7 @@ jobs: - job: OSX_OpenMP pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' steps: - script: | brew update @@ -151,15 +151,23 @@ jobs: - job: OSX_GCC_Nothreads pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' steps: - script: | brew update make USE_THREADS=0 CC=gcc-10 FC=gfortran-10 +- job: OSX_GCC12 + pool: + vmImage: 'macOS-latest' + steps: + - script: | + brew update + make CC=gcc-12 FC=gfortran-12 + - job: OSX_OpenMP_Clang pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib LIBRARY_PATH: /usr/local/opt/llvm/lib @@ -172,7 +180,7 @@ jobs: - job: OSX_OpenMP_Clang_cmake pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib LIBRARY_PATH: /usr/local/opt/llvm/lib @@ -188,7 +196,7 @@ jobs: - job: OSX_dynarch_cmake pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib LIBRARY_PATH: /usr/local/opt/llvm/lib @@ -196,13 +204,13 @@ jobs: - script: | mkdir build cd build - cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON .. + cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DDYNAMIC_LIST='NEHALEM HASWELL SKYLAKEX' -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON .. cmake --build . 
ctest - job: OSX_Ifort_Clang pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg @@ -235,7 +243,7 @@ jobs: - job: OSX_NDK_ARMV7 pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' steps: - script: | brew update @@ -255,7 +263,7 @@ jobs: - job: OSX_IOS_ARMV7 pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch armv7 -miphoneos-version-min=5.1 diff --git a/benchmark/Makefile b/benchmark/Makefile index f2f3b354a4..d9ddb90424 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -1,3439 +1,3439 @@ -TOPDIR = .. -include $(TOPDIR)/Makefile.system - -# ACML standard -#ACML=/opt/acml5.3.1/gfortran64_mp/lib -#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm - -# ACML custom -#ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib -#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm - -# ACML 6.1 custom -ACML=/home/saar/acml6.1/gfortran64_mp/lib -LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm - - -# Atlas Ubuntu -#ATLAS=/usr/lib/atlas-base -#LIBATLAS = -fopenmp $(ATLAS)/liblapack_atlas.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm - -# Atlas RHEL and Fedora -ATLAS=/usr/lib64/atlas -LIBATLAS = -fopenmp $(ATLAS)/liblapack.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm - -# Intel standard -# MKL=/opt/intel/mkl/lib/intel64 -# LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm - -# Intel custom -MKL=/home/saar/intel_mkl -LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp 
-lpthread -lm - -# Apple vecLib -LIBVECLIB = -framework Accelerate - -ESSL=/opt/ibm/lib -#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a -LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a - -ifneq ($(NO_LAPACK), 1) -GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ - scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ - sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ - sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ - csymv.goto zsymv.goto \ - sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ - spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto -else -GOTO_LAPACK_TARGETS= -endif - -ifeq ($(BUILD_BFLOAT16),1) -GOTO_HALF_TARGETS=sbgemm.goto -else -GOTO_HALF_TARGETS= -endif - -ifeq ($(OSNAME), WINNT) - -goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ - scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ - sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ - strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ - strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ - sspr.goto dspr.goto \ - sspr2.goto dspr2.goto \ - ssyr.goto dsyr.goto \ - ssyr2.goto dsyr2.goto \ - ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ - ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ - sger.goto dger.goto cger.goto zger.goto \ - sdot.goto ddot.goto \ - srot.goto drot.goto csrot.goto zdrot.goto \ - srotm.goto drotm.goto \ - saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ - scopy.goto dcopy.goto ccopy.goto zcopy.goto \ - sswap.goto dswap.goto cswap.goto zswap.goto \ - sscal.goto dscal.goto cscal.goto zscal.goto \ - sasum.goto dasum.goto casum.goto zasum.goto \ - ssymv.goto dsymv.goto csymv.goto zsymv.goto \ - chemv.goto zhemv.goto \ - chbmv.goto zhbmv.goto \ - chpmv.goto zhpmv.goto \ - chemm.goto zhemm.goto \ - cherk.goto zherk.goto \ - cher2k.goto zher2k.goto \ - cher.goto zher.goto \ - 
cher2.goto zher2.goto \ - sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ - sspmv.goto dspmv.goto \ - strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ - stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ - stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \ - strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ - sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ - sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ - sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ - spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ - ssymm.goto dsymm.goto csymm.goto zsymm.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS) - -acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ - scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ - sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ - strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ - strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ - sspr.acml dspr.acml \ - sspr2.acml dspr2.acml \ - ssyr.acml dsyr.acml \ - ssyr2.acml dsyr2.acml \ - ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ - ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ - sger.acml dger.acml cger.acml zger.acml \ - sdot.acml ddot.acml \ - srot.acml drot.acml csrot.acml zdrot.acml \ - srotm.acml drotm.acml \ - saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ - scopy.acml dcopy.acml ccopy.acml zcopy.acml \ - sswap.acml dswap.acml cswap.acml zswap.acml \ - sscal.acml dscal.acml cscal.acml zscal.acml \ - sasum.acml dasum.acml casum.acml zasum.acml \ - ssymv.acml dsymv.acml csymv.acml zsymv.acml \ - chemv.acml zhemv.acml \ - chbmv.acml zhbmv.acml \ - chpmv.acml zhpmv.acml \ - chemm.acml zhemm.acml \ - cherk.acml zherk.acml \ - cher2k.acml zher2k.acml \ - cher.acml zher.acml \ - cher2.acml zher2.acml \ - sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ - strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ - stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ - stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \ - strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ - sgeev.acml dgeev.acml cgeev.acml 
zgeev.acml \ - sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ - sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ - spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ - ssymm.acml dsymm.acml csymm.acml zsymm.acml \ - saxpby.acml daxpby.acml caxpby.acml zaxpby.acml - -atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ - scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ - sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ - strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ - strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ - sspr.atlas dspr.atlas \ - sspr2.atlas dspr2.atlas \ - ssyr.atlas dsyr.atlas \ - ssyr2.atlas dsyr2.atlas \ - ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ - ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ - sger.atlas dger.atlas cger.atlas zger.atlas\ - sdot.atlas ddot.atlas \ - srot.atlas drot.atlas csrot.atlas zdrot.atlas \ - srotm.atlas drotm.atlas \ - saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ - scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ - sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ - sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ - sasum.atlas dasum.atlas casum.atlas zasum.atlas \ - ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ - chemv.atlas zhemv.atlas \ - chbmv.atlas zhbmv.atlas \ - chpmv.atlas zhpmv.atlas \ - chemm.acml zhemm.acml \ - chemm.atlas zhemm.atlas \ - cherk.atlas zherk.atlas \ - cher2k.atlas zher2k.atlas \ - cher.atlas zher.atlas \ - cher2.atlas zher2.atlas \ - sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ - sspmv.atlas dspmv.atlas \ - strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ - stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ - stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \ - strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ - sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ - sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ - sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ - spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ - 
ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ - saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas - -mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ - scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ - sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ - strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ - strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ - sspr.mkl dspr.mkl \ - sspr2.mkl dspr2.mkl \ - ssyr.mkl dsyr.mkl \ - ssyr2.mkl dsyr2.mkl \ - ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ - ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ - sger.mkl dger.mkl cger.mkl zger.mkl \ - sdot.mkl ddot.mkl \ - srot.mkl drot.mkl csrot.mkl zdrot.mkl \ - srotm.mkl drotm.mkl \ - saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ - scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ - sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ - sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ - sasum.mkl dasum.mkl casum.mkl zasum.mkl \ - ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ - chemv.mkl zhemv.mkl \ - chbmv.mkl zhbmv.mkl \ - chpmv.mkl zhpmv.mkl \ - chemm.mkl zhemm.mkl \ - cherk.mkl zherk.mkl \ - cher2k.mkl zher2k.mkl \ - cher.mkl zher.mkl \ - cher2.mkl zher2.mkl \ - sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ - strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ - stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ - stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \ - strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ - sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ - sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ - sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ - spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ - ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ - saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl - -else - -goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ - strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ - strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ - sspr.goto dspr.goto \ - sspr2.goto dspr2.goto \ - ssyr.goto dsyr.goto \ - ssyr2.goto dsyr2.goto \ - ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ - ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ - sger.goto 
dger.goto cger.goto zger.goto \ - sdot.goto ddot.goto cdot.goto zdot.goto \ - srot.goto drot.goto csrot.goto zdrot.goto \ - srotm.goto drotm.goto \ - saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ - scopy.goto dcopy.goto ccopy.goto zcopy.goto \ - sswap.goto dswap.goto cswap.goto zswap.goto \ - sscal.goto dscal.goto cscal.goto zscal.goto \ - sasum.goto dasum.goto casum.goto zasum.goto \ - ssymv.goto dsymv.goto \ - chemv.goto zhemv.goto \ - chbmv.goto zhbmv.goto \ - chpmv.goto zhpmv.goto \ - chemm.goto zhemm.goto \ - cherk.goto zherk.goto \ - cher2k.goto zher2k.goto \ - cher.goto zher.goto \ - cher2.goto zher2.goto \ - sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ - sspmv.goto dspmv.goto \ - strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ - stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ - stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \ - strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ - ssymm.goto dsymm.goto csymm.goto zsymm.goto \ - smallscaling \ - isamax.goto idamax.goto icamax.goto izamax.goto \ - ismax.goto idmax.goto \ - isamin.goto idamin.goto icamin.goto izamin.goto \ - ismin.goto idmin.goto \ - samax.goto damax.goto camax.goto zamax.goto \ - smax.goto dmax.goto \ - samin.goto damin.goto camin.goto zamin.goto \ - smin.goto dmin.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ - snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) - -acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ - scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ - sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ - strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ - strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ - sspr.acml dspr.acml \ - sspr2.acml dspr2.acml \ - ssyr.acml dsyr.acml \ - ssyr2.acml dsyr2.acml \ - ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ - ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ - sger.acml dger.acml cger.acml zger.acml \ - sdot.acml ddot.acml \ - srot.acml drot.acml csrot.acml zdrot.acml \ - 
srotm.acml drotm.acml \ - saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ - scopy.acml dcopy.acml ccopy.acml zcopy.acml \ - sswap.acml dswap.acml cswap.acml zswap.acml \ - sscal.acml dscal.acml cscal.acml zscal.acml \ - sasum.acml dasum.acml casum.acml zasum.acml \ - ssymv.acml dsymv.acml csymv.acml zsymv.acml \ - chemv.acml zhemv.acml \ - chbmv.acml zhbmv.acml \ - chpmv.acml zhpmv.acml \ - chemm.acml zhemm.acml \ - cherk.acml zherk.acml \ - cher2k.acml zher2k.acml \ - cher.acml zher.acml \ - cher2.acml zher2.acml \ - sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ - strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ - stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ - stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \ - strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ - sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ - sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ - sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ - spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ - ssymm.acml dsymm.acml csymm.acml zsymm.acml \ - saxpby.acml daxpby.acml caxpby.acml zaxpby.acml - -atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ - scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ - sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ - strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ - strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ - sspr.atlas dspr.atlas \ - sspr2.atlas dspr2.atlas \ - ssyr.atlas dsyr.atlas \ - ssyr2.atlas dsyr2.atlas \ - ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ - ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ - sger.atlas dger.atlas cger.atlas zger.atlas\ - sdot.atlas ddot.atlas \ - srot.atlas drot.atlas csrot.atlas zdrot.atlas \ - srotm.atlas drotm.atlas \ - saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ - scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ - sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ - sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ - sasum.atlas dasum.atlas casum.atlas zasum.atlas \ - ssymv.atlas 
dsymv.atlas csymv.atlas zsymv.atlas \ - chemv.atlas zhemv.atlas \ - chbmv.atlas zhbmv.atlas \ - chpmv.atlas zhpmv.atlas \ - chemm.acml zhemm.acml \ - chemm.atlas zhemm.atlas \ - cherk.atlas zherk.atlas \ - cher2k.atlas zher2k.atlas \ - cher.atlas zher.atlas \ - cher2.atlas zher2.atlas \ - sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ - sspmv.atlas dspmv.atlas \ - strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ - stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ - stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \ - strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ - sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ - sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ - sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ - spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ - ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ - isamax.atlas idamax.atlas icamax.atlas izamax.atlas \ - snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas \ - saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas - -mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ - scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ - sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ - strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ - strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ - sspr.mkl dspr.mkl \ - sspr2.mkl dspr2.mkl \ - ssyr.mkl dsyr.mkl \ - ssyr2.mkl dsyr2.mkl \ - ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ - ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ - sger.mkl dger.mkl cger.mkl zger.mkl \ - sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ - srot.atlas drot.atlas csrot.atlas zdrot.atlas \ - srotm.atlas drotm.atlas \ - saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ - scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ - sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ - sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ - sasum.mkl dasum.mkl casum.mkl zasum.mkl \ - ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ - chemv.mkl zhemv.mkl \ - chbmv.mkl zhbmv.mkl \ - chpmv.mkl zhpmv.mkl \ - chemm.mkl zhemm.mkl \ - cherk.mkl zherk.mkl \ - 
cher2k.mkl zher2k.mkl \ - cher.mkl zher.mkl \ - cher2.mkl zher2.mkl \ - sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ - strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ - stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ - stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \ - strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ - sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ - sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ - sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ - spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ - ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ - saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl - - - - -endif - -essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ - cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ - slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \ - scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl \ - strsm.essl dtrsm.essl ctrsm.essl ztrsm.essl - -veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ - scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ - sgemm.veclib dgemm.veclib cgemm.veclib zgemm.veclib \ - strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ - strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ - sspr.veclib dspr.veclib \ - sspr2.veclib dspr2.veclib \ - ssyr.veclib dsyr.veclib \ - ssyr2.veclib dsyr2.veclib \ - ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ - ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ - sger.veclib dger.veclib cger.veclib zger.veclib \ - sdot.veclib ddot.veclib cdot.veclib zdot.veclib \ - srot.veclib drot.veclib csrot.veclib zdrot.veclib \ - srotm.veclib drotm.veclib \ - saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ - scopy.veclib dcopy.veclib ccopy.veclib zcopy.veclib \ - sswap.veclib dswap.veclib cswap.veclib zswap.veclib \ - sscal.veclib dscal.veclib cscal.veclib zscal.veclib \ - sasum.veclib dasum.veclib casum.veclib zasum.veclib \ - ssymv.veclib dsymv.veclib csymv.veclib zsymv.veclib \ - chemv.veclib zhemv.veclib \ - chbmv.veclib zhbmv.veclib \ - 
chpmv.veclib zhpmv.veclib \ - chemm.veclib zhemm.veclib \ - cherk.veclib zherk.veclib \ - cher2k.veclib zher2k.veclib \ - cher.veclib zher.veclib \ - cher2.veclib zher2.veclib \ - sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ - strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ - stpmv.veclib dtpmv.veclib ctpmv.veclib ztpmv.veclib \ - stpsv.veclib dtpsv.veclib ctpsv.veclib ztpsv.veclib \ - strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \ - sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ - sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ - sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ - spotrf.veclib dpotrf.veclib cpotrf.veclib zpotrf.veclib \ - ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib \ - saxpby.veclib daxpby.veclib caxpby.veclib zaxpby.veclib - -goto_3m :: cgemm3m.goto zgemm3m.goto - -mkl_3m :: cgemm3m.mkl zgemm3m.mkl - -all :: goto mkl atlas acml veclib - -exe : - @./Make_exe.sh - -##################################### Slinpack #################################################### -slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -slinpack.acml : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -slinpack.atlas : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -slinpack.mkl : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -slinpack.veclib : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -slinpack.essl : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dlinpack #################################################### -dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - 
# Per-benchmark link rules: ".goto" binaries link the freshly built
# ../$(LIBNAME); the other suffixes link the vendor library named by the
# corresponding LIB* variable.  Recipe lines must begin with a hard TAB; the
# leading "-" on vendor recipes tells make to continue if the link fails
# (e.g. when that vendor library is not installed).

dlinpack.acml : dlinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dlinpack.atlas : dlinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dlinpack.mkl : dlinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dlinpack.veclib : dlinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dlinpack.essl : dlinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Clinpack ####################################################

clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

clinpack.acml : clinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

clinpack.atlas : clinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

clinpack.mkl : clinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

clinpack.veclib : clinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

clinpack.essl : clinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Zlinpack ####################################################

zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

zlinpack.acml : zlinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zlinpack.atlas : zlinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zlinpack.mkl : zlinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zlinpack.veclib : zlinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zlinpack.essl : zlinpack.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Scholesky ###################################################

scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

scholesky.acml : scholesky.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

scholesky.atlas : scholesky.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

scholesky.mkl : scholesky.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

scholesky.veclib : scholesky.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

scholesky.essl : scholesky.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Dcholesky ###################################################

dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

dcholesky.acml : dcholesky.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dcholesky.atlas : dcholesky.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dcholesky.mkl : dcholesky.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dcholesky.veclib : dcholesky.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dcholesky.essl : dcholesky.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Ccholesky ###################################################

ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

ccholesky.acml : ccholesky.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ccholesky.atlas : ccholesky.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ccholesky.mkl : ccholesky.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ccholesky.veclib : ccholesky.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ccholesky.essl : ccholesky.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)


##################################### Zcholesky ###################################################

zcholesky.goto : zcholesky.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

zcholesky.acml : zcholesky.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zcholesky.atlas : zcholesky.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zcholesky.mkl : zcholesky.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zcholesky.veclib : zcholesky.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zcholesky.essl : zcholesky.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Sgemm ####################################################
# bfloat16 GEMM benchmark is only built when the library was configured with
# BUILD_BFLOAT16=1.
ifeq ($(BUILD_BFLOAT16),1)
sbgemm.goto : sbgemm.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
endif

sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
# Vendor-library link rules for the L3/L2 benchmarks.  Pattern is uniform:
# ".goto" links the locally built ../$(LIBNAME); other suffixes link the
# corresponding vendor library, with "-" so a missing vendor lib does not
# abort the whole `make all`.

sgemm.acml : sgemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

sgemm.atlas : sgemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

sgemm.mkl : sgemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

sgemm.veclib : sgemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

sgemm.essl : sgemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Dgemm ####################################################
dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

dgemm.acml : dgemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dgemm.atlas : dgemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dgemm.mkl : dgemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dgemm.veclib : dgemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dgemm.essl : dgemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Cgemm ####################################################

cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

cgemm.acml : cgemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

cgemm.atlas : cgemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

cgemm.mkl : cgemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

cgemm.veclib : cgemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

cgemm.essl : cgemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Zgemm ####################################################

zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

zgemm.acml : zgemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zgemm.atlas : zgemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zgemm.mkl : zgemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zgemm.veclib : zgemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zgemm.essl : zgemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Ssymm ####################################################
ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

ssymm.acml : ssymm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ssymm.atlas : ssymm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ssymm.mkl : ssymm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ssymm.veclib : ssymm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Dsymm ####################################################
dsymm.goto : dsymm.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

dsymm.acml : dsymm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dsymm.atlas : dsymm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dsymm.mkl : dsymm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dsymm.veclib : dsymm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Csymm ####################################################

csymm.goto : csymm.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

csymm.acml : csymm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

csymm.atlas : csymm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

csymm.mkl : csymm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

csymm.veclib : csymm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Zsymm ####################################################

zsymm.goto : zsymm.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

zsymm.acml : zsymm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zsymm.atlas : zsymm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zsymm.mkl : zsymm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zsymm.veclib : zsymm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Strmm ####################################################
strmm.goto : strmm.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

strmm.acml : strmm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

strmm.atlas : strmm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

strmm.mkl : strmm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

strmm.veclib : strmm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

strmm.essl : strmm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Dtrmm ####################################################
dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

dtrmm.acml : dtrmm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dtrmm.atlas : dtrmm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dtrmm.mkl : dtrmm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dtrmm.veclib : dtrmm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dtrmm.essl : dtrmm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Ctrmm ####################################################

ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

ctrmm.acml : ctrmm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ctrmm.atlas : ctrmm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ctrmm.mkl : ctrmm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ctrmm.veclib : ctrmm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ctrmm.essl : ctrmm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Ztrmm ####################################################

ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

ztrmm.acml : ztrmm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ztrmm.atlas : ztrmm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ztrmm.mkl : ztrmm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ztrmm.veclib : ztrmm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ztrmm.essl : ztrmm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Strsm ####################################################
strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

strsm.acml : strsm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

strsm.atlas : strsm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

strsm.mkl : strsm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

strsm.veclib : strsm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

strsm.essl : strsm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Dtrsm ####################################################
dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

dtrsm.acml : dtrsm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dtrsm.atlas : dtrsm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dtrsm.mkl : dtrsm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dtrsm.veclib : dtrsm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dtrsm.essl : dtrsm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Ctrsm ####################################################

ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

ctrsm.acml : ctrsm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ctrsm.atlas : ctrsm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ctrsm.mkl : ctrsm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ctrsm.veclib : ctrsm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ctrsm.essl : ctrsm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Ztrsm ####################################################

ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

ztrsm.acml : ztrsm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ztrsm.atlas : ztrsm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ztrsm.mkl : ztrsm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ztrsm.veclib : ztrsm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ztrsm.essl : ztrsm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Ssyr ####################################################
ssyr.goto : ssyr.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

ssyr.acml : ssyr.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ssyr.atlas : ssyr.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ssyr.mkl : ssyr.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ssyr.veclib : ssyr.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Dsyr ####################################################
dsyr.goto : dsyr.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

dsyr.acml : dsyr.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dsyr.atlas : dsyr.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dsyr.mkl : dsyr.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dsyr.veclib : dsyr.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Sspr ####################################################
sspr.goto : sspr.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

sspr.acml : sspr.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

sspr.atlas : sspr.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

sspr.mkl : sspr.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

sspr.veclib : sspr.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Dspr ####################################################
dspr.goto : dspr.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

dspr.acml : dspr.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dspr.atlas : dspr.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dspr.mkl : dspr.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dspr.veclib : dspr.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Sspr2 ####################################################
sspr2.goto : sspr2.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

sspr2.acml : sspr2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

sspr2.atlas : sspr2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

sspr2.mkl : sspr2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

sspr2.veclib : sspr2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Dspr2 ####################################################
dspr2.goto : dspr2.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

dspr2.acml : dspr2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dspr2.atlas : dspr2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dspr2.mkl : dspr2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

# FIX(review): this rule previously had NO recipe, so `make dspr2.veclib`
# (and the `veclib` group target that lists it) silently produced nothing.
dspr2.veclib : dspr2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Ssyr2 ####################################################
ssyr2.goto : ssyr2.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

ssyr2.acml : ssyr2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ssyr2.atlas : ssyr2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ssyr2.mkl : ssyr2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ssyr2.veclib : ssyr2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Dsyr2 ####################################################
dsyr2.goto : dsyr2.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

dsyr2.acml : dsyr2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dsyr2.atlas : dsyr2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dsyr2.mkl : dsyr2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dsyr2.veclib : dsyr2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Ssyrk ####################################################
ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

ssyrk.acml : ssyrk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ssyrk.atlas : ssyrk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ssyrk.mkl : ssyrk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ssyrk.veclib : ssyrk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Dsyrk ####################################################
dsyrk.goto : dsyrk.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

dsyrk.acml : dsyrk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dsyrk.atlas : dsyrk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dsyrk.mkl : dsyrk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dsyrk.veclib : dsyrk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Csyrk ####################################################

csyrk.goto : csyrk.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

csyrk.acml : csyrk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

csyrk.atlas : csyrk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

csyrk.mkl : csyrk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

csyrk.veclib : csyrk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Zsyrk ####################################################

zsyrk.goto : zsyrk.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

zsyrk.acml : zsyrk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zsyrk.atlas : zsyrk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zsyrk.mkl : zsyrk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zsyrk.veclib : zsyrk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Ssyr2k ####################################################
ssyr2k.goto : ssyr2k.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
# Remaining syr2k / hemm / herk / her2k link rules; same uniform pattern as
# the sections above (".goto" links ../$(LIBNAME), vendor suffixes link the
# matching LIB* library with an error-ignoring "-" recipe prefix).

ssyr2k.acml : ssyr2k.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ssyr2k.atlas : ssyr2k.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ssyr2k.mkl : ssyr2k.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ssyr2k.veclib : ssyr2k.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Dsyr2k ####################################################
dsyr2k.goto : dsyr2k.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

dsyr2k.acml : dsyr2k.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dsyr2k.atlas : dsyr2k.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dsyr2k.mkl : dsyr2k.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dsyr2k.veclib : dsyr2k.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Csyr2k ####################################################

csyr2k.goto : csyr2k.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

csyr2k.acml : csyr2k.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

csyr2k.atlas : csyr2k.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

csyr2k.mkl : csyr2k.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

csyr2k.veclib : csyr2k.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Zsyr2k ####################################################

zsyr2k.goto : zsyr2k.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

zsyr2k.acml : zsyr2k.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zsyr2k.atlas : zsyr2k.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zsyr2k.mkl : zsyr2k.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zsyr2k.veclib : zsyr2k.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Chemm ####################################################

chemm.goto : chemm.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

chemm.acml : chemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

chemm.atlas : chemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

chemm.mkl : chemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

chemm.veclib : chemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Zhemm ####################################################

zhemm.goto : zhemm.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

zhemm.acml : zhemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zhemm.atlas : zhemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zhemm.mkl : zhemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zhemm.veclib : zhemm.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Cherk ####################################################

cherk.goto : cherk.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

cherk.acml : cherk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

cherk.atlas : cherk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

cherk.mkl : cherk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

cherk.veclib : cherk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Zherk ####################################################

zherk.goto : zherk.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

zherk.acml : zherk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zherk.atlas : zherk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zherk.mkl : zherk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zherk.veclib : zherk.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Cher2k ####################################################

cher2k.goto : cher2k.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

cher2k.acml : cher2k.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

cher2k.atlas : cher2k.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

cher2k.mkl : cher2k.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

cher2k.veclib : cher2k.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

##################################### Zher2k ####################################################

zher2k.goto :
zher2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zher2k.acml : zher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2k.atlas : zher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2k.mkl : zher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2k.veclib : zher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cher #################################################### - -cher.goto : cher.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cher.acml : cher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher.atlas : cher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher.mkl : cher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher.veclib : cher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zher #################################################### - -zher.goto : zher.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zher.acml : zher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher.atlas : zher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher.mkl : zher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher.veclib : zher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cher2 #################################################### - -cher2.goto : 
cher2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cher2.acml : cher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2.atlas : cher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2.mkl : cher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2.veclib : cher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zher2 #################################################### - -zher2.goto : zher2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zher2.acml : zher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2.atlas : zher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2.mkl : zher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2.veclib : zher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sgemv #################################################### -sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgemv.acml : sgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemv.atlas : sgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemv.mkl : sgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemv.veclib : sgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgemv #################################################### 
-dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgemv.acml : dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemv.atlas : dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemv.mkl : dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemv.veclib : dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgemv #################################################### - -cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgemv.acml : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemv.atlas : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemv.mkl : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemv.veclib : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgemv #################################################### - -zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgemv.acml : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemv.atlas : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemv.mkl : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemv.veclib : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sspmv 
#################################################### -sspmv.goto : sspmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sspmv.atlas : sspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dspmv #################################################### -dspmv.goto : dspmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dspmv.atlas : dspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Strmv #################################################### -strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -strmv.acml : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmv.atlas : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmv.mkl : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmv.veclib : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtrmv #################################################### -dtrmv.goto : dtrmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtrmv.acml : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmv.atlas : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmv.mkl : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmv.veclib : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctrmv 
#################################################### - -ctrmv.goto : ctrmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctrmv.acml : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmv.atlas : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmv.mkl : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmv.veclib : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztrmv #################################################### - -ztrmv.goto : ztrmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztrmv.acml : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmv.atlas : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmv.mkl : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmv.veclib : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - -##################################### Stpmv #################################################### -stpmv.goto : stpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -stpmv.acml : stpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpmv.atlas : stpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpmv.mkl : stpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpmv.veclib : stpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - 
-##################################### Dtpmv #################################################### -dtpmv.goto : dtpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtpmv.acml : dtpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpmv.atlas : dtpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpmv.mkl : dtpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpmv.veclib : dtpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctpmv #################################################### - -ctpmv.goto : ctpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctpmv.acml : ctpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpmv.atlas : ctpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpmv.mkl : ctpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpmv.veclib : ctpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztpmv #################################################### - -ztpmv.goto : ztpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztpmv.acml : ztpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpmv.atlas : ztpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpmv.mkl : ztpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpmv.veclib : ztpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) - -##################################### Stpsv #################################################### -stpsv.goto : stpsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -stpsv.acml : stpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpsv.atlas : stpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpsv.mkl : stpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpsv.veclib : stpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtpsv #################################################### -dtpsv.goto : dtpsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtpsv.acml : dtpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpsv.atlas : dtpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpsv.mkl : dtpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpsv.veclib : dtpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctpsv #################################################### - -ctpsv.goto : ctpsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctpsv.acml : ctpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpsv.atlas : ctpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpsv.mkl : ctpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpsv.veclib : ctpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) - -##################################### Ztpsv #################################################### - -ztpsv.goto : ztpsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztpsv.acml : ztpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpsv.atlas : ztpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpsv.mkl : ztpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpsv.veclib : ztpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Strsv #################################################### -strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -strsv.acml : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsv.atlas : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsv.mkl : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsv.veclib : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtrsv #################################################### -dtrsv.goto : dtrsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtrsv.acml : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsv.atlas : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsv.mkl : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsv.veclib : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctrsv #################################################### - -ctrsv.goto : ctrsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctrsv.acml : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsv.atlas : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsv.mkl : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsv.veclib : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztrsv #################################################### - -ztrsv.goto : ztrsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztrsv.acml : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsv.atlas : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsv.mkl : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsv.veclib : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sger #################################################### -sger.goto : sger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sger.acml : sger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sger.atlas : sger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sger.mkl : sger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sger.veclib : sger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dger #################################################### -dger.goto : dger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dger.acml : dger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dger.atlas : dger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dger.mkl : dger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dger.veclib : dger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cger #################################################### -cger.goto : cger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cger.acml : cger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cger.atlas : cger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cger.mkl : cger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cger.veclib : cger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zger #################################################### -zger.goto : zger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zger.acml : zger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zger.atlas : zger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zger.mkl : zger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zger.veclib : zger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) - -##################################### Ssymv #################################################### -ssymv.goto : ssymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssymv.acml : ssymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymv.atlas : ssymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymv.mkl : ssymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymv.veclib : ssymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dsymv #################################################### -dsymv.goto : dsymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsymv.acml : dsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymv.atlas : dsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymv.mkl : dsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymv.veclib : dsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Csymv #################################################### -csymv.goto : csymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csymv.acml : csymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymv.atlas : csymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymv.mkl : csymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymv.veclib : csymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) - -##################################### Dsymv #################################################### -zsymv.goto : zsymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zsymv.acml : zsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymv.atlas : zsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymv.mkl : zsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymv.veclib : zsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sgeev #################################################### -sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgeev.acml : sgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgeev.atlas : sgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgeev.mkl : sgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgeev.veclib : sgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgeev #################################################### -dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgeev.acml : dgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgeev.atlas : dgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgeev.mkl : dgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgeev.veclib : dgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgeev #################################################### - -cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgeev.acml : cgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgeev.atlas : cgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgeev.mkl : cgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgeev.veclib : cgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgeev #################################################### - -zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgeev.acml : zgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgeev.atlas : zgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgeev.mkl : zgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgeev.veclib : zgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sgetri #################################################### -sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgetri.acml : sgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgetri.atlas : sgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgetri.mkl : sgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgetri.veclib : sgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o 
$(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgetri #################################################### -dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgetri.acml : dgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgetri.atlas : dgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgetri.mkl : dgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgetri.veclib : dgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgetri #################################################### - -cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgetri.acml : cgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgetri.atlas : cgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgetri.mkl : cgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgetri.veclib : cgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgetri #################################################### - -zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgetri.acml : zgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgetri.atlas : zgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgetri.mkl : zgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgetri.veclib 
: zgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Spotrf #################################################### -spotrf.goto : spotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -spotrf.acml : spotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -spotrf.atlas : spotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -spotrf.mkl : spotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -spotrf.veclib : spotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dpotrf #################################################### -dpotrf.goto : dpotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dpotrf.acml : dpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dpotrf.atlas : dpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dpotrf.mkl : dpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dpotrf.veclib : dpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cpotrf #################################################### - -cpotrf.goto : cpotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cpotrf.acml : cpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cpotrf.atlas : cpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cpotrf.mkl : cpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) - -cpotrf.veclib : cpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zpotrf #################################################### - -zpotrf.goto : zpotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zpotrf.acml : zpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zpotrf.atlas : zpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zpotrf.mkl : zpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zpotrf.veclib : zpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Chemv #################################################### - -chemv.goto : chemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chemv.acml : chemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemv.atlas : chemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemv.mkl : chemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemv.veclib : chemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zhemv #################################################### - -zhemv.goto : zhemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhemv.acml : zhemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemv.atlas : zhemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemv.mkl : zhemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemv.veclib : zhemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Chbmv #################################################### - -chbmv.goto : chbmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chbmv.acml : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chbmv.atlas : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chbmv.mkl : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chbmv.veclib : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Zhbmv #################################################### - -zhbmv.goto : zhbmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhbmv.acml : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhbmv.atlas : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhbmv.mkl : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhbmv.veclib : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Chpmv #################################################### - -chpmv.goto : chpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chpmv.acml : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chpmv.atlas : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chpmv.mkl : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o 
$(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chpmv.veclib : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Zhpmv #################################################### - -zhpmv.goto : zhpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhpmv.acml : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhpmv.atlas : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhpmv.mkl : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhpmv.veclib : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Sdot #################################################### -sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sdot.acml : sdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sdot.atlas : sdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sdot.mkl : sdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sdot.veclib : sdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ddot #################################################### -ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ddot.acml : ddot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ddot.atlas : ddot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ddot.mkl : ddot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ddot.veclib : ddot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cdot #################################################### -cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cdot.acml : cdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cdot.atlas : cdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cdot.mkl : cdot-intel.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cdot.veclib : cdot-intel.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zdot #################################################### -zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zdot.acml : zdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdot.atlas : zdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdot.mkl : zdot-intel.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdot.veclib : zdot-intel.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Srot #################################################### -srot.goto : srot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -srot.acml : srot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srot.atlas : srot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srot.mkl : srot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) - -srot.veclib : srot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Drot #################################################### -drot.goto : drot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -drot.acml : drot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drot.atlas : drot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drot.mkl : drot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drot.veclib : drot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### csrot #################################################### -csrot.goto : csrot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csrot.acml : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csrot.atlas : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csrot.mkl : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csrot.veclib : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### zdrot #################################################### -zdrot.goto : zdrot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zdrot.acml : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdrot.atlas : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdrot.mkl : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) - -zdrot.veclib : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### srotm #################################################### -srotm.goto : srotm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -srotm.acml : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srotm.atlas : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srotm.mkl : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srotm.veclib : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### drotm #################################################### -drotm.goto : drotm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -drotm.acml : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drotm.atlas : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drotm.mkl : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drotm.veclib : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Saxpy #################################################### -saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -saxpy.acml : saxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpy.atlas : saxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpy.mkl : saxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) - -saxpy.veclib : saxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Daxpy #################################################### -daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -daxpy.acml : daxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpy.atlas : daxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpy.mkl : daxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpy.veclib : daxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Caxpy #################################################### - -caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -caxpy.acml : caxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpy.atlas : caxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpy.mkl : caxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpy.veclib : caxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zaxpy #################################################### - -zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zaxpy.acml : zaxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpy.atlas : zaxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpy.mkl : zaxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) - -zaxpy.veclib : zaxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Saxpby #################################################### -saxpby.goto : saxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -saxpby.acml : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpby.atlas : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpby.mkl : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpby.veclib : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Daxpby #################################################### -daxpby.goto : daxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -daxpby.acml : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpby.atlas : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpby.mkl : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpby.veclib : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Caxpby #################################################### - -caxpby.goto : caxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -caxpby.acml : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpby.atlas : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpby.mkl : caxpby.$(SUFFIX) - -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpby.veclib : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zaxpby #################################################### - -zaxpby.goto : zaxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zaxpby.acml : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpby.atlas : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpby.mkl : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpby.veclib : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Scopy #################################################### -scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -scopy.acml : scopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scopy.atlas : scopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scopy.mkl : scopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scopy.veclib : scopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dcopy #################################################### -dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dcopy.acml : dcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcopy.atlas : dcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcopy.mkl : 
dcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcopy.veclib : dcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ccopy #################################################### - -ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ccopy.acml : ccopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccopy.atlas : ccopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccopy.mkl : ccopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccopy.veclib : ccopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zcopy #################################################### - -zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zcopy.acml : zcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcopy.atlas : zcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcopy.mkl : zcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcopy.veclib : zcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sscal #################################################### -sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sscal.acml : sscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sscal.atlas : sscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - 
-sscal.mkl : sscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sscal.veclib : sscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dscal #################################################### -dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dscal.acml : dscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dscal.atlas : dscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dscal.mkl : dscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dscal.veclib : dscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cscal #################################################### - -cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cscal.acml : cscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cscal.atlas : cscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cscal.mkl : cscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cscal.veclib : cscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zscal #################################################### - -zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zscal.acml : zscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zscal.atlas : zscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) - -zscal.mkl : zscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zscal.veclib : zscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sasum #################################################### -sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sasum.acml : sasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sasum.atlas : sasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sasum.mkl : sasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sasum.veclib : sasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dasum #################################################### -dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dasum.acml : dasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dasum.atlas : dasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dasum.mkl : dasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dasum.veclib : dasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Casum #################################################### - -casum.goto : casum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -casum.acml : casum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -casum.atlas : casum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) - -casum.mkl : casum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -casum.veclib : casum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zasum #################################################### - -zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zasum.acml : zasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zasum.atlas : zasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zasum.mkl : zasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zasum.veclib : zasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sswap #################################################### -sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sswap.acml : sswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sswap.atlas : sswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sswap.mkl : sswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sswap.veclib : sswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dswap #################################################### -dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dswap.acml : dswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dswap.atlas : dswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dswap.mkl : dswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dswap.veclib : dswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cswap #################################################### - -cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cswap.acml : cswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cswap.atlas : cswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cswap.mkl : cswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cswap.veclib : cswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zswap #################################################### - -zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zswap.acml : zswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zswap.atlas : zswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zswap.mkl : zswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zswap.veclib : zswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - -##################################### Sgesv #################################################### -sgesv.goto : sgesv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgesv.acml : sgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgesv.atlas : sgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgesv.mkl : sgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgesv.veclib : sgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgesv #################################################### -dgesv.goto : dgesv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgesv.acml : dgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgesv.atlas : dgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgesv.mkl : dgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgesv.veclib : dgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgesv #################################################### - -cgesv.goto : cgesv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgesv.acml : cgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgesv.atlas : cgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgesv.mkl : cgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgesv.veclib : cgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgesv #################################################### - -zgesv.goto : zgesv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgesv.acml : zgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgesv.atlas : zgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o 
$(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgesv.mkl : zgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgesv.veclib : zgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - -##################################### Cgemm3m #################################################### - -cgemm3m.goto : cgemm3m.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgemm3m.mkl : cgemm3m.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemm3m.veclib : cgemm3m.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgemm3m #################################################### - -zgemm3m.goto : zgemm3m.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgemm3m.mkl : zgemm3m.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemm3m.veclib : zgemm3m.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## ISAMAX ############################################## -isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -isamax.atlas : isamax.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## IDAMAX ############################################## -idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -idamax.atlas : idamax.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## ICAMAX ############################################## -icamax.goto : 
icamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -icamax.atlas : icamax.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## IZAMAX ############################################## -izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -izamax.atlas : izamax.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## ISMAX ############################################## -ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDMAX ############################################## -idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISAMIN ############################################## -isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDAMIN ############################################## -idamin.goto : idamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ICAMIN ############################################## -icamin.goto : icamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IZAMIN ############################################## -izamin.goto : izamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISMIN ############################################## -ismin.goto : 
ismin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDMIN ############################################## -idmin.goto : idmin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SAMAX ############################################## -samax.goto : samax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DAMAX ############################################## -damax.goto : damax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## CAMAX ############################################## -camax.goto : camax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ZAMAX ############################################## -zamax.goto : zamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SMAX ############################################## -smax.goto : smax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DMAX ############################################## -dmax.goto : dmax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SAMIN ############################################## -samin.goto : samin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DAMIN ############################################## -damin.goto : damin.$(SUFFIX) ../$(LIBNAME) - 
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## CAMIN ############################################## -camin.goto : camin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ZAMIN ############################################## -zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SMIN ############################################## -smin.goto : smin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DMIN ############################################## -dmin.goto : dmin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SNRM2 ############################################## -snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -snrm2.atlas : snrm2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## DNRM2 ############################################## -dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dnrm2.atlas : dnrm2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## Sscnrm2 ############################################## -scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -scnrm2.atlas : scnrm2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## 
Ddznrm2 ############################################## -dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dznrm2.atlas : dznrm2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - -################################################################################################### - -slinpack.$(SUFFIX) : linpack.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dlinpack.$(SUFFIX) : linpack.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -clinpack.$(SUFFIX) : linpack.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zlinpack.$(SUFFIX) : linpack.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -scholesky.$(SUFFIX) : cholesky.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dcholesky.$(SUFFIX) : cholesky.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ccholesky.$(SUFFIX) : cholesky.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zcholesky.$(SUFFIX) : cholesky.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ifeq ($(BUILD_BFLOAT16),1) -sbgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ -endif - -sgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ssymm.$(SUFFIX) : symm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsymm.$(SUFFIX) : symm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csymm.$(SUFFIX) : symm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zsymm.$(SUFFIX) : symm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -strmm.$(SUFFIX) : trmm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrmm.$(SUFFIX) : trmm.c - $(CC) $(CFLAGS) -c -UCOMPLEX 
-DDOUBLE -o $(@F) $^ - -ctrmm.$(SUFFIX) : trmm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrmm.$(SUFFIX) : trmm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -strsm.$(SUFFIX) : trsm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrsm.$(SUFFIX) : trsm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrsm.$(SUFFIX) : trsm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrsm.$(SUFFIX) : trsm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyr.$(SUFFIX) : syr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyr.$(SUFFIX) : syr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -sspr.$(SUFFIX) : spr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dspr.$(SUFFIX) : spr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -sspr2.$(SUFFIX) : spr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dspr2.$(SUFFIX) : spr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyr2.$(SUFFIX) : syr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyr2.$(SUFFIX) : syr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyrk.$(SUFFIX) : syrk.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyrk.$(SUFFIX) : syrk.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csyrk.$(SUFFIX) : syrk.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zsyrk.$(SUFFIX) : syrk.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyr2k.$(SUFFIX) : syr2k.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyr2k.$(SUFFIX) : syr2k.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csyr2k.$(SUFFIX) : syr2k.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zsyr2k.$(SUFFIX) : syr2k.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -chemm.$(SUFFIX) : hemm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zhemm.$(SUFFIX) : hemm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cherk.$(SUFFIX) : herk.c - $(CC) $(CFLAGS) -c -DCOMPLEX 
-UDOUBLE -o $(@F) $^ - -zherk.$(SUFFIX) : herk.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cher2k.$(SUFFIX) : her2k.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zher2k.$(SUFFIX) : her2k.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cher.$(SUFFIX) : her.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zher.$(SUFFIX) : her.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cher2.$(SUFFIX) : her2.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zher2.$(SUFFIX) : her2.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sspmv.$(SUFFIX) : spmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dspmv.$(SUFFIX) : spmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -strmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -stpmv.$(SUFFIX) : tpmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtpmv.$(SUFFIX) : tpmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctpmv.$(SUFFIX) : tpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztpmv.$(SUFFIX) : tpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -stpsv.$(SUFFIX) : tpsv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtpsv.$(SUFFIX) : tpsv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctpsv.$(SUFFIX) : tpsv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztpsv.$(SUFFIX) : tpsv.c - $(CC) $(CFLAGS) -c -DCOMPLEX 
-DDOUBLE -o $(@F) $^ - -strsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sger.$(SUFFIX) : ger.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dger.$(SUFFIX) : ger.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cger.$(SUFFIX) : ger.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zger.$(SUFFIX) : ger.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -ssymv.$(SUFFIX) : symv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsymv.$(SUFFIX) : symv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csymv.$(SUFFIX) : symv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zsymv.$(SUFFIX) : symv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sgeev.$(SUFFIX) : geev.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgeev.$(SUFFIX) : geev.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgeev.$(SUFFIX) : geev.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgeev.$(SUFFIX) : geev.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sgetri.$(SUFFIX) : getri.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgetri.$(SUFFIX) : getri.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgetri.$(SUFFIX) : getri.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgetri.$(SUFFIX) : getri.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -spotrf.$(SUFFIX) : potrf.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dpotrf.$(SUFFIX) : potrf.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cpotrf.$(SUFFIX) : potrf.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zpotrf.$(SUFFIX) : potrf.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -chemv.$(SUFFIX) : hemv.c - $(CC) $(CFLAGS) -c 
-DCOMPLEX -UDOUBLE -o $(@F) $^ - -zhemv.$(SUFFIX) : hemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -chbmv.$(SUFFIX) : hbmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zhbmv.$(SUFFIX) : hbmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -chpmv.$(SUFFIX) : hpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zhpmv.$(SUFFIX) : hpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sdot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -ddot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cdot.$(SUFFIX) : zdot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zdot.$(SUFFIX) : zdot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cdot-intel.$(SUFFIX) : zdot-intel.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zdot-intel.$(SUFFIX) : zdot-intel.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - - -saxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -daxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -caxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zaxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -saxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -daxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -caxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zaxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -scopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dcopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ccopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zcopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dswap.$(SUFFIX) : 
swap.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - - -sscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -casum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -sgesv.$(SUFFIX) : gesv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgesv.$(SUFFIX) : gesv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgesv.$(SUFFIX) : gesv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgesv.$(SUFFIX) : gesv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -srot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -drot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csrot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zdrot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -srotm.$(SUFFIX) : rotm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -drotm.$(SUFFIX) : rotm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - - - -cgemm3m.$(SUFFIX) : gemm3m.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgemm3m.$(SUFFIX) : gemm3m.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -isamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - 
-icamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -izamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -ismax.$(SUFFIX) : imax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idmax.$(SUFFIX) : imax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - - -isamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -icamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -izamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -ismin.$(SUFFIX) : imin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idmin.$(SUFFIX) : imin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - - -samax.$(SUFFIX) : amax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -damax.$(SUFFIX) : amax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -camax.$(SUFFIX) : amax.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zamax.$(SUFFIX) : amax.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -smax.$(SUFFIX) : max.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dmax.$(SUFFIX) : max.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - - -samin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -damin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -camin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zamin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -smin.$(SUFFIX) : min.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dmin.$(SUFFIX) : min.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - - -snrm2.$(SUFFIX) : nrm2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dnrm2.$(SUFFIX) : nrm2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -scnrm2.$(SUFFIX) : nrm2.c - $(CC) $(CFLAGS) -c -DCOMPLEX 
-UDOUBLE -o $(@F) $^ - -dznrm2.$(SUFFIX) : nrm2.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -smallscaling: smallscaling.c ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread - -clean :: - @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling - -include $(TOPDIR)/Makefile.tail +TOPDIR = .. +include $(TOPDIR)/Makefile.system + +# ACML standard +#ACML=/opt/acml5.3.1/gfortran64_mp/lib +#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm + +# ACML custom +#ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib +#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm + +# ACML 6.1 custom +ACML=/home/saar/acml6.1/gfortran64_mp/lib +LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm + + +# Atlas Ubuntu +#ATLAS=/usr/lib/atlas-base +#LIBATLAS = -fopenmp $(ATLAS)/liblapack_atlas.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm + +# Atlas RHEL and Fedora +ATLAS=/usr/lib64/atlas +LIBATLAS = -fopenmp $(ATLAS)/liblapack.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm + +# Intel standard +# MKL=/opt/intel/mkl/lib/intel64 +# LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm + +# Intel custom +MKL=/home/saar/intel_mkl +LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm + +# Apple vecLib +LIBVECLIB = -framework Accelerate + +ESSL=/opt/ibm/lib +#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a +LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a + +ifneq ($(NO_LAPACK), 1) +GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ + scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ + sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ + sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ + csymv.goto 
zsymv.goto \ + sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ + spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto +else +GOTO_LAPACK_TARGETS= +endif + +ifeq ($(BUILD_BFLOAT16),1) +GOTO_HALF_TARGETS=sbgemm.goto +else +GOTO_HALF_TARGETS= +endif + +ifeq ($(OSNAME), WINNT) + +goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ + scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ + sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ + strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ + strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ + sspr.goto dspr.goto \ + sspr2.goto dspr2.goto \ + ssyr.goto dsyr.goto \ + ssyr2.goto dsyr2.goto \ + ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ + ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ + sger.goto dger.goto cger.goto zger.goto \ + sdot.goto ddot.goto \ + srot.goto drot.goto csrot.goto zdrot.goto \ + srotm.goto drotm.goto \ + saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ + scopy.goto dcopy.goto ccopy.goto zcopy.goto \ + sswap.goto dswap.goto cswap.goto zswap.goto \ + sscal.goto dscal.goto cscal.goto zscal.goto \ + sasum.goto dasum.goto casum.goto zasum.goto \ + ssymv.goto dsymv.goto csymv.goto zsymv.goto \ + chemv.goto zhemv.goto \ + chbmv.goto zhbmv.goto \ + chpmv.goto zhpmv.goto \ + chemm.goto zhemm.goto \ + cherk.goto zherk.goto \ + cher2k.goto zher2k.goto \ + cher.goto zher.goto \ + cher2.goto zher2.goto \ + sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + sspmv.goto dspmv.goto \ + strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ + stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ + stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \ + strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ + sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ + sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ + sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ + spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ + ssymm.goto dsymm.goto csymm.goto zsymm.goto \ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS) + +acml :: slinpack.acml 
dlinpack.acml clinpack.acml zlinpack.acml \ + scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ + sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ + strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ + strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ + sspr.acml dspr.acml \ + sspr2.acml dspr2.acml \ + ssyr.acml dsyr.acml \ + ssyr2.acml dsyr2.acml \ + ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ + ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ + sger.acml dger.acml cger.acml zger.acml \ + sdot.acml ddot.acml \ + srot.acml drot.acml csrot.acml zdrot.acml \ + srotm.acml drotm.acml \ + saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ + scopy.acml dcopy.acml ccopy.acml zcopy.acml \ + sswap.acml dswap.acml cswap.acml zswap.acml \ + sscal.acml dscal.acml cscal.acml zscal.acml \ + sasum.acml dasum.acml casum.acml zasum.acml \ + ssymv.acml dsymv.acml csymv.acml zsymv.acml \ + chemv.acml zhemv.acml \ + chbmv.acml zhbmv.acml \ + chpmv.acml zhpmv.acml \ + chemm.acml zhemm.acml \ + cherk.acml zherk.acml \ + cher2k.acml zher2k.acml \ + cher.acml zher.acml \ + cher2.acml zher2.acml \ + sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ + stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \ + strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ + sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ + sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ + sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ + spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ + ssymm.acml dsymm.acml csymm.acml zsymm.acml \ + saxpby.acml daxpby.acml caxpby.acml zaxpby.acml + +atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ + scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ + sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ + strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ + strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ + sspr.atlas dspr.atlas \ + sspr2.atlas dspr2.atlas \ + ssyr.atlas 
dsyr.atlas \ + ssyr2.atlas dsyr2.atlas \ + ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ + ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ + sger.atlas dger.atlas cger.atlas zger.atlas\ + sdot.atlas ddot.atlas \ + srot.atlas drot.atlas csrot.atlas zdrot.atlas \ + srotm.atlas drotm.atlas \ + saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ + scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ + sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ + sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ + sasum.atlas dasum.atlas casum.atlas zasum.atlas \ + ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ + chemv.atlas zhemv.atlas \ + chbmv.atlas zhbmv.atlas \ + chpmv.atlas zhpmv.atlas \ + chemm.atlas zhemm.atlas \ + cherk.atlas zherk.atlas \ + cher2k.atlas zher2k.atlas \ + cher.atlas zher.atlas \ + cher2.atlas zher2.atlas \ + sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ + sspmv.atlas dspmv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ + stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \ + strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ + sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ + sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ + sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ + spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ + ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ + saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas + +mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ + scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ + sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ + strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ + strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ + sspr.mkl dspr.mkl \ + sspr2.mkl dspr2.mkl \ + ssyr.mkl dsyr.mkl \ + ssyr2.mkl dsyr2.mkl \ + ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ + ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ + sger.mkl dger.mkl cger.mkl zger.mkl \ + sdot.mkl ddot.mkl \ + srot.mkl drot.mkl 
csrot.mkl zdrot.mkl \ + srotm.mkl drotm.mkl \ + saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ + scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ + sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ + sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ + sasum.mkl dasum.mkl casum.mkl zasum.mkl \ + ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ + chemv.mkl zhemv.mkl \ + chbmv.mkl zhbmv.mkl \ + chpmv.mkl zhpmv.mkl \ + chemm.mkl zhemm.mkl \ + cherk.mkl zherk.mkl \ + cher2k.mkl zher2k.mkl \ + cher.mkl zher.mkl \ + cher2.mkl zher2.mkl \ + sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ + strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ + stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \ + strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ + sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ + sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ + sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ + spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ + saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl + +else + +goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ + strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ + strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ + sspr.goto dspr.goto \ + sspr2.goto dspr2.goto \ + ssyr.goto dsyr.goto \ + ssyr2.goto dsyr2.goto \ + ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ + ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ + sger.goto dger.goto cger.goto zger.goto \ + sdot.goto ddot.goto cdot.goto zdot.goto \ + srot.goto drot.goto csrot.goto zdrot.goto \ + srotm.goto drotm.goto \ + saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ + scopy.goto dcopy.goto ccopy.goto zcopy.goto \ + sswap.goto dswap.goto cswap.goto zswap.goto \ + sscal.goto dscal.goto cscal.goto zscal.goto \ + sasum.goto dasum.goto casum.goto zasum.goto \ + ssymv.goto dsymv.goto \ + chemv.goto zhemv.goto \ + chbmv.goto zhbmv.goto \ + chpmv.goto zhpmv.goto \ + chemm.goto zhemm.goto \ + cherk.goto zherk.goto \ + cher2k.goto zher2k.goto \ + cher.goto zher.goto \ + cher2.goto zher2.goto \ + 
sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + sspmv.goto dspmv.goto \ + strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ + stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ + stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \ + strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ + ssymm.goto dsymm.goto csymm.goto zsymm.goto \ + smallscaling \ + isamax.goto idamax.goto icamax.goto izamax.goto \ + ismax.goto idmax.goto \ + isamin.goto idamin.goto icamin.goto izamin.goto \ + ismin.goto idmin.goto \ + samax.goto damax.goto camax.goto zamax.goto \ + smax.goto dmax.goto \ + samin.goto damin.goto camin.goto zamin.goto \ + smin.goto dmin.goto \ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ + snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) + +acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ + scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ + sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ + strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ + strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ + sspr.acml dspr.acml \ + sspr2.acml dspr2.acml \ + ssyr.acml dsyr.acml \ + ssyr2.acml dsyr2.acml \ + ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ + ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ + sger.acml dger.acml cger.acml zger.acml \ + sdot.acml ddot.acml \ + srot.acml drot.acml csrot.acml zdrot.acml \ + srotm.acml drotm.acml \ + saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ + scopy.acml dcopy.acml ccopy.acml zcopy.acml \ + sswap.acml dswap.acml cswap.acml zswap.acml \ + sscal.acml dscal.acml cscal.acml zscal.acml \ + sasum.acml dasum.acml casum.acml zasum.acml \ + ssymv.acml dsymv.acml csymv.acml zsymv.acml \ + chemv.acml zhemv.acml \ + chbmv.acml zhbmv.acml \ + chpmv.acml zhpmv.acml \ + chemm.acml zhemm.acml \ + cherk.acml zherk.acml \ + cher2k.acml zher2k.acml \ + cher.acml zher.acml \ + cher2.acml zher2.acml \ + sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + stpmv.acml 
dtpmv.acml ctpmv.acml ztpmv.acml \ + stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \ + strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ + sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ + sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ + sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ + spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ + ssymm.acml dsymm.acml csymm.acml zsymm.acml \ + saxpby.acml daxpby.acml caxpby.acml zaxpby.acml + +atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ + scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ + sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ + strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ + strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ + sspr.atlas dspr.atlas \ + sspr2.atlas dspr2.atlas \ + ssyr.atlas dsyr.atlas \ + ssyr2.atlas dsyr2.atlas \ + ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ + ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ + sger.atlas dger.atlas cger.atlas zger.atlas\ + sdot.atlas ddot.atlas \ + srot.atlas drot.atlas csrot.atlas zdrot.atlas \ + srotm.atlas drotm.atlas \ + saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ + scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ + sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ + sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ + sasum.atlas dasum.atlas casum.atlas zasum.atlas \ + ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ + chemv.atlas zhemv.atlas \ + chbmv.atlas zhbmv.atlas \ + chpmv.atlas zhpmv.atlas \ + chemm.atlas zhemm.atlas \ + cherk.atlas zherk.atlas \ + cher2k.atlas zher2k.atlas \ + cher.atlas zher.atlas \ + cher2.atlas zher2.atlas \ + sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ + sspmv.atlas dspmv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ + stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \ + strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ + sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ + 
sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ + sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ + spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ + ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ + isamax.atlas idamax.atlas icamax.atlas izamax.atlas \ + snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas \ + saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas + +mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ + scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ + sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ + strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ + strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ + sspr.mkl dspr.mkl \ + sspr2.mkl dspr2.mkl \ + ssyr.mkl dsyr.mkl \ + ssyr2.mkl dsyr2.mkl \ + ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ + ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ + sger.mkl dger.mkl cger.mkl zger.mkl \ + sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ + srot.mkl drot.mkl csrot.mkl zdrot.mkl \ + srotm.mkl drotm.mkl \ + saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ + scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ + sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ + sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ + sasum.mkl dasum.mkl casum.mkl zasum.mkl \ + ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ + chemv.mkl zhemv.mkl \ + chbmv.mkl zhbmv.mkl \ + chpmv.mkl zhpmv.mkl \ + chemm.mkl zhemm.mkl \ + cherk.mkl zherk.mkl \ + cher2k.mkl zher2k.mkl \ + cher.mkl zher.mkl \ + cher2.mkl zher2.mkl \ + sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ + strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ + stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \ + strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ + sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ + sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ + sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ + spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ + saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl + + + + +endif + +essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ + 
cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ + slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \ + scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl \ + strsm.essl dtrsm.essl ctrsm.essl ztrsm.essl + +veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ + scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ + sgemm.veclib dgemm.veclib cgemm.veclib zgemm.veclib \ + strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ + strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ + sspr.veclib dspr.veclib \ + sspr2.veclib dspr2.veclib \ + ssyr.veclib dsyr.veclib \ + ssyr2.veclib dsyr2.veclib \ + ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ + ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ + sger.veclib dger.veclib cger.veclib zger.veclib \ + sdot.veclib ddot.veclib cdot.veclib zdot.veclib \ + srot.veclib drot.veclib csrot.veclib zdrot.veclib \ + srotm.veclib drotm.veclib \ + saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ + scopy.veclib dcopy.veclib ccopy.veclib zcopy.veclib \ + sswap.veclib dswap.veclib cswap.veclib zswap.veclib \ + sscal.veclib dscal.veclib cscal.veclib zscal.veclib \ + sasum.veclib dasum.veclib casum.veclib zasum.veclib \ + ssymv.veclib dsymv.veclib csymv.veclib zsymv.veclib \ + chemv.veclib zhemv.veclib \ + chbmv.veclib zhbmv.veclib \ + chpmv.veclib zhpmv.veclib \ + chemm.veclib zhemm.veclib \ + cherk.veclib zherk.veclib \ + cher2k.veclib zher2k.veclib \ + cher.veclib zher.veclib \ + cher2.veclib zher2.veclib \ + sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ + strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ + stpmv.veclib dtpmv.veclib ctpmv.veclib ztpmv.veclib \ + stpsv.veclib dtpsv.veclib ctpsv.veclib ztpsv.veclib \ + strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \ + sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ + sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ + sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ + spotrf.veclib 
dpotrf.veclib cpotrf.veclib zpotrf.veclib \ + ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib \ + saxpby.veclib daxpby.veclib caxpby.veclib zaxpby.veclib + +goto_3m :: cgemm3m.goto zgemm3m.goto + +mkl_3m :: cgemm3m.mkl zgemm3m.mkl + +all :: goto mkl atlas acml veclib + +exe : + @./Make_exe.sh + +##################################### Slinpack #################################################### +slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +slinpack.acml : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.atlas : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.mkl : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.veclib : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.essl : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dlinpack #################################################### +dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dlinpack.acml : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.atlas : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.mkl : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.veclib : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.essl : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Clinpack 
#################################################### + +clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +clinpack.acml : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.atlas : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.mkl : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.veclib : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.essl : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zlinpack #################################################### + +zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zlinpack.acml : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.atlas : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.mkl : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.veclib : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.essl : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Scholesky ################################################### + +scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +scholesky.acml : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.atlas : scholesky.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.mkl : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.veclib : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.essl : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dcholesky ################################################### + +dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dcholesky.acml : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.atlas : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.mkl : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.veclib : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.essl : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ccholesky ################################################### + +ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ccholesky.acml : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.atlas : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.mkl : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.veclib : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.essl : 
ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +##################################### Zcholesky ################################################### + +zcholesky.goto : zcholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zcholesky.acml : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.atlas : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.mkl : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.veclib : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.essl : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sgemm #################################################### +ifeq ($(BUILD_BFLOAT16),1) +sbgemm.goto : sbgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm +endif + +sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgemm.acml : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemm.atlas : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemm.mkl : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemm.veclib : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemm.essl : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgemm #################################################### +dgemm.goto : dgemm.$(SUFFIX) 
../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgemm.acml : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemm.atlas : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemm.mkl : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemm.veclib : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemm.essl : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgemm #################################################### + +cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgemm.acml : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm.atlas : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm.mkl : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm.veclib : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm.essl : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgemm #################################################### + +zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgemm.acml : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm.atlas : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm.mkl : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
+zgemm.veclib : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm.essl : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ssymm #################################################### +ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssymm.acml : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymm.atlas : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymm.mkl : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymm.veclib : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dsymm #################################################### +dsymm.goto : dsymm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsymm.acml : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymm.atlas : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymm.mkl : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymm.veclib : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Csymm #################################################### + +csymm.goto : csymm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csymm.acml : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymm.atlas : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) + +csymm.mkl : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymm.veclib : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zsymm #################################################### + +zsymm.goto : zsymm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zsymm.acml : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymm.atlas : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymm.mkl : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymm.veclib : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strmm #################################################### +strmm.goto : strmm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strmm.acml : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmm.atlas : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmm.mkl : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmm.veclib : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmm.essl : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrmm #################################################### +dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrmm.acml : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) + +dtrmm.atlas : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmm.mkl : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmm.veclib : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmm.essl : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrmm #################################################### + +ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrmm.acml : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmm.atlas : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmm.mkl : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmm.veclib : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmm.essl : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrmm #################################################### + +ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrmm.acml : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmm.atlas : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmm.mkl : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmm.veclib : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmm.essl : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strsm #################################################### +strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strsm.acml : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsm.atlas : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsm.mkl : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsm.veclib : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsm.essl : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrsm #################################################### +dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrsm.acml : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsm.atlas : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsm.mkl : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsm.veclib : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsm.essl : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrsm #################################################### + +ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrsm.acml : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsm.atlas : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) 
$^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsm.mkl : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsm.veclib : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsm.essl : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrsm #################################################### + +ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrsm.acml : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsm.atlas : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsm.mkl : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsm.veclib : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsm.essl : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Ssyr #################################################### +ssyr.goto : ssyr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr.acml : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.atlas : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.mkl : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.veclib : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Dsyr #################################################### +dsyr.goto : dsyr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr.acml : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.atlas : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.mkl : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.veclib : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspr #################################################### +sspr.goto : sspr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspr.acml : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.atlas : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.mkl : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.veclib : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspr #################################################### +dspr.goto : dspr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspr.acml : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.atlas : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.mkl : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.veclib : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspr2 #################################################### +sspr2.goto : sspr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) -lm + +sspr2.acml : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.atlas : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.mkl : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.veclib : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspr2 #################################################### +dspr2.goto : dspr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspr2.acml : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.atlas : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.mkl : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.veclib : dspr2.$(SUFFIX) + +##################################### Ssyr2 #################################################### +ssyr2.goto : ssyr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr2.acml : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.atlas : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.mkl : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.veclib : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Dsyr2 #################################################### +dsyr2.goto : dsyr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr2.acml : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.atlas : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.mkl : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.veclib : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ssyrk #################################################### +ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyrk.acml : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyrk.atlas : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyrk.mkl : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyrk.veclib : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dsyrk #################################################### +dsyrk.goto : dsyrk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyrk.acml : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyrk.atlas : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyrk.mkl : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyrk.veclib : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Csyrk #################################################### + +csyrk.goto : csyrk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csyrk.acml : csyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o 
$(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyrk.atlas : csyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyrk.mkl : csyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyrk.veclib : csyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zsyrk #################################################### + +zsyrk.goto : zsyrk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zsyrk.acml : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyrk.atlas : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyrk.mkl : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyrk.veclib : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ssyr2k #################################################### +ssyr2k.goto : ssyr2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr2k.acml : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2k.atlas : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2k.mkl : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2k.veclib : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dsyr2k #################################################### +dsyr2k.goto : dsyr2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr2k.acml : dsyr2k.$(SUFFIX) 
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2k.atlas : dsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2k.mkl : dsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2k.veclib : dsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Csyr2k #################################################### + +csyr2k.goto : csyr2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csyr2k.acml : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyr2k.atlas : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyr2k.mkl : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyr2k.veclib : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zsyr2k #################################################### + +zsyr2k.goto : zsyr2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zsyr2k.acml : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyr2k.atlas : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyr2k.mkl : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyr2k.veclib : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Chemm #################################################### + +chemm.goto : chemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) -lm + +chemm.acml : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemm.atlas : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemm.mkl : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemm.veclib : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zhemm #################################################### + +zhemm.goto : zhemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhemm.acml : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemm.atlas : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemm.mkl : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemm.veclib : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cherk #################################################### + +cherk.goto : cherk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cherk.acml : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cherk.atlas : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cherk.mkl : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cherk.veclib : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zherk #################################################### + +zherk.goto : zherk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) -lm + +zherk.acml : zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zherk.atlas : zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zherk.mkl : zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zherk.veclib : zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cher2k #################################################### + +cher2k.goto : cher2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cher2k.acml : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2k.atlas : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2k.mkl : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2k.veclib : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zher2k #################################################### + +zher2k.goto : zher2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zher2k.acml : zher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2k.atlas : zher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2k.mkl : zher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2k.veclib : zher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cher #################################################### + +cher.goto : cher.$(SUFFIX) ../$(LIBNAME) + $(CC) 
$(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cher.acml : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher.atlas : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher.mkl : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher.veclib : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zher #################################################### + +zher.goto : zher.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zher.acml : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher.atlas : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher.mkl : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher.veclib : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cher2 #################################################### + +cher2.goto : cher2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cher2.acml : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2.atlas : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2.mkl : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2.veclib : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zher2 #################################################### + +zher2.goto : zher2.$(SUFFIX) ../$(LIBNAME) + $(CC) 
$(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zher2.acml : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2.atlas : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2.mkl : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2.veclib : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sgemv #################################################### +sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgemv.acml : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemv.atlas : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemv.mkl : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemv.veclib : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgemv #################################################### +dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgemv.acml : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemv.atlas : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemv.mkl : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemv.veclib : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgemv #################################################### + +cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) + 
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgemv.acml : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemv.atlas : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemv.mkl : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemv.veclib : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgemv #################################################### + +zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgemv.acml : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemv.atlas : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemv.mkl : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemv.veclib : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspmv #################################################### +sspmv.goto : sspmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspmv.atlas : sspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspmv #################################################### +dspmv.goto : dspmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspmv.atlas : dspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strmv #################################################### +strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) + 
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strmv.acml : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.atlas : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.mkl : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.veclib : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrmv #################################################### +dtrmv.goto : dtrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrmv.acml : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.atlas : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.mkl : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.veclib : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrmv #################################################### + +ctrmv.goto : ctrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrmv.acml : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.atlas : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.mkl : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.veclib : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrmv #################################################### + +ztrmv.goto : ztrmv.$(SUFFIX) 
../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrmv.acml : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.atlas : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.mkl : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.veclib : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +##################################### Stpmv #################################################### +stpmv.goto : stpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +stpmv.acml : stpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpmv.atlas : stpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpmv.mkl : stpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpmv.veclib : stpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtpmv #################################################### +dtpmv.goto : dtpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtpmv.acml : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpmv.atlas : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpmv.mkl : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpmv.veclib : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctpmv #################################################### + +ctpmv.goto : 
ctpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctpmv.acml : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpmv.atlas : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpmv.mkl : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpmv.veclib : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztpmv #################################################### + +ztpmv.goto : ztpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztpmv.acml : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpmv.atlas : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpmv.mkl : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpmv.veclib : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Stpsv #################################################### +stpsv.goto : stpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +stpsv.acml : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpsv.atlas : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpsv.mkl : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpsv.veclib : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtpsv #################################################### 
+dtpsv.goto : dtpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtpsv.acml : dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpsv.atlas : dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpsv.mkl : dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpsv.veclib : dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctpsv #################################################### + +ctpsv.goto : ctpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctpsv.acml : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpsv.atlas : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpsv.mkl : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpsv.veclib : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztpsv #################################################### + +ztpsv.goto : ztpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztpsv.acml : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpsv.atlas : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpsv.mkl : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpsv.veclib : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strsv 
#################################################### +strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strsv.acml : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.atlas : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.mkl : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.veclib : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrsv #################################################### +dtrsv.goto : dtrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrsv.acml : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.atlas : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.mkl : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.veclib : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrsv #################################################### + +ctrsv.goto : ctrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrsv.acml : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.atlas : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.mkl : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.veclib : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### 
Ztrsv #################################################### + +ztrsv.goto : ztrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrsv.acml : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.atlas : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.mkl : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.veclib : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sger #################################################### +sger.goto : sger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sger.acml : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sger.atlas : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sger.mkl : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sger.veclib : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dger #################################################### +dger.goto : dger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dger.acml : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dger.atlas : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dger.mkl : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dger.veclib : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cger 
#################################################### +cger.goto : cger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cger.acml : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cger.atlas : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cger.mkl : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cger.veclib : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zger #################################################### +zger.goto : zger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zger.acml : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zger.atlas : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zger.mkl : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zger.veclib : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ssymv #################################################### +ssymv.goto : ssymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssymv.acml : ssymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymv.atlas : ssymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymv.mkl : ssymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymv.veclib : ssymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dsymv 
#################################################### +dsymv.goto : dsymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsymv.acml : dsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymv.atlas : dsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymv.mkl : dsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymv.veclib : dsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Csymv #################################################### +csymv.goto : csymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csymv.acml : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymv.atlas : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymv.mkl : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymv.veclib : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zsymv #################################################### +zsymv.goto : zsymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zsymv.acml : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymv.atlas : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymv.mkl : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymv.veclib : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### 
Sgeev #################################################### +sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgeev.acml : sgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgeev.atlas : sgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgeev.mkl : sgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgeev.veclib : sgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgeev #################################################### +dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgeev.acml : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgeev.atlas : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgeev.mkl : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgeev.veclib : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgeev #################################################### + +cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgeev.acml : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgeev.atlas : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgeev.mkl : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgeev.veclib : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
+##################################### Zgeev #################################################### + +zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgeev.acml : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgeev.atlas : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgeev.mkl : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgeev.veclib : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sgetri #################################################### +sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgetri.acml : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgetri.atlas : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgetri.mkl : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgetri.veclib : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgetri #################################################### +dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgetri.acml : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgetri.atlas : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgetri.mkl : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgetri.veclib : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgetri #################################################### + +cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgetri.acml : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgetri.atlas : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgetri.mkl : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgetri.veclib : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgetri #################################################### + +zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgetri.acml : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgetri.atlas : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgetri.mkl : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgetri.veclib : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Spotrf #################################################### +spotrf.goto : spotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +spotrf.acml : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +spotrf.atlas : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +spotrf.mkl : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +spotrf.veclib : spotrf.$(SUFFIX) + 
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dpotrf #################################################### +dpotrf.goto : dpotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dpotrf.acml : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dpotrf.atlas : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dpotrf.mkl : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dpotrf.veclib : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cpotrf #################################################### + +cpotrf.goto : cpotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cpotrf.acml : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cpotrf.atlas : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cpotrf.mkl : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cpotrf.veclib : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zpotrf #################################################### + +zpotrf.goto : zpotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zpotrf.acml : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zpotrf.atlas : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zpotrf.mkl : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) + +zpotrf.veclib : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Chemv #################################################### + +chemv.goto : chemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chemv.acml : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemv.atlas : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemv.mkl : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemv.veclib : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zhemv #################################################### + +zhemv.goto : zhemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhemv.acml : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemv.atlas : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemv.mkl : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemv.veclib : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Chbmv #################################################### + +chbmv.goto : chbmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chbmv.acml : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.atlas : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.mkl : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) + +chbmv.veclib : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Zhbmv #################################################### + +zhbmv.goto : zhbmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhbmv.acml : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.atlas : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.mkl : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.veclib : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Chpmv #################################################### + +chpmv.goto : chpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chpmv.acml : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.atlas : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.mkl : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.veclib : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Zhpmv #################################################### + +zhpmv.goto : zhpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhpmv.acml : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.atlas : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.mkl : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.veclib : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Sdot #################################################### +sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sdot.acml : sdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sdot.atlas : sdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sdot.mkl : sdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sdot.veclib : sdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ddot #################################################### +ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ddot.acml : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ddot.atlas : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ddot.mkl : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ddot.veclib : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cdot #################################################### +cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cdot.acml : cdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cdot.atlas : cdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cdot.mkl : cdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) + +cdot.veclib : cdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zdot #################################################### +zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zdot.acml : zdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdot.atlas : zdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdot.mkl : zdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdot.veclib : zdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Srot #################################################### +srot.goto : srot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +srot.acml : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.atlas : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.mkl : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.veclib : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Drot #################################################### +drot.goto : drot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +drot.acml : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.atlas : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.mkl : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
+drot.veclib : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### csrot #################################################### +csrot.goto : csrot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csrot.acml : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.atlas : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.mkl : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.veclib : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### zdrot #################################################### +zdrot.goto : zdrot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zdrot.acml : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.atlas : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.mkl : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.veclib : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### srotm #################################################### +srotm.goto : srotm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +srotm.acml : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.atlas : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.mkl : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
+srotm.veclib : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### drotm #################################################### +drotm.goto : drotm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +drotm.acml : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.atlas : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.mkl : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.veclib : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Saxpy #################################################### +saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +saxpy.acml : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpy.atlas : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpy.mkl : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpy.veclib : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Daxpy #################################################### +daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +daxpy.acml : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpy.atlas : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpy.mkl : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
+daxpy.veclib : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Caxpy #################################################### + +caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +caxpy.acml : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpy.atlas : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpy.mkl : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpy.veclib : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zaxpy #################################################### + +zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zaxpy.acml : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpy.atlas : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpy.mkl : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpy.veclib : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Saxpby #################################################### +saxpby.goto : saxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +saxpby.acml : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.atlas : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.mkl : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) + +saxpby.veclib : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Daxpby #################################################### +daxpby.goto : daxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +daxpby.acml : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpby.atlas : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpby.mkl : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpby.veclib : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Caxpby #################################################### + +caxpby.goto : caxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +caxpby.acml : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.atlas : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.mkl : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.veclib : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zaxpby #################################################### + +zaxpby.goto : zaxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zaxpby.acml : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.atlas : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.mkl : zaxpby.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.veclib : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Scopy #################################################### +scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +scopy.acml : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scopy.atlas : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scopy.mkl : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scopy.veclib : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dcopy #################################################### +dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dcopy.acml : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcopy.atlas : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcopy.mkl : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcopy.veclib : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ccopy #################################################### + +ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ccopy.acml : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccopy.atlas : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccopy.mkl : ccopy.$(SUFFIX) 
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccopy.veclib : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zcopy #################################################### + +zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zcopy.acml : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcopy.atlas : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcopy.mkl : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcopy.veclib : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sscal #################################################### +sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sscal.acml : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sscal.atlas : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sscal.mkl : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sscal.veclib : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dscal #################################################### +dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dscal.acml : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dscal.atlas : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dscal.mkl : 
dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dscal.veclib : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cscal #################################################### + +cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cscal.acml : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cscal.atlas : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cscal.mkl : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cscal.veclib : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zscal #################################################### + +zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zscal.acml : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zscal.atlas : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zscal.mkl : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zscal.veclib : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sasum #################################################### +sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sasum.acml : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sasum.atlas : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
+sasum.mkl : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sasum.veclib : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dasum #################################################### +dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dasum.acml : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dasum.atlas : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dasum.mkl : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dasum.veclib : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Casum #################################################### + +casum.goto : casum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +casum.acml : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +casum.atlas : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +casum.mkl : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +casum.veclib : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zasum #################################################### + +zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zasum.acml : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zasum.atlas : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) + +zasum.mkl : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zasum.veclib : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sswap #################################################### +sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sswap.acml : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sswap.atlas : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sswap.mkl : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sswap.veclib : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dswap #################################################### +dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dswap.acml : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dswap.atlas : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dswap.mkl : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dswap.veclib : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cswap #################################################### + +cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cswap.acml : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cswap.atlas : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) + +cswap.mkl : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cswap.veclib : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zswap #################################################### + +zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zswap.acml : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zswap.atlas : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zswap.mkl : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zswap.veclib : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +##################################### Sgesv #################################################### +sgesv.goto : sgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgesv.acml : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgesv.atlas : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgesv.mkl : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgesv.veclib : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgesv #################################################### +dgesv.goto : dgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgesv.acml : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgesv.atlas : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgesv.mkl : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgesv.veclib : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgesv #################################################### + +cgesv.goto : cgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgesv.acml : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgesv.atlas : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgesv.mkl : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgesv.veclib : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgesv #################################################### + +zgesv.goto : zgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgesv.acml : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgesv.atlas : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgesv.mkl : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgesv.veclib : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +##################################### Cgemm3m #################################################### + +cgemm3m.goto : cgemm3m.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgemm3m.mkl : cgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm3m.veclib : cgemm3m.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgemm3m #################################################### + +zgemm3m.goto : zgemm3m.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgemm3m.mkl : zgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm3m.veclib : zgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## ISAMAX ############################################## +isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +isamax.atlas : isamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## IDAMAX ############################################## +idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +idamax.atlas : idamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## ICAMAX ############################################## +icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +icamax.atlas : icamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## IZAMAX ############################################## +izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +izamax.atlas : izamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## ISMAX ############################################## +ismax.goto : 
ismax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDMAX ############################################## +idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISAMIN ############################################## +isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDAMIN ############################################## +idamin.goto : idamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ICAMIN ############################################## +icamin.goto : icamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IZAMIN ############################################## +izamin.goto : izamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISMIN ############################################## +ismin.goto : ismin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDMIN ############################################## +idmin.goto : idmin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SAMAX ############################################## +samax.goto : samax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DAMAX ############################################## +damax.goto : damax.$(SUFFIX) 
../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## CAMAX ############################################## +camax.goto : camax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ZAMAX ############################################## +zamax.goto : zamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SMAX ############################################## +smax.goto : smax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DMAX ############################################## +dmax.goto : dmax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SAMIN ############################################## +samin.goto : samin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DAMIN ############################################## +damin.goto : damin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## CAMIN ############################################## +camin.goto : camin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ZAMIN ############################################## +zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SMIN ############################################## +smin.goto : smin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o 
$(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DMIN ############################################## +dmin.goto : dmin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SNRM2 ############################################## +snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +snrm2.atlas : snrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## DNRM2 ############################################## +dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dnrm2.atlas : dnrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## Sscnrm2 ############################################## +scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +scnrm2.atlas : scnrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## Ddznrm2 ############################################## +dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dznrm2.atlas : dznrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +################################################################################################### + +slinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dlinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +clinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zlinpack.$(SUFFIX) : 
linpack.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +scholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dcholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ccholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zcholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ifeq ($(BUILD_BFLOAT16),1) +sbgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ +endif + +sgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ssymm.$(SUFFIX) : symm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsymm.$(SUFFIX) : symm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csymm.$(SUFFIX) : symm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsymm.$(SUFFIX) : symm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +strmm.$(SUFFIX) : trmm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrmm.$(SUFFIX) : trmm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrmm.$(SUFFIX) : trmm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrmm.$(SUFFIX) : trmm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +strsm.$(SUFFIX) : trsm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrsm.$(SUFFIX) : trsm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrsm.$(SUFFIX) : trsm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrsm.$(SUFFIX) : trsm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyr.$(SUFFIX) : syr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr.$(SUFFIX) : syr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sspr.$(SUFFIX) : spr.c 
+ $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspr.$(SUFFIX) : spr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sspr2.$(SUFFIX) : spr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspr2.$(SUFFIX) : spr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyr2.$(SUFFIX) : syr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr2.$(SUFFIX) : syr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyrk.$(SUFFIX) : syrk.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyrk.$(SUFFIX) : syrk.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csyrk.$(SUFFIX) : syrk.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsyrk.$(SUFFIX) : syrk.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyr2k.$(SUFFIX) : syr2k.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr2k.$(SUFFIX) : syr2k.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csyr2k.$(SUFFIX) : syr2k.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsyr2k.$(SUFFIX) : syr2k.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chemm.$(SUFFIX) : hemm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhemm.$(SUFFIX) : hemm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cherk.$(SUFFIX) : herk.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zherk.$(SUFFIX) : herk.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cher2k.$(SUFFIX) : her2k.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zher2k.$(SUFFIX) : her2k.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cher.$(SUFFIX) : her.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zher.$(SUFFIX) : her.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cher2.$(SUFFIX) : her2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zher2.$(SUFFIX) : her2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgemv.$(SUFFIX) : 
gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sspmv.$(SUFFIX) : spmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspmv.$(SUFFIX) : spmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +strmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +stpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +stpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +strsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +ssymv.$(SUFFIX) : 
symv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +spotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dpotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cpotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zpotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chemv.$(SUFFIX) : hemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhemv.$(SUFFIX) : hemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chbmv.$(SUFFIX) : hbmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhbmv.$(SUFFIX) : hbmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chpmv.$(SUFFIX) : hpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhpmv.$(SUFFIX) : hpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sdot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +ddot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cdot.$(SUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + 
+zdot.$(SUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cdot-intel.$(SUFFIX) : zdot-intel.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdot-intel.$(SUFFIX) : zdot-intel.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + + +saxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +daxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +caxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zaxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +saxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +daxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +caxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zaxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +scopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ccopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + + +sscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dasum.$(SUFFIX) : asum.c + $(CC) 
$(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +casum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +sgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +srot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +drot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csrot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdrot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +srotm.$(SUFFIX) : rotm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +drotm.$(SUFFIX) : rotm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + + +cgemm3m.$(SUFFIX) : gemm3m.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemm3m.$(SUFFIX) : gemm3m.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +isamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +icamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +izamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +ismax.$(SUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idmax.$(SUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + +isamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +icamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +izamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + 
+ismin.$(SUFFIX) : imin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idmin.$(SUFFIX) : imin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + +samax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +damax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +camax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zamax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +smax.$(SUFFIX) : max.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dmax.$(SUFFIX) : max.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + +samin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +damin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +camin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zamin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +smin.$(SUFFIX) : min.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dmin.$(SUFFIX) : min.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + +snrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dnrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +scnrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +dznrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +smallscaling: smallscaling.c ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread + +clean :: + @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling + +include $(TOPDIR)/Makefile.tail diff --git a/benchmark/amax.c b/benchmark/amax.c index 29310dd716..446ba4c077 100644 --- a/benchmark/amax.c +++ b/benchmark/amax.c @@ -1,133 +1,133 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "bench.h" - -#undef AMAX - -#ifdef COMPLEX -#ifdef DOUBLE -#define AMAX BLASFUNC(dzamax) -#else -#define AMAX BLASFUNC(scamax) -#endif -#else -#ifdef DOUBLE -#define AMAX BLASFUNC(damax) -#else -#define AMAX BLASFUNC(samax) -#endif -#endif - -int main(int argc, char *argv[]) -{ - - FLOAT *x; - blasint m, i; - blasint inc_x = 1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1, timeg; - - argc--; - argv++; - - if (argc > 0) - { - from = atol(*argv); - argc--; - argv++; - } - if (argc > 0) - { - to = MAX(atol(*argv), from); - argc--; - argv++; - } - if (argc > 0) - { - step = atol(*argv); - argc--; - argv++; - } - - if ((p = getenv("OPENBLAS_LOOPS"))) - loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) - inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); - - if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) - { - fprintf(stderr, "Out of Memory!!\n"); - exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for (m = from; m <= to; m += step) - { - - timeg = 0; - fprintf(stderr, " %6d : ", (int)m); - - for (l = 0; l < loops; l++) - { - - for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) - { - x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; - } - - begin(); - AMAX(&m, x, &inc_x); - end(); - timeg += getsec(); - } - - timeg /= loops; - - fprintf(stderr, - " %10.2f MFlops %10.6f sec\n", - COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); - } - - return 0; -} - -// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "bench.h" + +#undef AMAX + +#ifdef COMPLEX +#ifdef DOUBLE +#define AMAX BLASFUNC(dzamax) +#else +#define AMAX BLASFUNC(scamax) +#endif +#else +#ifdef DOUBLE +#define AMAX BLASFUNC(damax) +#else +#define AMAX BLASFUNC(samax) +#endif +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *x; + blasint m, i; + blasint inc_x = 1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + double time1, timeg; + + argc--; + argv++; + + if (argc > 0) + { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) + { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) + { + step = atol(*argv); + argc--; + argv++; + } + + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); + + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for (m = from; m <= to; m += step) + { + + timeg = 0; + fprintf(stderr, " %6d : ", (int)m); + + for (l = 0; l < loops; l++) + { + + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) + { + x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } + + begin(); + AMAX(&m, x, &inc_x); + end(); + timeg += getsec(); + } + + timeg /= loops; + + fprintf(stderr, + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * sizeof(FLOAT) * 1. 
* (double)m / timeg * 1.e-6, timeg); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/amin.c b/benchmark/amin.c index 54a1d266a3..44f15a7f8e 100644 --- a/benchmark/amin.c +++ b/benchmark/amin.c @@ -1,137 +1,137 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "bench.h" - -#undef AMIN - -#ifdef COMPLEX -#ifdef DOUBLE -#define AMIN BLASFUNC(dzamin) -#else -#define AMIN BLASFUNC(scamin) -#endif -#else -#ifdef DOUBLE -#define AMIN BLASFUNC(damin) -#else -#define AMIN BLASFUNC(samin) -#endif -#endif - -int main(int argc, char *argv[]) -{ - - FLOAT *x; - blasint m, i; - blasint inc_x = 1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1, timeg; - - argc--; - argv++; - - if (argc > 0) - { - from = atol(*argv); - argc--; - argv++; - } - if (argc > 0) - { - to = MAX(atol(*argv), from); - argc--; - argv++; - } - if (argc > 0) - { - step = atol(*argv); - argc--; - argv++; - } - - if ((p = getenv("OPENBLAS_LOOPS"))) - loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) - inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); - - if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) - { - fprintf(stderr, "Out of Memory!!\n"); - exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for (m = from; m <= to; m += step) - { - - timeg = 0; - - fprintf(stderr, " %6d : ", (int)m); - - for (l = 0; l < loops; l++) - { - - for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) - { - x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; - } - - begin(); - - AMIN(&m, x, &inc_x); - - end(); - - timeg += getsec(); - } - - timeg /= loops; - - fprintf(stderr, - " %10.2f MFlops %10.6f sec\n", - COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); - } - - return 0; -} - -// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "bench.h" + +#undef AMIN + +#ifdef COMPLEX +#ifdef DOUBLE +#define AMIN BLASFUNC(dzamin) +#else +#define AMIN BLASFUNC(scamin) +#endif +#else +#ifdef DOUBLE +#define AMIN BLASFUNC(damin) +#else +#define AMIN BLASFUNC(samin) +#endif +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *x; + blasint m, i; + blasint inc_x = 1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + double time1, timeg; + + argc--; + argv++; + + if (argc > 0) + { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) + { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) + { + step = atol(*argv); + argc--; + argv++; + } + + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); + + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for (m = from; m <= to; m += step) + { + + timeg = 0; + + fprintf(stderr, " %6d : ", (int)m); + + for (l = 0; l < loops; l++) + { + + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) + { + x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } + + begin(); + + AMIN(&m, x, &inc_x); + + end(); + + timeg += getsec(); + } + + timeg /= loops; + + fprintf(stderr, + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * sizeof(FLOAT) * 1. 
* (double)m / timeg * 1.e-6, timeg); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/bench.h b/benchmark/bench.h index c03d72bef0..1dae4d0fd2 100644 --- a/benchmark/bench.h +++ b/benchmark/bench.h @@ -74,6 +74,24 @@ static void *huge_malloc(BLASLONG size){ #endif +/* Benchmarks should allocate with cacheline (often 64 bytes) alignment + to avoid unreliable results. This technique, storing the allocated + pointer value just before the aligned memory, doesn't require + C11's aligned_alloc for compatibility with older compilers. */ +static void *aligned_alloc_cacheline(size_t n) +{ + void *p = malloc((size_t)(void *) + n + L1_DATA_LINESIZE - 1); + if (p) { + void **newp = (void **) + (((uintptr_t)p + L1_DATA_LINESIZE) & (uintptr_t)-L1_DATA_LINESIZE); + newp[-1] = p; + p = newp; + } + return p; +} +#define malloc aligned_alloc_cacheline +#define free(p) free((p) ? ((void **)(p))[-1] : (p)) + #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) struct timeval start, stop; #elif defined(__APPLE__) diff --git a/benchmark/hbmv.c b/benchmark/hbmv.c index 35249bdf90..7bf047abd2 100644 --- a/benchmark/hbmv.c +++ b/benchmark/hbmv.c @@ -1,134 +1,134 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. 
Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "bench.h" - -#undef HBMV - -#ifdef DOUBLE -#define HBMV BLASFUNC(zhbmv) -#else -#define HBMV BLASFUNC(chbmv) -#endif - -int main(int argc, char *argv[]){ - - FLOAT *a, *x, *y; - FLOAT alpha[] = {1.0, 1.0}; - FLOAT beta [] = {0.0, 0.0}; - blasint k = 1; - char uplo='L'; - blasint m, i, j; - blasint inc_x=1, inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1,timeg; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; - if ((p = getenv("OPENBLAS_K"))) k = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' k = %d 
Inc_x = %d Inc_y = %d Loops = %d\n", - from, to, step, uplo, k, inc_x, inc_y, loops); - - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) { - - timeg=0; - - fprintf(stderr, " %6dx%d : ", (int)m, (int)m); - - for(j = 0; j < m; j++) { - for(i = 0; i < m * COMPSIZE; i++) { - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - } - - for (l = 0; l < loops; l++) { - - for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { - x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - - for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { - y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - - begin(); - - HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - - end(); - - timeg += getsec(); - - } - - timeg /= loops; - - fprintf(stderr, " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 2. * (double)(2 * k + 1) * (double)m / timeg * 1.e-6); - } - - return 0; -} - -// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "bench.h" + +#undef HBMV + +#ifdef DOUBLE +#define HBMV BLASFUNC(zhbmv) +#else +#define HBMV BLASFUNC(chbmv) +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x, *y; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {0.0, 0.0}; + blasint k = 1; + char uplo='L'; + blasint m, i, j; + blasint inc_x=1, inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_K"))) k = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' k = %d Inc_x = %d Inc_y = %d Loops = %d\n", + from, to, step, uplo, k, inc_x, inc_y, loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) { + + timeg=0; + + fprintf(stderr, " %6dx%d : ", (int)m, (int)m); + + for(j = 0; j < m; j++) { + for(i = 0; i < m * COMPSIZE; i++) { + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (l = 0; l < loops; l++) { + + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT) rand() / (FLOAT) 
RAND_MAX) - 0.5; + } + + for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { + y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + begin(); + + HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); + + end(); + + timeg += getsec(); + + } + + timeg /= loops; + + fprintf(stderr, " %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 2. * (double)(2 * k + 1) * (double)m / timeg * 1.e-6); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/hpmv.c b/benchmark/hpmv.c index 907e2adc4e..0dc296cccd 100644 --- a/benchmark/hpmv.c +++ b/benchmark/hpmv.c @@ -1,133 +1,133 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "bench.h" - -#undef HPMV - -#ifdef DOUBLE -#define HPMV BLASFUNC(zhpmv) -#else -#define HPMV BLASFUNC(chpmv) -#endif - -int main(int argc, char *argv[]){ - - FLOAT *a, *x, *y; - FLOAT alpha[] = {1.0, 1.0}; - FLOAT beta [] = {1.0, 1.0}; - char uplo='L'; - blasint m, i, j; - blasint inc_x=1, inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1,timeg; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; - - fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); - - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - -#ifdef 
__linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) { - - timeg=0; - - fprintf(stderr, " %6dx%d : ", (int)m, (int)m); - - for(j = 0; j < m; j++) { - for(i = 0; i < m * COMPSIZE; i++) { - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - } - - for (l = 0; l < loops; l++) { - - for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { - x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - - for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { - y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - - begin(); - - HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); - - end(); - - time1 = getsec(); - - timeg += time1; - - } - - timeg /= loops; - - fprintf(stderr, " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); - } - - return 0; -} - -// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "bench.h" + +#undef HPMV + +#ifdef DOUBLE +#define HPMV BLASFUNC(zhpmv) +#else +#define HPMV BLASFUNC(chpmv) +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x, *y; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {1.0, 1.0}; + char uplo='L'; + blasint m, i, j; + blasint inc_x=1, inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * 
abs(inc_x) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) { + + timeg=0; + + fprintf(stderr, " %6dx%d : ", (int)m, (int)m); + + for(j = 0; j < m; j++) { + for(i = 0; i < m * COMPSIZE; i++) { + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (l = 0; l < loops; l++) { + + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { + y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + begin(); + + HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); + + end(); + + time1 = getsec(); + + timeg += time1; + + } + + timeg /= loops; + + fprintf(stderr, " %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/iamin.c b/benchmark/iamin.c index a57638ecc3..2384641a53 100644 --- a/benchmark/iamin.c +++ b/benchmark/iamin.c @@ -1,120 +1,120 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. 
Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "bench.h" - -#undef IAMIN - -#ifdef COMPLEX -#ifdef DOUBLE -#define IAMIN BLASFUNC(izamin) -#else -#define IAMIN BLASFUNC(icamin) -#endif -#else -#ifdef DOUBLE -#define IAMIN BLASFUNC(idamin) -#else -#define IAMIN BLASFUNC(isamin) -#endif -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1,timeg; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - 
fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d 
Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = 
getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); - - - for (l=0; l 0) { from = atol(*argv); argc--; 
argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0) { - from = atol(*argv); - argc--; - argv++; - } - if (argc > 0) { - to = MAX(atol(*argv), from); - argc--; - argv++; - } - if (argc > 0) { - step = atol(*argv); - argc--; - argv++; - } - - if ((p = getenv("OPENBLAS_LOOPS"))) - loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) - inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) - inc_y = atoi(p); - - fprintf( - stderr, - "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", - from, to, step, inc_x, inc_y, loops); - - if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == - NULL) { - fprintf(stderr, "Out of Memory!!\n"); - exit(1); - } - - if ((y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == - NULL) { - fprintf(stderr, "Out of Memory!!\n"); - exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for (m = from; m <= to; m += step) { - - timeg = 0; - - fprintf(stderr, " %6d : ", (int)m); - for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { - x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; - } - - for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { - y[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; - } - - for (l = 0; l < loops; l++) { - begin(); - - ROTM(&m, x, &inc_x, y, &inc_y, param); - - end(); - - time1 = getsec(); - - timeg += time1; - } - 
- timeg /= loops; - - fprintf(stderr, " %10.2f MFlops %10.6f sec\n", - COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg); - } - - return 0; -} +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "bench.h" + +#undef ROTM + +#ifdef DOUBLE +#define ROTM BLASFUNC(drotm) +#else +#define ROTM BLASFUNC(srotm) +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *x, *y; + // FLOAT result; + blasint m, i; + blasint inc_x = 1, inc_y = 1; + FLOAT param[5] = {1, 2.0, 3.0, 4.0, 5.0}; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + + double time1, timeg; + + argc--; + argv++; + + if (argc > 0) { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) { + step = atol(*argv); + argc--; + argv++; + } + + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) + inc_y = atoi(p); + + fprintf( + stderr, + "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", + from, to, step, inc_x, inc_y, loops); + + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == + NULL) { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } + + if ((y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == + NULL) { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for (m = from; m <= to; m += step) { + + timeg = 0; + + fprintf(stderr, " %6d : ", (int)m); + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } + + for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { + y[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } + + for (l = 0; l < loops; l++) { + begin(); + + ROTM(&m, x, &inc_x, y, &inc_y, param); + + end(); + + time1 = getsec(); + + timeg += time1; + } + + timeg /= loops; + + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", + COMPSIZE * COMPSIZE * 6. 
* (double)m / timeg * 1.e-6, timeg); + } + + return 0; +} diff --git a/benchmark/scal.c b/benchmark/scal.c index 8de6cfd04b..79bcb6729a 100644 --- a/benchmark/scal.c +++ b/benchmark/scal.c @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int main(int argc, char *argv[]){ - FLOAT *x, *y; + FLOAT *x; FLOAT alpha[2] = { 2.0, 2.0 }; blasint m, i; blasint inc_x=1,inc_y=1; @@ -74,10 +74,6 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - #ifdef __linux srandom(getpid()); #endif @@ -91,30 +87,20 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6d : ", (int)m); + for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + begin(); for (l=0; l 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; - - fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); - - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6dx%d : ", 
(int)m,(int)m); - - for(j = 0; j < m; j++){ - for(i = 0; i < m * COMPSIZE; i++){ - a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - } - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6dx%d : ", (int)m,(int)m); + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + + for (l=0; l> "$tmpf" args="$msa_flags -o $tmpf.o $tmpf" - have_msa=1 { $compiler_name $flags $args >/dev/null 2>&1 } || { - have_msa=0 + no_msa=1 } rm -rf "$tmpd" @@ -240,6 +239,21 @@ if [ "$architecture" = "riscv64" ]; then rm -rf "$tmpd" fi +no_sve=0 +if [ "$architecture" = "arm64" ]; then + tmpd=`mktemp -d` + tmpf="$tmpd/a.c" + printf "#include \n\n int main(void){}\n">> "$tmpf" + args=" -march=armv8-a+sve -c -o $tmpf.o $tmpf" + no_sve=0 + { + $compiler_name $flags $args >/dev/null 2>&1 + } || { + no_sve=1 + } + rm -rf 
"$tmpd" +fi + c11_atomics=0 case "$data" in *HAVE_C11*) @@ -375,10 +389,8 @@ done printf "CROSS_SUFFIX=%s\n" "$cross_suffix" [ "$cross" -ne 0 ] && printf "CROSS=1\n" printf "CEXTRALIB=%s %s %s\n" "$linker_L" "$linker_l" "$linker_a" - [ "$have_msa" -eq 1 ] && { - printf "HAVE_MSA=1\n" - printf "MSA_FLAGS=%s\n" "$msa_flags" - } + [ "$no_msa" -eq 1 ] && printf "NO_MSA=1\n" + [ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n" [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" @@ -396,7 +408,7 @@ compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' ` [ "$binformat" = "bin32" ] && printf "#define __32BIT__\t1\n" [ "$binformat" = "bin64" ] && printf "#define __64BIT__\t1\n" [ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu" - [ "$have_msa" -eq 1 ] && printf "#define HAVE_MSA\t1\n" + [ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n" [ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n" } >> "$config" diff --git a/cmake/arch.cmake b/cmake/arch.cmake index f4a135e82c..8521f3988b 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -44,9 +44,12 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 NEOVERSEV1 NEOVERSEN2 THUNDERX3T110) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) + set(DYNAMIC_CORE "${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2") + endif () if (DYNAMIC_LIST) - set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) + set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) endif () endif () @@ -132,3 +135,8 @@ if (ARM64) set(BINARY_DEFINED 1) endif () +if (${ARCH} STREQUAL "riscv64") + set(NO_BINARY_MODE 1) + set(BINARY_DEFINED 1) +endif () + diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 57e42781d0..83b8d15ab8 
100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -144,6 +144,21 @@ if (${CORE} STREQUAL SAPPHIRERAPIDS) endif () endif () +if (${CORE} STREQUAL ZEN) + if (HAVE_AVX512VL) + if (NOT DYNAMIC_ARCH) + if (NOT NO_AVX512) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 13.0 OR ${GCC_VERSION} VERSION_EQUAL 13.0) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=znver4") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") + endif () + endif () + endif () + endif () +endif () + if (${CORE} STREQUAL A64FX) if (NOT DYNAMIC_ARCH) execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) @@ -155,6 +170,39 @@ if (${CORE} STREQUAL A64FX) endif () endif () +if (${CORE} STREQUAL NEOVERSEN2) + if (NOT DYNAMIC_ARCH) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + endif() + endif () +endif () + +if (${CORE} STREQUAL NEOVERSEV1) + if (NOT DYNAMIC_ARCH) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + endif() + endif () +endif () + +if (${CORE} STREQUAL NEOVERSEN1) + if (NOT DYNAMIC_ARCH) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=neoverse-n1") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + endif() + endif () +endif () + if (${CORE} 
STREQUAL ARMV8SVE) if (NOT DYNAMIC_ARCH) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 773feca6f0..e615e148ef 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -46,7 +46,7 @@ if (${F_COMPILER} STREQUAL "GFORTRAN") set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc if (NOT NO_LAPACK) - set(EXTRALIB "{EXTRALIB} -lgfortran") + set(EXTRALIB "${EXTRALIB} -lgfortran") endif () if (NO_BINARY_MODE) if (MIPS64) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index f8a27f5d41..45dda86864 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -1,1004 +1,1014 @@ -# Sources for compiling lapack-netlib. Can't use CMakeLists.txt because lapack-netlib already has its own cmake files. -if (NOT C_LAPACK) - message (STATUS "fortran lapack") -set(ALLAUX ilaenv.f ilaenv2stage.f ieeeck.f lsamen.f iparmq.f iparam2stage.F - ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f dlaset.f la_xisnan.F90 - ../INSTALL/ilaver.f xerbla_array.f - ../INSTALL/slamch.f) - -set(SCLAUX - scombssq.f sbdsvdx.f sstevx.f sstein.f - la_constants.f90 - sbdsdc.f - sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f - slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f - slaed7.f slaed8.f slaed9.f slaeda.f slaev2.f slagtf.f - slagts.f slamrg.f slanst.f - slapy2.f slapy3.f slarnv.f - slarra.f slarrb.f slarrc.f slarrd.f slarre.f slarrf.f slarrj.f - slarrk.f slarrr.f slaneg.f - slartg.f90 slaruv.f slas2.f slascl.f - slasd0.f slasd1.f slasd2.f slasd3.f slasd4.f slasd5.f slasd6.f - slasd7.f slasd8.f slasda.f slasdq.f slasdt.f - slaset.f slasq1.f slasq2.f slasq3.f slasq4.f slasq5.f slasq6.f - slasr.f slasrt.f slassq.f90 slasv2.f spttrf.f sstebz.f sstedc.f - ssteqr.f ssterf.f slaisnan.f sisnan.f - slartgp.f slartgs.f ../INSTALL/sroundup_lwork.f - ../INSTALL/second_${TIMER}.f) - -set(DZLAUX - la_constants.f90 - dbdsdc.f - dbdsvdx.f dstevx.f dstein.f - dbdsqr.f 
ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f - dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f - dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f - dlagts.f dlamrg.f dlanst.f - dlapy2.f dlapy3.f dlarnv.f - dlarra.f dlarrb.f dlarrc.f dlarrd.f dlarre.f dlarrf.f dlarrj.f - dlarrk.f dlarrr.f dlaneg.f - dlartg.f90 dlaruv.f dlas2.f dlascl.f - dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f - dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f - dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f - dlasr.f dlasrt.f dlassq.f90 dlasv2.f dpttrf.f dstebz.f dstedc.f - dsteqr.f dsterf.f dlaisnan.f disnan.f - dlartgp.f dlartgs.f ../INSTALL/droundup_lwork.f - ../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f) - -set(SLASRC - sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f - sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f - sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f - sgehd2.f sgehrd.f sgelq2.f sgelqf.f - sgels.f sgelsd.f sgelss.f sgelsy.f sgeql2.f sgeqlf.f - sgeqp3.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f - sgesc2.f sgesdd.f sgesvd.f sgesvdx.f sgesvx.f sgetc2.f - sgetrf2.f sgetri.f - sggbak.f sggbal.f - sgges.f sgges3.f sggesx.f sggev.f sggev3.f sggevx.f - sggglm.f sgghrd.f sgghd3.f sgglse.f sggqrf.f - sggrqf.f sggsvd3.f sggsvp3.f sgtcon.f sgtrfs.f sgtsv.f - sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f - shsein.f shseqr.f slabrd.f slacon.f slacn2.f - slaqz0.f slaqz1.f slaqz2.f slaqz3.f slaqz4.f - slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f - slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f - slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f - slansy.f slantb.f slantp.f slantr.f slanv2.f - slapll.f slapmt.f - slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f - slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f - slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f - slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f - 
slarrv.f slartv.f - slarz.f slarzb.f slarzt.f slasy2.f - slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f - slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f - sopgtr.f sopmtr.f sorg2l.f sorg2r.f - sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f - sorgrq.f sorgtr.f sorm2l.f sorm2r.f sorm22.f - sormbr.f sormhr.f sorml2.f sormlq.f sormql.f sormqr.f sormr2.f - sormr3.f sormrq.f sormrz.f sormtr.f spbcon.f spbequ.f spbrfs.f - spbstf.f spbsv.f spbsvx.f - spbtf2.f spbtrf.f spbtrs.f spocon.f spoequ.f sporfs.f sposv.f - sposvx.f spotrf2.f spotri.f spstrf.f spstf2.f - sppcon.f sppequ.f - spprfs.f sppsv.f sppsvx.f spptrf.f spptri.f spptrs.f sptcon.f - spteqr.f sptrfs.f sptsv.f sptsvx.f spttrs.f sptts2.f srscl.f - ssbev.f ssbevd.f ssbevx.f ssbgst.f ssbgv.f ssbgvd.f ssbgvx.f - ssbtrd.f sspcon.f sspev.f sspevd.f sspevx.f sspgst.f - sspgv.f sspgvd.f sspgvx.f ssprfs.f sspsv.f sspsvx.f ssptrd.f - ssptrf.f ssptri.f ssptrs.f sstegr.f sstev.f sstevd.f sstevr.f - ssycon.f ssyev.f ssyevd.f ssyevr.f ssyevx.f ssygs2.f - ssygst.f ssygv.f ssygvd.f ssygvx.f ssyrfs.f ssysv.f ssysvx.f - ssytd2.f ssytf2.f ssytrd.f ssytrf.f ssytri.f ssytri2.f ssytri2x.f - ssyswapr.f ssytrs.f ssytrs2.f - ssyconv.f ssyconvf.f ssyconvf_rook.f - ssysv_aa.f ssysv_aa_2stage.f ssytrf_aa.f ssytrf_aa_2stage.f ssytrs_aa.f ssytrs_aa_2stage.f - ssytf2_rook.f ssytrf_rook.f ssytrs_rook.f - ssytri_rook.f ssycon_rook.f ssysv_rook.f - ssytf2_rk.f ssytrf_rk.f ssytrs_3.f - ssytri_3.f ssytri_3x.f ssycon_3.f ssysv_rk.f - ssysv_aa.f ssytrf_aa.f ssytrs_aa.f - stbcon.f - stbrfs.f stbtrs.f stgevc.f stgex2.f stgexc.f stgsen.f - stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f - stptrs.f - strcon.f strevc.f strevc3.f strexc.f strrfs.f strsen.f strsna.f strsyl.f - strtrs.f stzrzf.f sstemr.f - slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f - stfttr.f stpttf.f stpttr.f strttf.f strttp.f - sgejsv.f sgesvj.f sgsvj0.f sgsvj1.f - sgeequb.f ssyequb.f spoequb.f sgbequb.f - sbbcsd.f slapmr.f 
sorbdb.f sorbdb1.f sorbdb2.f sorbdb3.f sorbdb4.f - sorbdb5.f sorbdb6.f sorcsd.f sorcsd2by1.f - sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f - stpqrt.f stpqrt2.f stpmqrt.f stprfb.f - sgelqt.f sgelqt3.f sgemlqt.f - sgetsls.f sgetsqrhrt.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f - sgelq.f slaswlq.f slamswlq.f sgemlq.f - stplqt.f stplqt2.f stpmlqt.f - ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f - ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f - ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f - sgesvdq.f slaorhr_col_getrfnp.f - slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f ) - -set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f - sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f - sla_syrfsx_extended.f sla_syamv.f sla_syrcond.f sla_syrpvgrw.f - sposvxx.f sporfsx.f sla_porfsx_extended.f sla_porcond.f - sla_porpvgrw.f sgbsvxx.f sgbrfsx.f sla_gbrfsx_extended.f - sla_gbamv.f sla_gbrcond.f sla_gbrpvgrw.f sla_lin_berr.f slarscl2.f - slascl2.f sla_wwaddw.f) - -set(CLASRC - cbdsqr.f cgbbrd.f cgbcon.f cgbequ.f cgbrfs.f cgbsv.f cgbsvx.f - cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f - cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f - cgehd2.f cgehrd.f cgelq2.f cgelqf.f - cgels.f cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f - cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f cgerq2.f cgerqf.f - cgesc2.f cgesdd.f cgesvd.f cgesvdx.f - cgesvj.f cgejsv.f cgsvj0.f cgsvj1.f - cgesvx.f cgetc2.f cgetrf2.f - cgetri.f - cggbak.f cggbal.f - cgges.f cgges3.f cggesx.f cggev.f cggev3.f cggevx.f - cggglm.f cgghrd.f cgghd3.f cgglse.f cggqrf.f cggrqf.f - cggsvd3.f cggsvp3.f - cgtcon.f cgtrfs.f cgtsv.f cgtsvx.f cgttrf.f cgttrs.f cgtts2.f chbev.f - chbevd.f chbevx.f chbgst.f chbgv.f chbgvd.f chbgvx.f chbtrd.f - checon.f cheev.f cheevd.f cheevr.f cheevx.f chegs2.f chegst.f - chegv.f chegvd.f chegvx.f cherfs.f chesv.f chesvx.f chetd2.f - chetf2.f chetrd.f - chetrf.f chetri.f chetri2.f chetri2x.f cheswapr.f 
- chetrs.f chetrs2.f - chetf2_rook.f chetrf_rook.f chetri_rook.f - chetrs_rook.f checon_rook.f chesv_rook.f - chetf2_rk.f chetrf_rk.f chetri_3.f chetri_3x.f - chetrs_3.f checon_3.f chesv_rk.f - chesv_aa.f chesv_aa_2stage.f chetrf_aa.f chetrf_aa_2stage.f chetrs_aa.f chetrs_aa_2stage.f - chgeqz.f chpcon.f chpev.f chpevd.f - chpevx.f chpgst.f chpgv.f chpgvd.f chpgvx.f chprfs.f chpsv.f - chpsvx.f - chptrd.f chptrf.f chptri.f chptrs.f chsein.f chseqr.f clabrd.f - clacgv.f clacon.f clacn2.f clacp2.f clacpy.f clacrm.f clacrt.f cladiv.f - claed0.f claed7.f claed8.f - claein.f claesy.f claev2.f clags2.f clagtm.f - clahef.f clahef_rook.f clahef_rk.f clahef_aa.f clahqr.f - clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f - clanhb.f clanhe.f - clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f - clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f - claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f - claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f - claqz0.f claqz1.f claqz2.f claqz3.f - claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f - clarf.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f - clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f90 clartv.f - clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f90 - clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f - clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrz.f - cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f - cpbsvx.f cpbtf2.f cpbtrf.f cpbtrs.f cpocon.f cpoequ.f cporfs.f - cposv.f cposvx.f cpotrf2.f cpotri.f cpstrf.f cpstf2.f - cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f - cptcon.f cpteqr.f cptrfs.f cptsv.f cptsvx.f cpttrf.f cpttrs.f cptts2.f - crot.f cspcon.f csprfs.f cspsv.f - cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f - cstegr.f cstein.f csteqr.f csycon.f - csyrfs.f csysv.f csysvx.f csytf2.f csytrf.f csytri.f - csytri2.f csytri2x.f csyswapr.f - csytrs.f csytrs2.f - csyconv.f csyconvf.f csyconvf_rook.f - 
csytf2_rook.f csytrf_rook.f csytrs_rook.f - csytri_rook.f csycon_rook.f csysv_rook.f - csytf2_rk.f csytrf_rk.f csytrf_aa.f csytrf_aa_2stage.f csytrs_3.f csytrs_aa.f csytrs_aa_2stage.f - csytri_3.f csytri_3x.f csycon_3.f csysv_rk.f csysv_aa.f csysv_aa_2stage.f - ctbcon.f ctbrfs.f ctbtrs.f ctgevc.f ctgex2.f - ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f - ctprfs.f ctptri.f - ctptrs.f ctrcon.f ctrevc.f ctrevc3.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f - ctrsyl.f ctrtrs.f ctzrzf.f cung2l.f cung2r.f - cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f - cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f cunm22.f - cunmlq.f cunmql.f cunmqr.f cunmr2.f cunmr3.f cunmrq.f cunmrz.f - cunmtr.f cupgtr.f cupmtr.f icmax1.f scsum1.f cstemr.f - chfrk.f ctfttp.f clanhf.f cpftrf.f cpftri.f cpftrs.f ctfsm.f ctftri.f - ctfttr.f ctpttf.f ctpttr.f ctrttf.f ctrttp.f - cgeequb.f cgbequb.f csyequb.f cpoequb.f cheequb.f - cbbcsd.f clapmr.f cunbdb.f cunbdb1.f cunbdb2.f cunbdb3.f cunbdb4.f - cunbdb5.f cunbdb6.f cuncsd.f cuncsd2by1.f - cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f - ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f - cgelqt.f cgelqt3.f cgemlqt.f - cgetsls.f cgetsqrhrt.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f - cgelq.f claswlq.f clamswlq.f cgemlq.f - ctplqt.f ctplqt2.f ctpmlqt.f - chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f - cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f - chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f - cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f - cungtsqr.f cungtsqr_row.f cunhr_col.f ) - -set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f - cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f - csysvxx.f csyrfsx.f cla_syrfsx_extended.f cla_syamv.f - cla_syrcond_c.f cla_syrcond_x.f cla_syrpvgrw.f - cposvxx.f cporfsx.f cla_porfsx_extended.f - cla_porcond_c.f cla_porcond_x.f cla_porpvgrw.f - cgbsvxx.f cgbrfsx.f cla_gbrfsx_extended.f cla_gbamv.f - cla_gbrcond_c.f 
cla_gbrcond_x.f cla_gbrpvgrw.f - chesvxx.f cherfsx.f cla_herfsx_extended.f cla_heamv.f - cla_hercond_c.f cla_hercond_x.f cla_herpvgrw.f - cla_lin_berr.f clarscl2.f clascl2.f cla_wwaddw.f) - -set(DLASRC - dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f - dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f - dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f - dgehd2.f dgehrd.f dgelq2.f dgelqf.f - dgels.f dgelsd.f dgelss.f dgelsy.f dgeql2.f dgeqlf.f - dgeqp3.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f - dgesc2.f dgesdd.f dgesvd.f dgesvdx.f dgesvx.f dgetc2.f - dgetrf2.f dgetri.f - dggbak.f dggbal.f - dgges.f dgges3.f dggesx.f dggev.f dggev3.f dggevx.f - dggglm.f dgghrd.f dgghd3.f dgglse.f dggqrf.f - dggrqf.f dggsvd3.f dggsvp3.f dgtcon.f dgtrfs.f dgtsv.f - dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f - dlaqz0.f dlaqz1.f dlaqz2.f dlaqz3.f dlaqz4.f - dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f - dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f - dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f - dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f - dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f - dlapll.f dlapmt.f - dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f - dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f - dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f - dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f - dlargv.f dlarrv.f dlartv.f - dlarz.f dlarzb.f dlarzt.f dlasy2.f - dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f - dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f - dopgtr.f dopmtr.f dorg2l.f dorg2r.f - dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f - dorgrq.f dorgtr.f dorm2l.f dorm2r.f dorm22.f - dormbr.f dormhr.f dorml2.f dormlq.f dormql.f dormqr.f dormr2.f - dormr3.f dormrq.f dormrz.f dormtr.f dpbcon.f dpbequ.f dpbrfs.f - dpbstf.f dpbsv.f dpbsvx.f - dpbtf2.f dpbtrf.f dpbtrs.f dpocon.f dpoequ.f dporfs.f dposv.f - dposvx.f dpotrf2.f 
dpotri.f dpotrs.f dpstrf.f dpstf2.f - dppcon.f dppequ.f - dpprfs.f dppsv.f dppsvx.f dpptrf.f dpptri.f dpptrs.f dptcon.f - dpteqr.f dptrfs.f dptsv.f dptsvx.f dpttrs.f dptts2.f drscl.f - dsbev.f dsbevd.f dsbevx.f dsbgst.f dsbgv.f dsbgvd.f dsbgvx.f - dsbtrd.f dspcon.f dspev.f dspevd.f dspevx.f dspgst.f - dspgv.f dspgvd.f dspgvx.f dsprfs.f dspsv.f dspsvx.f dsptrd.f - dsptrf.f dsptri.f dsptrs.f dstegr.f dstev.f dstevd.f dstevr.f - dsycon.f dsyev.f dsyevd.f dsyevr.f - dsyevx.f dsygs2.f dsygst.f dsygv.f dsygvd.f dsygvx.f dsyrfs.f - dsysv.f dsysvx.f - dsytd2.f dsytf2.f dsytrd.f dsytrf.f dsytri.f dsytrs.f dsytrs2.f - dsytri2.f dsytri2x.f dsyswapr.f - dsyconv.f dsyconvf.f dsyconvf_rook.f - dsytf2_rook.f dsytrf_rook.f dsytrs_rook.f - dsytri_rook.f dsycon_rook.f dsysv_rook.f - dsytf2_rk.f dsytrf_rk.f dsytrs_3.f - dsytri_3.f dsytri_3x.f dsycon_3.f dsysv_rk.f - dsysv_aa.f dsysv_aa_2stage.f dsytrf_aa.f dsytrf_aa_2stage.f dsytrs_aa.f dsytrs_aa_2stage.f - dtbcon.f - dtbrfs.f dtbtrs.f dtgevc.f dtgex2.f dtgexc.f dtgsen.f - dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f - dtptrs.f - dtrcon.f dtrevc.f dtrevc3.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f - dtrtrs.f dtzrzf.f dstemr.f - dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f - dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f - dtfttr.f dtpttf.f dtpttr.f dtrttf.f dtrttp.f - dgejsv.f dgesvj.f dgsvj0.f dgsvj1.f - dgeequb.f dsyequb.f dpoequb.f dgbequb.f - dbbcsd.f dlapmr.f dorbdb.f dorbdb1.f dorbdb2.f dorbdb3.f dorbdb4.f - dorbdb5.f dorbdb6.f dorcsd.f dorcsd2by1.f - dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f - dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f - dgelqt.f dgelqt3.f dgemlqt.f - dgetsls.f dgetsqrhrt.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f - dgelq.f dlaswlq.f dlamswlq.f dgemlq.f - dtplqt.f dtplqt2.f dtpmlqt.f - dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f - dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f - dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f - 
dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f - dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f ) - -set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f - dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f - dla_syrfsx_extended.f dla_syamv.f dla_syrcond.f dla_syrpvgrw.f - dposvxx.f dporfsx.f dla_porfsx_extended.f dla_porcond.f - dla_porpvgrw.f dgbsvxx.f dgbrfsx.f dla_gbrfsx_extended.f - dla_gbamv.f dla_gbrcond.f dla_gbrpvgrw.f dla_lin_berr.f dlarscl2.f - dlascl2.f dla_wwaddw.f) - -set(ZLASRC - zbdsqr.f zgbbrd.f zgbcon.f zgbequ.f zgbrfs.f zgbsv.f zgbsvx.f - zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f - zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f - zgehd2.f zgehrd.f zgelq2.f zgelqf.f - zgels.f zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f - zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f - zgesc2.f zgesdd.f zgesvd.f zgesvdx.f zgesvx.f - zgesvj.f zgejsv.f zgsvj0.f zgsvj1.f - zgetc2.f zgetrf2.f - zgetri.f - zggbak.f zggbal.f - zgges.f zgges3.f zggesx.f zggev.f zggev3.f zggevx.f - zggglm.f zgghrd.f zgghd3.f zgglse.f zggqrf.f zggrqf.f - zggsvd3.f zggsvp3.f - zgtcon.f zgtrfs.f zgtsv.f zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f - zhbevd.f zhbevx.f zhbgst.f zhbgv.f zhbgvd.f zhbgvx.f zhbtrd.f - zhecon.f zheev.f zheevd.f zheevr.f zheevx.f zhegs2.f zhegst.f - zhegv.f zhegvd.f zhegvx.f zherfs.f zhesv.f zhesvx.f zhetd2.f - zhetf2.f zhetrd.f - zhetrf.f zhetri.f zhetri2.f zhetri2x.f zheswapr.f - zhetrs.f zhetrs2.f - zhetf2_rook.f zhetrf_rook.f zhetri_rook.f - zhetrs_rook.f zhecon_rook.f zhesv_rook.f - zhetf2_rk.f zhetrf_rk.f zhetri_3.f zhetri_3x.f - zhetrs_3.f zhecon_3.f zhesv_rk.f - zhesv_aa.f zhesv_aa_2stage.f zhetrf_aa.f zhetrf_aa_2stage.f zhetrs_aa.f zhetrs_aa_2stage.f - zhgeqz.f zhpcon.f zhpev.f zhpevd.f - zlaqz0.f zlaqz1.f zlaqz2.f zlaqz3.f - zhpevx.f zhpgst.f zhpgv.f zhpgvd.f zhpgvx.f zhprfs.f zhpsv.f - zhpsvx.f - zhptrd.f zhptrf.f zhptri.f zhptrs.f zhsein.f zhseqr.f zlabrd.f - zlacgv.f zlacon.f 
zlacn2.f zlacp2.f zlacpy.f zlacrm.f zlacrt.f zladiv.f - zlaed0.f zlaed7.f zlaed8.f - zlaein.f zlaesy.f zlaev2.f zlags2.f zlagtm.f - zlahef.f zlahef_rook.f zlahef_rk.f zlahef_aa.f zlahqr.f - zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f - zlangt.f zlanhb.f - zlanhe.f - zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f - zlantp.f zlantr.f zlapll.f zlapmt.f zlaqgb.f zlaqge.f - zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f - zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f - zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f - zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f - zlarfg.f zlarfgp.f zlarft.f - zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f90 zlartv.f - zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f - zlassq.f90 zlasyf.f zlasyf_rook.f zlasyf_rk.f zlasyf_aa.f - zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f - zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f - zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f - zposv.f zposvx.f zpotrf2.f zpotri.f zpotrs.f zpstrf.f zpstf2.f - zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f - zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f - zrot.f zspcon.f zsprfs.f zspsv.f - zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f - zstegr.f zstein.f zsteqr.f zsycon.f - zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f - zsytri2.f zsytri2x.f zsyswapr.f - zsytrs.f zsytrs2.f - zsyconv.f zsyconvf.f zsyconvf_rook.f - zsytf2_rook.f zsytrf_rook.f zsytrs_rook.f zsytrs_aa.f zsytrs_aa_2stage.f - zsytri_rook.f zsycon_rook.f zsysv_rook.f - zsytf2_rk.f zsytrf_rk.f zsytrf_aa.f zsytrf_aa_2stage.f zsytrs_3.f - zsytri_3.f zsytri_3x.f zsycon_3.f zsysv_rk.f zsysv_aa.f zsysv_aa_2stage.f - ztbcon.f ztbrfs.f ztbtrs.f ztgevc.f ztgex2.f - ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f - ztprfs.f ztptri.f - ztptrs.f ztrcon.f ztrevc.f ztrevc3.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f - ztrsyl.f ztrtrs.f ztzrzf.f zung2l.f - zung2r.f 
zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f - zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f zunm22.f - zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f - zunmtr.f zupgtr.f - zupmtr.f izmax1.f dzsum1.f zstemr.f - zcgesv.f zcposv.f zlag2c.f clag2z.f zlat2c.f - zhfrk.f ztfttp.f zlanhf.f zpftrf.f zpftri.f zpftrs.f ztfsm.f ztftri.f - ztfttr.f ztpttf.f ztpttr.f ztrttf.f ztrttp.f - zgeequb.f zgbequb.f zsyequb.f zpoequb.f zheequb.f - zbbcsd.f zlapmr.f zunbdb.f zunbdb1.f zunbdb2.f zunbdb3.f zunbdb4.f - zunbdb5.f zunbdb6.f zuncsd.f zuncsd2by1.f - zgeqrt.f zgeqrt2.f zgeqrt3.f zgemqrt.f - ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f - ztplqt.f ztplqt2.f ztpmlqt.f - zgelqt.f zgelqt3.f zgemlqt.f - zgetsls.f zgetsqrhrt.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f - zgelq.f zlaswlq.f zlamswlq.f zgemlq.f - zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f - zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f - zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f - zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f - zungtsqr.f zungtsqr_row.f zunhr_col.f) - -set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f - zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f - zla_syrfsx_extended.f zla_syamv.f zla_syrcond_c.f zla_syrcond_x.f - zla_syrpvgrw.f zposvxx.f zporfsx.f zla_porfsx_extended.f - zla_porcond_c.f zla_porcond_x.f zla_porpvgrw.f zgbsvxx.f zgbrfsx.f - zla_gbrfsx_extended.f zla_gbamv.f zla_gbrcond_c.f zla_gbrcond_x.f - zla_gbrpvgrw.f zhesvxx.f zherfsx.f zla_herfsx_extended.f - zla_heamv.f zla_hercond_c.f zla_hercond_x.f zla_herpvgrw.f - zla_lin_berr.f zlarscl2.f zlascl2.f zla_wwaddw.f) - - -if(USE_XBLAS) - set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) -endif() - -list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f - DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f - DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f) 
-list(APPEND DLASRC DEPRECATED/dgegs.f DEPRECATED/dgegv.f - DEPRECATED/dgeqpf.f DEPRECATED/dgelsx.f DEPRECATED/dggsvd.f - DEPRECATED/dggsvp.f DEPRECATED/dlahrd.f DEPRECATED/dlatzm.f DEPRECATED/dtzrqf.f) -list(APPEND CLASRC DEPRECATED/cgegs.f DEPRECATED/cgegv.f - DEPRECATED/cgeqpf.f DEPRECATED/cgelsx.f DEPRECATED/cggsvd.f - DEPRECATED/cggsvp.f DEPRECATED/clahrd.f DEPRECATED/clatzm.f DEPRECATED/ctzrqf.f) -list(APPEND ZLASRC DEPRECATED/zgegs.f DEPRECATED/zgegv.f - DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f - DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f) -message(STATUS "Building deprecated routines") - -set(DSLASRC spotrs.f) - -set(ZCLASRC cpotrs.f) - -set(SCATGEN slatm1.f slaran.f slarnd.f) - -set(SMATGEN slatms.f slatme.f slatmr.f slatmt.f - slagge.f slagsy.f slakf2.f slarge.f slaror.f slarot.f slatm2.f - slatm3.f slatm5.f slatm6.f slatm7.f slahilb.f) - -set(CMATGEN clatms.f clatme.f clatmr.f clatmt.f - clagge.f claghe.f clagsy.f clakf2.f clarge.f claror.f clarot.f - clatm1.f clarnd.f clatm2.f clatm3.f clatm5.f clatm6.f clahilb.f slatm7.f) - -set(DZATGEN dlatm1.f dlaran.f dlarnd.f) - -set(DMATGEN dlatms.f dlatme.f dlatmr.f dlatmt.f - dlagge.f dlagsy.f dlakf2.f dlarge.f dlaror.f dlarot.f dlatm2.f - dlatm3.f dlatm5.f dlatm6.f dlatm7.f dlahilb.f) - -set(ZMATGEN zlatms.f zlatme.f zlatmr.f zlatmt.f - zlagge.f zlaghe.f zlagsy.f zlakf2.f zlarge.f zlaror.f zlarot.f - zlatm1.f zlarnd.f zlatm2.f zlatm3.f zlatm5.f zlatm6.f zlahilb.f dlatm7.f) - -if(BUILD_SINGLE) - set(LA_REL_SRC ${SLASRC} ${DSLASRC} ${ALLAUX} ${SCLAUX}) - set(LA_GEN_SRC ${SMATGEN} ${SCATGEN}) - message(STATUS "Building Single Precision") -endif() -if(BUILD_DOUBLE) - set(LA_REL_SRC ${LA_REL_SRC} ${DLASRC} ${DSLASRC} ${ALLAUX} ${DZLAUX}) - set(LA_GEN_SRC ${LA_GEN_SRC} ${DMATGEN} ${DZATGEN}) - message(STATUS "Building Double Precision") -endif() -if(BUILD_COMPLEX) - set(LA_REL_SRC ${LA_REL_SRC} ${CLASRC} ${ZCLASRC} ${ALLAUX} ${SCLAUX}) - SET(LA_GEN_SRC 
${LA_GEN_SRC} ${CMATGEN} ${SCATGEN}) - message(STATUS "Building Single Precision Complex") -endif() -if(BUILD_COMPLEX16) - set(LA_REL_SRC ${LA_REL_SRC} ${ZLASRC} ${ZCLASRC} ${ALLAUX} ${DZLAUX}) - SET(LA_GEN_SRC ${LA_GEN_SRC} ${ZMATGEN} ${DZATGEN}) -# for zlange/zlanhe - if (NOT BUILD_DOUBLE) - set (LA_REL_SRC ${LA_REL_SRC} dcombssq.f) - endif () - message(STATUS "Building Double Precision Complex") -endif() - -else () - - message (STATUS "c lapack") -set(ALLAUX ilaenv.c ilaenv2stage.c ieeeck.c lsamen.c iparmq.c iparam2stage.c - ilaprec.c ilatrans.c ilauplo.c iladiag.c chla_transtype.c dlaset.c - ../INSTALL/ilaver.c xerbla_array.c - ../INSTALL/slamch.c) - -set(SCLAUX - scombssq.c sbdsvdx.c sstevx.c sstein.c - sbdsdc.c - sbdsqr.c sdisna.c slabad.c slacpy.c sladiv.c slae2.c slaebz.c - slaed0.c slaed1.c slaed2.c slaed3.c slaed4.c slaed5.c slaed6.c - slaed7.c slaed8.c slaed9.c slaeda.c slaev2.c slagtf.c - slagts.c slamrg.c slanst.c - slapy2.c slapy3.c slarnv.c - slarra.c slarrb.c slarrc.c slarrd.c slarre.c slarrf.c slarrj.c - slarrk.c slarrr.c slaneg.c - slartg.c slaruv.c slas2.c slascl.c - slasd0.c slasd1.c slasd2.c slasd3.c slasd4.c slasd5.c slasd6.c - slasd7.c slasd8.c slasda.c slasdq.c slasdt.c - slaset.c slasq1.c slasq2.c slasq3.c slasq4.c slasq5.c slasq6.c - slasr.c slasrt.c slassq.c slasv2.c spttrf.c sstebz.c sstedc.c - ssteqr.c ssterf.c slaisnan.c sisnan.c - slartgp.c slartgs.c - ../INSTALL/second_${TIMER}.c) - -set(DZLAUX - dbdsdc.c - dbdsvdx.c dstevx.c dstein.c - dbdsqr.c ddisna.c dlabad.c dlacpy.c dladiv.c dlae2.c dlaebz.c - dlaed0.c dlaed1.c dlaed2.c dlaed3.c dlaed4.c dlaed5.c dlaed6.c - dlaed7.c dlaed8.c dlaed9.c dlaeda.c dlaev2.c dlagtf.c - dlagts.c dlamrg.c dlanst.c - dlapy2.c dlapy3.c dlarnv.c - dlarra.c dlarrb.c dlarrc.c dlarrd.c dlarre.c dlarrf.c dlarrj.c - dlarrk.c dlarrr.c dlaneg.c - dlartg.c dlaruv.c dlas2.c dlascl.c - dlasd0.c dlasd1.c dlasd2.c dlasd3.c dlasd4.c dlasd5.c dlasd6.c - dlasd7.c dlasd8.c dlasda.c dlasdq.c dlasdt.c - dlasq1.c dlasq2.c 
dlasq3.c dlasq4.c dlasq5.c dlasq6.c - dlasr.c dlasrt.c dlassq.c dlasv2.c dpttrf.c dstebz.c dstedc.c - dsteqr.c dsterf.c dlaisnan.c disnan.c - dlartgp.c dlartgs.c - ../INSTALL/dlamch.c ../INSTALL/dsecnd_${TIMER}.c) - -set(SLASRC - sgbbrd.c sgbcon.c sgbequ.c sgbrfs.c sgbsv.c - sgbsvx.c sgbtf2.c sgbtrf.c sgbtrs.c sgebak.c sgebal.c sgebd2.c - sgebrd.c sgecon.c sgeequ.c sgees.c sgeesx.c sgeev.c sgeevx.c - sgehd2.c sgehrd.c sgelq2.c sgelqf.c - sgels.c sgelsd.c sgelss.c sgelsy.c sgeql2.c sgeqlf.c - sgeqp3.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c - sgesc2.c sgesdd.c sgesvd.c sgesvdx.c sgesvx.c sgetc2.c - sgetrf2.c sgetri.c - sggbak.c sggbal.c - sgges.c sgges3.c sggesx.c sggev.c sggev3.c sggevx.c - sggglm.c sgghrd.c sgghd3.c sgglse.c sggqrf.c - sggrqf.c sggsvd3.c sggsvp3.c sgtcon.c sgtrfs.c sgtsv.c - sgtsvx.c sgttrf.c sgttrs.c sgtts2.c shgeqz.c - shsein.c shseqr.c slabrd.c slacon.c slacn2.c - slaein.c slaexc.c slag2.c slags2.c slagtm.c slagv2.c slahqr.c - slahr2.c slaic1.c slaln2.c slals0.c slalsa.c slalsd.c - slangb.c slange.c slangt.c slanhs.c slansb.c slansp.c - slansy.c slantb.c slantp.c slantr.c slanv2.c - slapll.c slapmt.c - slaqgb.c slaqge.c slaqp2.c slaqps.c slaqsb.c slaqsp.c slaqsy.c - slaqr0.c slaqr1.c slaqr2.c slaqr3.c slaqr4.c slaqr5.c - slaqtr.c slar1v.c slar2v.c ilaslr.c ilaslc.c - slarf.c slarfb.c slarfb_gett.c slarfg.c slarfgp.c slarft.c slarfx.c slarfy.c slargv.c - slarrv.c slartv.c - slarz.c slarzb.c slarzt.c slasy2.c - slasyf.c slasyf_rook.c slasyf_rk.c slasyf_aa.c - slatbs.c slatdf.c slatps.c slatrd.c slatrs.c slatrz.c - sopgtr.c sopmtr.c sorg2l.c sorg2r.c - sorgbr.c sorghr.c sorgl2.c sorglq.c sorgql.c sorgqr.c sorgr2.c - sorgrq.c sorgtr.c sorm2l.c sorm2r.c sorm22.c - sormbr.c sormhr.c sorml2.c sormlq.c sormql.c sormqr.c sormr2.c - sormr3.c sormrq.c sormrz.c sormtr.c spbcon.c spbequ.c spbrfs.c - spbstf.c spbsv.c spbsvx.c - spbtf2.c spbtrf.c spbtrs.c spocon.c spoequ.c sporfs.c sposv.c - sposvx.c spotrf2.c spotri.c spstrf.c 
spstf2.c - sppcon.c sppequ.c - spprfs.c sppsv.c sppsvx.c spptrf.c spptri.c spptrs.c sptcon.c - spteqr.c sptrfs.c sptsv.c sptsvx.c spttrs.c sptts2.c srscl.c - ssbev.c ssbevd.c ssbevx.c ssbgst.c ssbgv.c ssbgvd.c ssbgvx.c - ssbtrd.c sspcon.c sspev.c sspevd.c sspevx.c sspgst.c - sspgv.c sspgvd.c sspgvx.c ssprfs.c sspsv.c sspsvx.c ssptrd.c - ssptrf.c ssptri.c ssptrs.c sstegr.c sstev.c sstevd.c sstevr.c - ssycon.c ssyev.c ssyevd.c ssyevr.c ssyevx.c ssygs2.c - ssygst.c ssygv.c ssygvd.c ssygvx.c ssyrfs.c ssysv.c ssysvx.c - ssytd2.c ssytf2.c ssytrd.c ssytrf.c ssytri.c ssytri2.c ssytri2x.c - ssyswapr.c ssytrs.c ssytrs2.c - ssyconv.c ssyconvf.c ssyconvf_rook.c - ssysv_aa.c ssysv_aa_2stage.c ssytrf_aa.c ssytrf_aa_2stage.c ssytrs_aa.c ssytrs_aa_2stage.c - ssytf2_rook.c ssytrf_rook.c ssytrs_rook.c - ssytri_rook.c ssycon_rook.c ssysv_rook.c - ssytf2_rk.c ssytrf_rk.c ssytrs_3.c - ssytri_3.c ssytri_3x.c ssycon_3.c ssysv_rk.c - ssysv_aa.c ssytrf_aa.c ssytrs_aa.c - stbcon.c - stbrfs.c stbtrs.c stgevc.c stgex2.c stgexc.c stgsen.c - stgsja.c stgsna.c stgsy2.c stgsyl.c stpcon.c stprfs.c stptri.c - stptrs.c - strcon.c strevc.c strevc3.c strexc.c strrfs.c strsen.c strsna.c strsyl.c - strtrs.c stzrzf.c sstemr.c - slansf.c spftrf.c spftri.c spftrs.c ssfrk.c stfsm.c stftri.c stfttp.c - stfttr.c stpttf.c stpttr.c strttf.c strttp.c - sgejsv.c sgesvj.c sgsvj0.c sgsvj1.c - sgeequb.c ssyequb.c spoequb.c sgbequb.c - sbbcsd.c slapmr.c sorbdb.c sorbdb1.c sorbdb2.c sorbdb3.c sorbdb4.c - sorbdb5.c sorbdb6.c sorcsd.c sorcsd2by1.c - sgeqrt.c sgeqrt2.c sgeqrt3.c sgemqrt.c - stpqrt.c stpqrt2.c stpmqrt.c stprfb.c - sgelqt.c sgelqt3.c sgemlqt.c - sgetsls.c sgetsqrhrt.c sgeqr.c slatsqr.c slamtsqr.c sgemqr.c - sgelq.c slaswlq.c slamswlq.c sgemlq.c - stplqt.c stplqt2.c stpmlqt.c - ssytrd_2stage.c ssytrd_sy2sb.c ssytrd_sb2st.c ssb2st_kernels.c - ssyevd_2stage.c ssyev_2stage.c ssyevx_2stage.c ssyevr_2stage.c - ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c - sgesvdq.c slaorhr_col_getrfnp.c - 
slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c ) - -set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c - sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c - sla_syrfsx_extended.c sla_syamv.c sla_syrcond.c sla_syrpvgrw.c - sposvxx.c sporfsx.c sla_porfsx_extended.c sla_porcond.c - sla_porpvgrw.c sgbsvxx.c sgbrfsx.c sla_gbrfsx_extended.c - sla_gbamv.c sla_gbrcond.c sla_gbrpvgrw.c sla_lin_berr.c slarscl2.c - slascl2.c sla_wwaddw.c) - -set(CLASRC - cbdsqr.c cgbbrd.c cgbcon.c cgbequ.c cgbrfs.c cgbsv.c cgbsvx.c - cgbtf2.c cgbtrf.c cgbtrs.c cgebak.c cgebal.c cgebd2.c cgebrd.c - cgecon.c cgeequ.c cgees.c cgeesx.c cgeev.c cgeevx.c - cgehd2.c cgehrd.c cgelq2.c cgelqf.c - cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c - cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c - cgesc2.c cgesdd.c cgesvd.c cgesvdx.c - cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c - cgesvx.c cgetc2.c cgetrf2.c - cgetri.c - cggbak.c cggbal.c - cgges.c cgges3.c cggesx.c cggev.c cggev3.c cggevx.c - cggglm.c cgghrd.c cgghd3.c cgglse.c cggqrf.c cggrqf.c - cggsvd3.c cggsvp3.c - cgtcon.c cgtrfs.c cgtsv.c cgtsvx.c cgttrf.c cgttrs.c cgtts2.c chbev.c - chbevd.c chbevx.c chbgst.c chbgv.c chbgvd.c chbgvx.c chbtrd.c - checon.c cheev.c cheevd.c cheevr.c cheevx.c chegs2.c chegst.c - chegv.c chegvd.c chegvx.c cherfs.c chesv.c chesvx.c chetd2.c - chetf2.c chetrd.c - chetrf.c chetri.c chetri2.c chetri2x.c cheswapr.c - chetrs.c chetrs2.c - chetf2_rook.c chetrf_rook.c chetri_rook.c - chetrs_rook.c checon_rook.c chesv_rook.c - chetf2_rk.c chetrf_rk.c chetri_3.c chetri_3x.c - chetrs_3.c checon_3.c chesv_rk.c - chesv_aa.c chesv_aa_2stage.c chetrf_aa.c chetrf_aa_2stage.c chetrs_aa.c chetrs_aa_2stage.c - chgeqz.c chpcon.c chpev.c chpevd.c - chpevx.c chpgst.c chpgv.c chpgvd.c chpgvx.c chprfs.c chpsv.c - chpsvx.c - chptrd.c chptrf.c chptri.c chptrs.c chsein.c chseqr.c clabrd.c - clacgv.c clacon.c clacn2.c clacp2.c clacpy.c clacrm.c clacrt.c cladiv.c - claed0.c claed7.c claed8.c - 
claein.c claesy.c claev2.c clags2.c clagtm.c - clahef.c clahef_rook.c clahef_rk.c clahef_aa.c clahqr.c - clahr2.c claic1.c clals0.c clalsa.c clalsd.c clangb.c clange.c clangt.c - clanhb.c clanhe.c - clanhp.c clanhs.c clanht.c clansb.c clansp.c clansy.c clantb.c - clantp.c clantr.c clapll.c clapmt.c clarcm.c claqgb.c claqge.c - claqhb.c claqhe.c claqhp.c claqp2.c claqps.c claqsb.c - claqr0.c claqr1.c claqr2.c claqr3.c claqr4.c claqr5.c - claqsp.c claqsy.c clar1v.c clar2v.c ilaclr.c ilaclc.c - clarf.c clarfb.c clarfb_gett.c clarfg.c clarfgp.c clarft.c - clarfx.c clarfy.c clargv.c clarnv.c clarrv.c clartg.c clartv.c - clarz.c clarzb.c clarzt.c clascl.c claset.c clasr.c classq.c - clasyf.c clasyf_rook.c clasyf_rk.c clasyf_aa.c - clatbs.c clatdf.c clatps.c clatrd.c clatrs.c clatrz.c - cpbcon.c cpbequ.c cpbrfs.c cpbstf.c cpbsv.c - cpbsvx.c cpbtf2.c cpbtrf.c cpbtrs.c cpocon.c cpoequ.c cporfs.c - cposv.c cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c - cppcon.c cppequ.c cpprfs.c cppsv.c cppsvx.c cpptrf.c cpptri.c cpptrs.c - cptcon.c cpteqr.c cptrfs.c cptsv.c cptsvx.c cpttrf.c cpttrs.c cptts2.c - crot.c cspcon.c csprfs.c cspsv.c - cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c - cstegr.c cstein.c csteqr.c csycon.c - csyrfs.c csysv.c csysvx.c csytf2.c csytrf.c csytri.c - csytri2.c csytri2x.c csyswapr.c - csytrs.c csytrs2.c - csyconv.c csyconvf.c csyconvf_rook.c - csytf2_rook.c csytrf_rook.c csytrs_rook.c - csytri_rook.c csycon_rook.c csysv_rook.c - csytf2_rk.c csytrf_rk.c csytrf_aa.c csytrf_aa_2stage.c csytrs_3.c csytrs_aa.c csytrs_aa_2stage.c - csytri_3.c csytri_3x.c csycon_3.c csysv_rk.c csysv_aa.c csysv_aa_2stage.c - ctbcon.c ctbrfs.c ctbtrs.c ctgevc.c ctgex2.c - ctgexc.c ctgsen.c ctgsja.c ctgsna.c ctgsy2.c ctgsyl.c ctpcon.c - ctprfs.c ctptri.c - ctptrs.c ctrcon.c ctrevc.c ctrevc3.c ctrexc.c ctrrfs.c ctrsen.c ctrsna.c - ctrsyl.c ctrtrs.c ctzrzf.c cung2l.c cung2r.c - cungbr.c cunghr.c cungl2.c cunglq.c cungql.c cungqr.c cungr2.c - cungrq.c cungtr.c cunm2l.c cunm2r.c 
cunmbr.c cunmhr.c cunml2.c cunm22.c - cunmlq.c cunmql.c cunmqr.c cunmr2.c cunmr3.c cunmrq.c cunmrz.c - cunmtr.c cupgtr.c cupmtr.c icmax1.c scsum1.c cstemr.c - chfrk.c ctfttp.c clanhf.c cpftrf.c cpftri.c cpftrs.c ctfsm.c ctftri.c - ctfttr.c ctpttf.c ctpttr.c ctrttf.c ctrttp.c - cgeequb.c cgbequb.c csyequb.c cpoequb.c cheequb.c - cbbcsd.c clapmr.c cunbdb.c cunbdb1.c cunbdb2.c cunbdb3.c cunbdb4.c - cunbdb5.c cunbdb6.c cuncsd.c cuncsd2by1.c - cgeqrt.c cgeqrt2.c cgeqrt3.c cgemqrt.c - ctpqrt.c ctpqrt2.c ctpmqrt.c ctprfb.c - cgelqt.c cgelqt3.c cgemlqt.c - cgetsls.c cgetsqrhrt.c cgeqr.c clatsqr.c clamtsqr.c cgemqr.c - cgelq.c claswlq.c clamswlq.c cgemlq.c - ctplqt.c ctplqt2.c ctpmlqt.c - chetrd_2stage.c chetrd_he2hb.c chetrd_hb2st.c chb2st_kernels.c - cheevd_2stage.c cheev_2stage.c cheevx_2stage.c cheevr_2stage.c - chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c - cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c - cungtsqr.c cungtsqr_row.c cunhr_col.c ) - -set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c - cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c - csysvxx.c csyrfsx.c cla_syrfsx_extended.c cla_syamv.c - cla_syrcond_c.c cla_syrcond_x.c cla_syrpvgrw.c - cposvxx.c cporfsx.c cla_porfsx_extended.c - cla_porcond_c.c cla_porcond_x.c cla_porpvgrw.c - cgbsvxx.c cgbrfsx.c cla_gbrfsx_extended.c cla_gbamv.c - cla_gbrcond_c.c cla_gbrcond_x.c cla_gbrpvgrw.c - chesvxx.c cherfsx.c cla_herfsx_extended.c cla_heamv.c - cla_hercond_c.c cla_hercond_x.c cla_herpvgrw.c - cla_lin_berr.c clarscl2.c clascl2.c cla_wwaddw.c) - -set(DLASRC - dgbbrd.c dgbcon.c dgbequ.c dgbrfs.c dgbsv.c - dgbsvx.c dgbtf2.c dgbtrf.c dgbtrs.c dgebak.c dgebal.c dgebd2.c - dgebrd.c dgecon.c dgeequ.c dgees.c dgeesx.c dgeev.c dgeevx.c - dgehd2.c dgehrd.c dgelq2.c dgelqf.c - dgels.c dgelsd.c dgelss.c dgelsy.c dgeql2.c dgeqlf.c - dgeqp3.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c - dgesc2.c dgesdd.c dgesvd.c dgesvdx.c dgesvx.c dgetc2.c - dgetrf2.c dgetri.c - 
dggbak.c dggbal.c - dgges.c dgges3.c dggesx.c dggev.c dggev3.c dggevx.c - dggglm.c dgghrd.c dgghd3.c dgglse.c dggqrf.c - dggrqf.c dggsvd3.c dggsvp3.c dgtcon.c dgtrfs.c dgtsv.c - dgtsvx.c dgttrf.c dgttrs.c dgtts2.c dhgeqz.c - dhsein.c dhseqr.c dlabrd.c dlacon.c dlacn2.c - dlaein.c dlaexc.c dlag2.c dlags2.c dlagtm.c dlagv2.c dlahqr.c - dlahr2.c dlaic1.c dlaln2.c dlals0.c dlalsa.c dlalsd.c - dlangb.c dlange.c dlangt.c dlanhs.c dlansb.c dlansp.c - dlansy.c dlantb.c dlantp.c dlantr.c dlanv2.c - dlapll.c dlapmt.c - dlaqgb.c dlaqge.c dlaqp2.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c - dlaqr0.c dlaqr1.c dlaqr2.c dlaqr3.c dlaqr4.c dlaqr5.c - dlaqtr.c dlar1v.c dlar2v.c iladlr.c iladlc.c - dlarf.c dlarfb.c dlarfb_gett.c dlarfg.c dlarfgp.c dlarft.c dlarfx.c dlarfy.c - dlargv.c dlarrv.c dlartv.c - dlarz.c dlarzb.c dlarzt.c dlasy2.c - dlasyf.c dlasyf_rook.c dlasyf_rk.c dlasyf_aa.c - dlatbs.c dlatdf.c dlatps.c dlatrd.c dlatrs.c dlatrz.c - dopgtr.c dopmtr.c dorg2l.c dorg2r.c - dorgbr.c dorghr.c dorgl2.c dorglq.c dorgql.c dorgqr.c dorgr2.c - dorgrq.c dorgtr.c dorm2l.c dorm2r.c dorm22.c - dormbr.c dormhr.c dorml2.c dormlq.c dormql.c dormqr.c dormr2.c - dormr3.c dormrq.c dormrz.c dormtr.c dpbcon.c dpbequ.c dpbrfs.c - dpbstf.c dpbsv.c dpbsvx.c - dpbtf2.c dpbtrf.c dpbtrs.c dpocon.c dpoequ.c dporfs.c dposv.c - dposvx.c dpotrf2.c dpotri.c dpotrs.c dpstrf.c dpstf2.c - dppcon.c dppequ.c - dpprfs.c dppsv.c dppsvx.c dpptrf.c dpptri.c dpptrs.c dptcon.c - dpteqr.c dptrfs.c dptsv.c dptsvx.c dpttrs.c dptts2.c drscl.c - dsbev.c dsbevd.c dsbevx.c dsbgst.c dsbgv.c dsbgvd.c dsbgvx.c - dsbtrd.c dspcon.c dspev.c dspevd.c dspevx.c dspgst.c - dspgv.c dspgvd.c dspgvx.c dsprfs.c dspsv.c dspsvx.c dsptrd.c - dsptrf.c dsptri.c dsptrs.c dstegr.c dstev.c dstevd.c dstevr.c - dsycon.c dsyev.c dsyevd.c dsyevr.c - dsyevx.c dsygs2.c dsygst.c dsygv.c dsygvd.c dsygvx.c dsyrfs.c - dsysv.c dsysvx.c - dsytd2.c dsytf2.c dsytrd.c dsytrf.c dsytri.c dsytrs.c dsytrs2.c - dsytri2.c dsytri2x.c dsyswapr.c - dsyconv.c dsyconvf.c 
dsyconvf_rook.c - dsytf2_rook.c dsytrf_rook.c dsytrs_rook.c - dsytri_rook.c dsycon_rook.c dsysv_rook.c - dsytf2_rk.c dsytrf_rk.c dsytrs_3.c - dsytri_3.c dsytri_3x.c dsycon_3.c dsysv_rk.c - dsysv_aa.c dsysv_aa_2stage.c dsytrf_aa.c dsytrf_aa_2stage.c dsytrs_aa.c dsytrs_aa_2stage.c - dtbcon.c - dtbrfs.c dtbtrs.c dtgevc.c dtgex2.c dtgexc.c dtgsen.c - dtgsja.c dtgsna.c dtgsy2.c dtgsyl.c dtpcon.c dtprfs.c dtptri.c - dtptrs.c - dtrcon.c dtrevc.c dtrevc3.c dtrexc.c dtrrfs.c dtrsen.c dtrsna.c dtrsyl.c - dtrtrs.c dtzrzf.c dstemr.c - dsgesv.c dsposv.c dlag2s.c slag2d.c dlat2s.c - dlansf.c dpftrf.c dpftri.c dpftrs.c dsfrk.c dtfsm.c dtftri.c dtfttp.c - dtfttr.c dtpttf.c dtpttr.c dtrttf.c dtrttp.c - dgejsv.c dgesvj.c dgsvj0.c dgsvj1.c - dgeequb.c dsyequb.c dpoequb.c dgbequb.c - dbbcsd.c dlapmr.c dorbdb.c dorbdb1.c dorbdb2.c dorbdb3.c dorbdb4.c - dorbdb5.c dorbdb6.c dorcsd.c dorcsd2by1.c - dgeqrt.c dgeqrt2.c dgeqrt3.c dgemqrt.c - dtpqrt.c dtpqrt2.c dtpmqrt.c dtprfb.c - dgelqt.c dgelqt3.c dgemlqt.c - dgetsls.c dgetsqrhrt.c dgeqr.c dlatsqr.c dlamtsqr.c dgemqr.c - dgelq.c dlaswlq.c dlamswlq.c dgemlq.c - dtplqt.c dtplqt2.c dtpmlqt.c - dsytrd_2stage.c dsytrd_sy2sb.c dsytrd_sb2st.c dsb2st_kernels.c - dsyevd_2stage.c dsyev_2stage.c dsyevx_2stage.c dsyevr_2stage.c - dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c - dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c - dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c ) - -set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c - dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c - dla_syrfsx_extended.c dla_syamv.c dla_syrcond.c dla_syrpvgrw.c - dposvxx.c dporfsx.c dla_porfsx_extended.c dla_porcond.c - dla_porpvgrw.c dgbsvxx.c dgbrfsx.c dla_gbrfsx_extended.c - dla_gbamv.c dla_gbrcond.c dla_gbrpvgrw.c dla_lin_berr.c dlarscl2.c - dlascl2.c dla_wwaddw.c) - -set(ZLASRC - zbdsqr.c zgbbrd.c zgbcon.c zgbequ.c zgbrfs.c zgbsv.c zgbsvx.c - zgbtf2.c zgbtrf.c zgbtrs.c zgebak.c zgebal.c zgebd2.c zgebrd.c - zgecon.c zgeequ.c 
zgees.c zgeesx.c zgeev.c zgeevx.c - zgehd2.c zgehrd.c zgelq2.c zgelqf.c - zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c - zgeqr2.c zgeqr2p.c zgeqrf.c zgeqrfp.c zgerfs.c zgerq2.c zgerqf.c - zgesc2.c zgesdd.c zgesvd.c zgesvdx.c zgesvx.c - zgesvj.c zgejsv.c zgsvj0.c zgsvj1.c - zgetc2.c zgetrf2.c - zgetri.c - zggbak.c zggbal.c - zgges.c zgges3.c zggesx.c zggev.c zggev3.c zggevx.c - zggglm.c zgghrd.c zgghd3.c zgglse.c zggqrf.c zggrqf.c - zggsvd3.c zggsvp3.c - zgtcon.c zgtrfs.c zgtsv.c zgtsvx.c zgttrf.c zgttrs.c zgtts2.c zhbev.c - zhbevd.c zhbevx.c zhbgst.c zhbgv.c zhbgvd.c zhbgvx.c zhbtrd.c - zhecon.c zheev.c zheevd.c zheevr.c zheevx.c zhegs2.c zhegst.c - zhegv.c zhegvd.c zhegvx.c zherfs.c zhesv.c zhesvx.c zhetd2.c - zhetf2.c zhetrd.c - zhetrf.c zhetri.c zhetri2.c zhetri2x.c zheswapr.c - zhetrs.c zhetrs2.c - zhetf2_rook.c zhetrf_rook.c zhetri_rook.c - zhetrs_rook.c zhecon_rook.c zhesv_rook.c - zhetf2_rk.c zhetrf_rk.c zhetri_3.c zhetri_3x.c - zhetrs_3.c zhecon_3.c zhesv_rk.c - zhesv_aa.c zhesv_aa_2stage.c zhetrf_aa.c zhetrf_aa_2stage.c zhetrs_aa.c zhetrs_aa_2stage.c - zhgeqz.c zhpcon.c zhpev.c zhpevd.c - zhpevx.c zhpgst.c zhpgv.c zhpgvd.c zhpgvx.c zhprfs.c zhpsv.c - zhpsvx.c - zhptrd.c zhptrf.c zhptri.c zhptrs.c zhsein.c zhseqr.c zlabrd.c - zlacgv.c zlacon.c zlacn2.c zlacp2.c zlacpy.c zlacrm.c zlacrt.c zladiv.c - zlaed0.c zlaed7.c zlaed8.c - zlaein.c zlaesy.c zlaev2.c zlags2.c zlagtm.c - zlahef.c zlahef_rook.c zlahef_rk.c zlahef_aa.c zlahqr.c - zlahr2.c zlaic1.c zlals0.c zlalsa.c zlalsd.c zlangb.c zlange.c - zlangt.c zlanhb.c - zlanhe.c - zlanhp.c zlanhs.c zlanht.c zlansb.c zlansp.c zlansy.c zlantb.c - zlantp.c zlantr.c zlapll.c zlapmt.c zlaqgb.c zlaqge.c - zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqps.c zlaqsb.c - zlaqr0.c zlaqr1.c zlaqr2.c zlaqr3.c zlaqr4.c zlaqr5.c - zlaqsp.c zlaqsy.c zlar1v.c zlar2v.c ilazlr.c ilazlc.c - zlarcm.c zlarf.c zlarfb.c zlarfb_gett.c - zlarfg.c zlarfgp.c zlarft.c - zlarfx.c zlarfy.c zlargv.c zlarnv.c zlarrv.c zlartg.c zlartv.c 
- zlarz.c zlarzb.c zlarzt.c zlascl.c zlaset.c zlasr.c - zlassq.c zlasyf.c zlasyf_rook.c zlasyf_rk.c zlasyf_aa.c - zlatbs.c zlatdf.c zlatps.c zlatrd.c zlatrs.c zlatrz.c - zpbcon.c zpbequ.c zpbrfs.c zpbstf.c zpbsv.c - zpbsvx.c zpbtf2.c zpbtrf.c zpbtrs.c zpocon.c zpoequ.c zporfs.c - zposv.c zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c - zppcon.c zppequ.c zpprfs.c zppsv.c zppsvx.c zpptrf.c zpptri.c zpptrs.c - zptcon.c zpteqr.c zptrfs.c zptsv.c zptsvx.c zpttrf.c zpttrs.c zptts2.c - zrot.c zspcon.c zsprfs.c zspsv.c - zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c - zstegr.c zstein.c zsteqr.c zsycon.c - zsyrfs.c zsysv.c zsysvx.c zsytf2.c zsytrf.c zsytri.c - zsytri2.c zsytri2x.c zsyswapr.c - zsytrs.c zsytrs2.c - zsyconv.c zsyconvf.c zsyconvf_rook.c - zsytf2_rook.c zsytrf_rook.c zsytrs_rook.c zsytrs_aa.c zsytrs_aa_2stage.c - zsytri_rook.c zsycon_rook.c zsysv_rook.c - zsytf2_rk.c zsytrf_rk.c zsytrf_aa.c zsytrf_aa_2stage.c zsytrs_3.c - zsytri_3.c zsytri_3x.c zsycon_3.c zsysv_rk.c zsysv_aa.c zsysv_aa_2stage.c - ztbcon.c ztbrfs.c ztbtrs.c ztgevc.c ztgex2.c - ztgexc.c ztgsen.c ztgsja.c ztgsna.c ztgsy2.c ztgsyl.c ztpcon.c - ztprfs.c ztptri.c - ztptrs.c ztrcon.c ztrevc.c ztrevc3.c ztrexc.c ztrrfs.c ztrsen.c ztrsna.c - ztrsyl.c ztrtrs.c ztzrzf.c zung2l.c - zung2r.c zungbr.c zunghr.c zungl2.c zunglq.c zungql.c zungqr.c zungr2.c - zungrq.c zungtr.c zunm2l.c zunm2r.c zunmbr.c zunmhr.c zunml2.c zunm22.c - zunmlq.c zunmql.c zunmqr.c zunmr2.c zunmr3.c zunmrq.c zunmrz.c - zunmtr.c zupgtr.c - zupmtr.c izmax1.c dzsum1.c zstemr.c - zcgesv.c zcposv.c zlag2c.c clag2z.c zlat2c.c - zhfrk.c ztfttp.c zlanhf.c zpftrf.c zpftri.c zpftrs.c ztfsm.c ztftri.c - ztfttr.c ztpttf.c ztpttr.c ztrttf.c ztrttp.c - zgeequb.c zgbequb.c zsyequb.c zpoequb.c zheequb.c - zbbcsd.c zlapmr.c zunbdb.c zunbdb1.c zunbdb2.c zunbdb3.c zunbdb4.c - zunbdb5.c zunbdb6.c zuncsd.c zuncsd2by1.c - zgeqrt.c zgeqrt2.c zgeqrt3.c zgemqrt.c - ztpqrt.c ztpqrt2.c ztpmqrt.c ztprfb.c - ztplqt.c ztplqt2.c ztpmlqt.c - zgelqt.c 
zgelqt3.c zgemlqt.c - zgetsls.c zgetsqrhrt.c zgeqr.c zlatsqr.c zlamtsqr.c zgemqr.c - zgelq.c zlaswlq.c zlamswlq.c zgemlq.c - zhetrd_2stage.c zhetrd_he2hb.c zhetrd_hb2st.c zhb2st_kernels.c - zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c - zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c - zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c - zungtsqr.c zungtsqr_row.c zunhr_col.c) - -set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c - zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c - zla_syrfsx_extended.c zla_syamv.c zla_syrcond_c.c zla_syrcond_x.c - zla_syrpvgrw.c zposvxx.c zporfsx.c zla_porfsx_extended.c - zla_porcond_c.c zla_porcond_x.c zla_porpvgrw.c zgbsvxx.c zgbrfsx.c - zla_gbrfsx_extended.c zla_gbamv.c zla_gbrcond_c.c zla_gbrcond_x.c - zla_gbrpvgrw.c zhesvxx.c zherfsx.c zla_herfsx_extended.c - zla_heamv.c zla_hercond_c.c zla_hercond_x.c zla_herpvgrw.c - zla_lin_berr.c zlarscl2.c zlascl2.c zla_wwaddw.c) - - -if(USE_XBLAS) - set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) -endif() - -list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c - DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c - DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c) -list(APPEND DLASRC DEPRECATED/dgegs.c DEPRECATED/dgegv.c - DEPRECATED/dgeqpf.c DEPRECATED/dgelsx.c DEPRECATED/dggsvd.c - DEPRECATED/dggsvp.c DEPRECATED/dlahrd.c DEPRECATED/dlatzm.c DEPRECATED/dtzrqf.c) -list(APPEND CLASRC DEPRECATED/cgegs.c DEPRECATED/cgegv.c - DEPRECATED/cgeqpf.c DEPRECATED/cgelsx.c DEPRECATED/cggsvd.c - DEPRECATED/cggsvp.c DEPRECATED/clahrd.c DEPRECATED/clatzm.c DEPRECATED/ctzrqf.c) -list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c - DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c - DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c) -message(STATUS "Building deprecated routines") - -set(DSLASRC spotrs.c) - -set(ZCLASRC cpotrs.c) - 
-set(SCATGEN slatm1.c slaran.c slarnd.c) - -set(SMATGEN slatms.c slatme.c slatmr.c slatmt.c - slagge.c slagsy.c slakf2.c slarge.c slaror.c slarot.c slatm2.c - slatm3.c slatm5.c slatm6.c slatm7.c slahilb.c) - -set(CMATGEN clatms.c clatme.c clatmr.c clatmt.c - clagge.c claghe.c clagsy.c clakf2.c clarge.c claror.c clarot.c - clatm1.c clarnd.c clatm2.c clatm3.c clatm5.c clatm6.c clahilb.c slatm7.c) - -set(DZATGEN dlatm1.c dlaran.c dlarnd.c) - -set(DMATGEN dlatms.c dlatme.c dlatmr.c dlatmt.c - dlagge.c dlagsy.c dlakf2.c dlarge.c dlaror.c dlarot.c dlatm2.c - dlatm3.c dlatm5.c dlatm6.c dlatm7.c dlahilb.c) - -set(ZMATGEN zlatms.c zlatme.c zlatmr.c zlatmt.c - zlagge.c zlaghe.c zlagsy.c zlakf2.c zlarge.c zlaror.c zlarot.c - zlatm1.c zlarnd.c zlatm2.c zlatm3.c zlatm5.c zlatm6.c zlahilb.c dlatm7.c) - -if(BUILD_SINGLE) - set(LA_REL_SRC ${SLASRC} ${DSLASRC} ${ALLAUX} ${SCLAUX}) - set(LA_GEN_SRC ${SMATGEN} ${SCATGEN}) - message(STATUS "Building Single Precision") -endif() -if(BUILD_DOUBLE) - set(LA_REL_SRC ${LA_REL_SRC} ${DLASRC} ${DSLASRC} ${ALLAUX} ${DZLAUX}) - set(LA_GEN_SRC ${LA_GEN_SRC} ${DMATGEN} ${DZATGEN}) - message(STATUS "Building Double Precision") -endif() -if(BUILD_COMPLEX) - set(LA_REL_SRC ${LA_REL_SRC} ${CLASRC} ${ZCLASRC} ${ALLAUX} ${SCLAUX}) - SET(LA_GEN_SRC ${LA_GEN_SRC} ${CMATGEN} ${SCATGEN}) - message(STATUS "Building Single Precision Complex") -endif() -if(BUILD_COMPLEX16) - set(LA_REL_SRC ${LA_REL_SRC} ${ZLASRC} ${ZCLASRC} ${ALLAUX} ${DZLAUX}) - SET(LA_GEN_SRC ${LA_GEN_SRC} ${ZMATGEN} ${DZATGEN}) -# for zlange/zlanhe - if (NOT BUILD_DOUBLE) - set (LA_REL_SRC ${LA_REL_SRC} dcombssq.c) - endif () - message(STATUS "Building Double Precision Complex") -endif() - -endif() - -# add lapack-netlib folder to the sources -set(LA_SOURCES "") -foreach (LA_FILE ${LA_REL_SRC}) - list(APPEND LA_SOURCES "${NETLIB_LAPACK_DIR}/SRC/${LA_FILE}") -endforeach () -foreach (LA_FILE ${LA_GEN_SRC}) - list(APPEND LA_SOURCES "${NETLIB_LAPACK_DIR}/TESTING/MATGEN/${LA_FILE}") -endforeach 
() - -if (NOT C_LAPACK) - set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") -else () - set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") -endif () +# Sources for compiling lapack-netlib. Can't use CMakeLists.txt because lapack-netlib already has its own cmake files. +if (NOT C_LAPACK) + message (STATUS "fortran lapack") +set(ALLAUX ilaenv.f ilaenv2stage.f ieeeck.f lsamen.f iparmq.f iparam2stage.F + ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f dlaset.f la_xisnan.F90 + ../INSTALL/ilaver.f xerbla_array.f + ../INSTALL/slamch.f) + +set(SCLAUX + scombssq.f sbdsvdx.f sstevx.f sstein.f + la_constants.f90 + sbdsdc.f + sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f + slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f + slaed7.f slaed8.f slaed9.f slaeda.f slaev2.f slagtf.f + slagts.f slamrg.f slanst.f + slapy2.f slapy3.f slarnv.f + slarra.f slarrb.f slarrc.f slarrd.f slarre.f slarrf.f slarrj.f + slarrk.f slarrr.f slaneg.f + slartg.f90 slaruv.f slas2.f slascl.f + slasd0.f slasd1.f slasd2.f slasd3.f slasd4.f slasd5.f slasd6.f + slasd7.f slasd8.f slasda.f slasdq.f slasdt.f + slaset.f slasq1.f slasq2.f slasq3.f slasq4.f slasq5.f slasq6.f + slasr.f slasrt.f slassq.f90 slasv2.f spttrf.f sstebz.f sstedc.f + ssteqr.f ssterf.f slaisnan.f sisnan.f slarmm.f + slartgp.f slartgs.f ../INSTALL/sroundup_lwork.f + ../INSTALL/second_${TIMER}.f) + +set(DZLAUX + la_constants.f90 + dbdsdc.f + dbdsvdx.f dstevx.f dstein.f + dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f + dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f + dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f + dlagts.f dlamrg.f dlanst.f + dlapy2.f dlapy3.f dlarnv.f + dlarra.f dlarrb.f dlarrc.f dlarrd.f dlarre.f dlarrf.f dlarrj.f + dlarrk.f dlarrr.f dlaneg.f + dlartg.f90 dlaruv.f dlas2.f dlascl.f + dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f + dlasd7.f dlasd8.f dlasda.f 
dlasdq.f dlasdt.f + dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f + dlasr.f dlasrt.f dlassq.f90 dlasv2.f dpttrf.f dstebz.f dstedc.f + dsteqr.f dsterf.f dlaisnan.f disnan.f dlarmm.f + dlartgp.f dlartgs.f ../INSTALL/droundup_lwork.f + ../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f) + +set(SLASRC + sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f + sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f + sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f + sgehd2.f sgehrd.f sgelq2.f sgelqf.f + sgels.f sgelsd.f sgelss.f sgelsy.f sgeql2.f sgeqlf.f + sgeqp3.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f + sgesc2.f sgesdd.f sgesvd.f sgesvdx.f sgesvx.f sgetc2.f + sgetrf2.f sgetri.f + sggbak.f sggbal.f + sgges.f sgges3.f sggesx.f sggev.f sggev3.f sggevx.f + sggglm.f sgghrd.f sgghd3.f sgglse.f sggqrf.f + sggrqf.f sggsvd3.f sggsvp3.f sgtcon.f sgtrfs.f sgtsv.f + sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f + shsein.f shseqr.f slabrd.f slacon.f slacn2.f + slaqz0.f slaqz1.f slaqz2.f slaqz3.f slaqz4.f + slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f + slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f + slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f + slansy.f slantb.f slantp.f slantr.f slanv2.f + slapll.f slapmt.f + slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f + slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f + slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f + slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f + slarrv.f slartv.f + slarz.f slarzb.f slarzt.f slasy2.f + slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f + slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f + sopgtr.f sopmtr.f sorg2l.f sorg2r.f + sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f + sorgrq.f sorgtr.f sorm2l.f sorm2r.f sorm22.f + sormbr.f sormhr.f sorml2.f sormlq.f sormql.f sormqr.f sormr2.f + sormr3.f sormrq.f sormrz.f sormtr.f spbcon.f spbequ.f spbrfs.f + spbstf.f 
spbsv.f spbsvx.f + spbtf2.f spbtrf.f spbtrs.f spocon.f spoequ.f sporfs.f sposv.f + sposvx.f spotrf2.f spotri.f spstrf.f spstf2.f + sppcon.f sppequ.f + spprfs.f sppsv.f sppsvx.f spptrf.f spptri.f spptrs.f sptcon.f + spteqr.f sptrfs.f sptsv.f sptsvx.f spttrs.f sptts2.f srscl.f + ssbev.f ssbevd.f ssbevx.f ssbgst.f ssbgv.f ssbgvd.f ssbgvx.f + ssbtrd.f sspcon.f sspev.f sspevd.f sspevx.f sspgst.f + sspgv.f sspgvd.f sspgvx.f ssprfs.f sspsv.f sspsvx.f ssptrd.f + ssptrf.f ssptri.f ssptrs.f sstegr.f sstev.f sstevd.f sstevr.f + ssycon.f ssyev.f ssyevd.f ssyevr.f ssyevx.f ssygs2.f + ssygst.f ssygv.f ssygvd.f ssygvx.f ssyrfs.f ssysv.f ssysvx.f + ssytd2.f ssytf2.f ssytrd.f ssytrf.f ssytri.f ssytri2.f ssytri2x.f + ssyswapr.f ssytrs.f ssytrs2.f + ssyconv.f ssyconvf.f ssyconvf_rook.f + ssysv_aa.f ssysv_aa_2stage.f ssytrf_aa.f ssytrf_aa_2stage.f ssytrs_aa.f ssytrs_aa_2stage.f + ssytf2_rook.f ssytrf_rook.f ssytrs_rook.f + ssytri_rook.f ssycon_rook.f ssysv_rook.f + ssytf2_rk.f ssytrf_rk.f ssytrs_3.f + ssytri_3.f ssytri_3x.f ssycon_3.f ssysv_rk.f + ssysv_aa.f ssytrf_aa.f ssytrs_aa.f + stbcon.f + stbrfs.f stbtrs.f stgevc.f stgex2.f stgexc.f stgsen.f + stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f + stptrs.f + strcon.f strevc.f strevc3.f strexc.f strrfs.f strsen.f strsna.f strsyl.f + strtrs.f stzrzf.f sstemr.f + slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f + stfttr.f stpttf.f stpttr.f strttf.f strttp.f + sgejsv.f sgesvj.f sgsvj0.f sgsvj1.f + sgeequb.f ssyequb.f spoequb.f sgbequb.f + sbbcsd.f slapmr.f sorbdb.f sorbdb1.f sorbdb2.f sorbdb3.f sorbdb4.f + sorbdb5.f sorbdb6.f sorcsd.f sorcsd2by1.f + sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f + stpqrt.f stpqrt2.f stpmqrt.f stprfb.f + sgelqt.f sgelqt3.f sgemlqt.f + sgetsls.f sgetsqrhrt.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f + sgelq.f slaswlq.f slamswlq.f sgemlq.f + stplqt.f stplqt2.f stpmlqt.f + ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f + ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f 
ssyevr_2stage.f + ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f + sgesvdq.f slaorhr_col_getrfnp.f + slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f + slatrs3.f strsyl3.f sgelst.f) + +set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f + sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f + sla_syrfsx_extended.f sla_syamv.f sla_syrcond.f sla_syrpvgrw.f + sposvxx.f sporfsx.f sla_porfsx_extended.f sla_porcond.f + sla_porpvgrw.f sgbsvxx.f sgbrfsx.f sla_gbrfsx_extended.f + sla_gbamv.f sla_gbrcond.f sla_gbrpvgrw.f sla_lin_berr.f slarscl2.f + slascl2.f sla_wwaddw.f) + +set(CLASRC + cbdsqr.f cgbbrd.f cgbcon.f cgbequ.f cgbrfs.f cgbsv.f cgbsvx.f + cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f + cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f + cgehd2.f cgehrd.f cgelq2.f cgelqf.f + cgels.f cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f + cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f cgerq2.f cgerqf.f + cgesc2.f cgesdd.f cgesvd.f cgesvdx.f + cgesvj.f cgejsv.f cgsvj0.f cgsvj1.f + cgesvx.f cgetc2.f cgetrf2.f + cgetri.f + cggbak.f cggbal.f + cgges.f cgges3.f cggesx.f cggev.f cggev3.f cggevx.f + cggglm.f cgghrd.f cgghd3.f cgglse.f cggqrf.f cggrqf.f + cggsvd3.f cggsvp3.f + cgtcon.f cgtrfs.f cgtsv.f cgtsvx.f cgttrf.f cgttrs.f cgtts2.f chbev.f + chbevd.f chbevx.f chbgst.f chbgv.f chbgvd.f chbgvx.f chbtrd.f + checon.f cheev.f cheevd.f cheevr.f cheevx.f chegs2.f chegst.f + chegv.f chegvd.f chegvx.f cherfs.f chesv.f chesvx.f chetd2.f + chetf2.f chetrd.f + chetrf.f chetri.f chetri2.f chetri2x.f cheswapr.f + chetrs.f chetrs2.f + chetf2_rook.f chetrf_rook.f chetri_rook.f + chetrs_rook.f checon_rook.f chesv_rook.f + chetf2_rk.f chetrf_rk.f chetri_3.f chetri_3x.f + chetrs_3.f checon_3.f chesv_rk.f + chesv_aa.f chesv_aa_2stage.f chetrf_aa.f chetrf_aa_2stage.f chetrs_aa.f chetrs_aa_2stage.f + chgeqz.f chpcon.f chpev.f chpevd.f + chpevx.f chpgst.f chpgv.f chpgvd.f chpgvx.f chprfs.f chpsv.f + chpsvx.f + chptrd.f chptrf.f 
chptri.f chptrs.f chsein.f chseqr.f clabrd.f + clacgv.f clacon.f clacn2.f clacp2.f clacpy.f clacrm.f clacrt.f cladiv.f + claed0.f claed7.f claed8.f + claein.f claesy.f claev2.f clags2.f clagtm.f + clahef.f clahef_rook.f clahef_rk.f clahef_aa.f clahqr.f + clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f + clanhb.f clanhe.f + clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f + clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f + claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f + claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f + claqz0.f claqz1.f claqz2.f claqz3.f + claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f + clarf.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f + clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f90 clartv.f + clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f90 + clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f + clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrz.f + cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f + cpbsvx.f cpbtf2.f cpbtrf.f cpbtrs.f cpocon.f cpoequ.f cporfs.f + cposv.f cposvx.f cpotrf2.f cpotri.f cpstrf.f cpstf2.f + cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f + cptcon.f cpteqr.f cptrfs.f cptsv.f cptsvx.f cpttrf.f cpttrs.f cptts2.f + crot.f cspcon.f csprfs.f cspsv.f + cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f + cstegr.f cstein.f csteqr.f csycon.f + csyrfs.f csysv.f csysvx.f csytf2.f csytrf.f csytri.f + csytri2.f csytri2x.f csyswapr.f + csytrs.f csytrs2.f + csyconv.f csyconvf.f csyconvf_rook.f + csytf2_rook.f csytrf_rook.f csytrs_rook.f + csytri_rook.f csycon_rook.f csysv_rook.f + csytf2_rk.f csytrf_rk.f csytrf_aa.f csytrf_aa_2stage.f csytrs_3.f csytrs_aa.f csytrs_aa_2stage.f + csytri_3.f csytri_3x.f csycon_3.f csysv_rk.f csysv_aa.f csysv_aa_2stage.f + ctbcon.f ctbrfs.f ctbtrs.f ctgevc.f ctgex2.f + ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f + ctprfs.f ctptri.f + ctptrs.f ctrcon.f ctrevc.f 
ctrevc3.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f + ctrsyl.f ctrtrs.f ctzrzf.f cung2l.f cung2r.f + cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f + cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f cunm22.f + cunmlq.f cunmql.f cunmqr.f cunmr2.f cunmr3.f cunmrq.f cunmrz.f + cunmtr.f cupgtr.f cupmtr.f icmax1.f scsum1.f cstemr.f + chfrk.f ctfttp.f clanhf.f cpftrf.f cpftri.f cpftrs.f ctfsm.f ctftri.f + ctfttr.f ctpttf.f ctpttr.f ctrttf.f ctrttp.f + cgeequb.f cgbequb.f csyequb.f cpoequb.f cheequb.f + cbbcsd.f clapmr.f cunbdb.f cunbdb1.f cunbdb2.f cunbdb3.f cunbdb4.f + cunbdb5.f cunbdb6.f cuncsd.f cuncsd2by1.f + cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f + ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f + cgelqt.f cgelqt3.f cgemlqt.f + cgetsls.f cgetsqrhrt.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f + cgelq.f claswlq.f clamswlq.f cgemlq.f + ctplqt.f ctplqt2.f ctpmlqt.f + chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f + cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f + chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f + cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f + cungtsqr.f cungtsqr_row.f cunhr_col.f + clatrs3.f ctrsyl3.f cgelst.f) + +set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f + cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f + csysvxx.f csyrfsx.f cla_syrfsx_extended.f cla_syamv.f + cla_syrcond_c.f cla_syrcond_x.f cla_syrpvgrw.f + cposvxx.f cporfsx.f cla_porfsx_extended.f + cla_porcond_c.f cla_porcond_x.f cla_porpvgrw.f + cgbsvxx.f cgbrfsx.f cla_gbrfsx_extended.f cla_gbamv.f + cla_gbrcond_c.f cla_gbrcond_x.f cla_gbrpvgrw.f + chesvxx.f cherfsx.f cla_herfsx_extended.f cla_heamv.f + cla_hercond_c.f cla_hercond_x.f cla_herpvgrw.f + cla_lin_berr.f clarscl2.f clascl2.f cla_wwaddw.f) + +set(DLASRC + dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f + dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f + dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f + dgehd2.f dgehrd.f 
dgelq2.f dgelqf.f + dgels.f dgelsd.f dgelss.f dgelsy.f dgeql2.f dgeqlf.f + dgeqp3.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f + dgesc2.f dgesdd.f dgesvd.f dgesvdx.f dgesvx.f dgetc2.f + dgetrf2.f dgetri.f + dggbak.f dggbal.f + dgges.f dgges3.f dggesx.f dggev.f dggev3.f dggevx.f + dggglm.f dgghrd.f dgghd3.f dgglse.f dggqrf.f + dggrqf.f dggsvd3.f dggsvp3.f dgtcon.f dgtrfs.f dgtsv.f + dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f + dlaqz0.f dlaqz1.f dlaqz2.f dlaqz3.f dlaqz4.f + dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f + dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f + dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f + dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f + dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f + dlapll.f dlapmt.f + dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f + dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f + dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f + dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f + dlargv.f dlarrv.f dlartv.f + dlarz.f dlarzb.f dlarzt.f dlasy2.f + dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f + dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f + dopgtr.f dopmtr.f dorg2l.f dorg2r.f + dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f + dorgrq.f dorgtr.f dorm2l.f dorm2r.f dorm22.f + dormbr.f dormhr.f dorml2.f dormlq.f dormql.f dormqr.f dormr2.f + dormr3.f dormrq.f dormrz.f dormtr.f dpbcon.f dpbequ.f dpbrfs.f + dpbstf.f dpbsv.f dpbsvx.f + dpbtf2.f dpbtrf.f dpbtrs.f dpocon.f dpoequ.f dporfs.f dposv.f + dposvx.f dpotrf2.f dpotri.f dpotrs.f dpstrf.f dpstf2.f + dppcon.f dppequ.f + dpprfs.f dppsv.f dppsvx.f dpptrf.f dpptri.f dpptrs.f dptcon.f + dpteqr.f dptrfs.f dptsv.f dptsvx.f dpttrs.f dptts2.f drscl.f + dsbev.f dsbevd.f dsbevx.f dsbgst.f dsbgv.f dsbgvd.f dsbgvx.f + dsbtrd.f dspcon.f dspev.f dspevd.f dspevx.f dspgst.f + dspgv.f dspgvd.f dspgvx.f dsprfs.f dspsv.f dspsvx.f dsptrd.f + dsptrf.f dsptri.f dsptrs.f 
dstegr.f dstev.f dstevd.f dstevr.f + dsycon.f dsyev.f dsyevd.f dsyevr.f + dsyevx.f dsygs2.f dsygst.f dsygv.f dsygvd.f dsygvx.f dsyrfs.f + dsysv.f dsysvx.f + dsytd2.f dsytf2.f dsytrd.f dsytrf.f dsytri.f dsytrs.f dsytrs2.f + dsytri2.f dsytri2x.f dsyswapr.f + dsyconv.f dsyconvf.f dsyconvf_rook.f + dsytf2_rook.f dsytrf_rook.f dsytrs_rook.f + dsytri_rook.f dsycon_rook.f dsysv_rook.f + dsytf2_rk.f dsytrf_rk.f dsytrs_3.f + dsytri_3.f dsytri_3x.f dsycon_3.f dsysv_rk.f + dsysv_aa.f dsysv_aa_2stage.f dsytrf_aa.f dsytrf_aa_2stage.f dsytrs_aa.f dsytrs_aa_2stage.f + dtbcon.f + dtbrfs.f dtbtrs.f dtgevc.f dtgex2.f dtgexc.f dtgsen.f + dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f + dtptrs.f + dtrcon.f dtrevc.f dtrevc3.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f + dtrtrs.f dtzrzf.f dstemr.f + dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f + dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f + dtfttr.f dtpttf.f dtpttr.f dtrttf.f dtrttp.f + dgejsv.f dgesvj.f dgsvj0.f dgsvj1.f + dgeequb.f dsyequb.f dpoequb.f dgbequb.f + dbbcsd.f dlapmr.f dorbdb.f dorbdb1.f dorbdb2.f dorbdb3.f dorbdb4.f + dorbdb5.f dorbdb6.f dorcsd.f dorcsd2by1.f + dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f + dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f + dgelqt.f dgelqt3.f dgemlqt.f + dgetsls.f dgetsqrhrt.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f + dgelq.f dlaswlq.f dlamswlq.f dgemlq.f + dtplqt.f dtplqt2.f dtpmlqt.f + dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f + dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f + dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f + dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f + dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f + dlatrs3.f dtrsyl3.f dgelst.f) + +set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f + dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f + dla_syrfsx_extended.f dla_syamv.f dla_syrcond.f dla_syrpvgrw.f + dposvxx.f dporfsx.f dla_porfsx_extended.f dla_porcond.f + 
dla_porpvgrw.f dgbsvxx.f dgbrfsx.f dla_gbrfsx_extended.f + dla_gbamv.f dla_gbrcond.f dla_gbrpvgrw.f dla_lin_berr.f dlarscl2.f + dlascl2.f dla_wwaddw.f) + +set(ZLASRC + zbdsqr.f zgbbrd.f zgbcon.f zgbequ.f zgbrfs.f zgbsv.f zgbsvx.f + zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f + zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f + zgehd2.f zgehrd.f zgelq2.f zgelqf.f + zgels.f zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f + zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f + zgesc2.f zgesdd.f zgesvd.f zgesvdx.f zgesvx.f + zgesvj.f zgejsv.f zgsvj0.f zgsvj1.f + zgetc2.f zgetrf2.f + zgetri.f + zggbak.f zggbal.f + zgges.f zgges3.f zggesx.f zggev.f zggev3.f zggevx.f + zggglm.f zgghrd.f zgghd3.f zgglse.f zggqrf.f zggrqf.f + zggsvd3.f zggsvp3.f + zgtcon.f zgtrfs.f zgtsv.f zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f + zhbevd.f zhbevx.f zhbgst.f zhbgv.f zhbgvd.f zhbgvx.f zhbtrd.f + zhecon.f zheev.f zheevd.f zheevr.f zheevx.f zhegs2.f zhegst.f + zhegv.f zhegvd.f zhegvx.f zherfs.f zhesv.f zhesvx.f zhetd2.f + zhetf2.f zhetrd.f + zhetrf.f zhetri.f zhetri2.f zhetri2x.f zheswapr.f + zhetrs.f zhetrs2.f + zhetf2_rook.f zhetrf_rook.f zhetri_rook.f + zhetrs_rook.f zhecon_rook.f zhesv_rook.f + zhetf2_rk.f zhetrf_rk.f zhetri_3.f zhetri_3x.f + zhetrs_3.f zhecon_3.f zhesv_rk.f + zhesv_aa.f zhesv_aa_2stage.f zhetrf_aa.f zhetrf_aa_2stage.f zhetrs_aa.f zhetrs_aa_2stage.f + zhgeqz.f zhpcon.f zhpev.f zhpevd.f + zlaqz0.f zlaqz1.f zlaqz2.f zlaqz3.f + zhpevx.f zhpgst.f zhpgv.f zhpgvd.f zhpgvx.f zhprfs.f zhpsv.f + zhpsvx.f + zhptrd.f zhptrf.f zhptri.f zhptrs.f zhsein.f zhseqr.f zlabrd.f + zlacgv.f zlacon.f zlacn2.f zlacp2.f zlacpy.f zlacrm.f zlacrt.f zladiv.f + zlaed0.f zlaed7.f zlaed8.f + zlaein.f zlaesy.f zlaev2.f zlags2.f zlagtm.f + zlahef.f zlahef_rook.f zlahef_rk.f zlahef_aa.f zlahqr.f + zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f + zlangt.f zlanhb.f + zlanhe.f + zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f + 
zlantp.f zlantr.f zlapll.f zlapmt.f zlaqgb.f zlaqge.f + zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f + zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f + zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f + zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f + zlarfg.f zlarfgp.f zlarft.f + zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f90 zlartv.f + zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f + zlassq.f90 zlasyf.f zlasyf_rook.f zlasyf_rk.f zlasyf_aa.f + zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f + zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f + zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f + zposv.f zposvx.f zpotrf2.f zpotri.f zpotrs.f zpstrf.f zpstf2.f + zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f + zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f + zrot.f zspcon.f zsprfs.f zspsv.f + zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f + zstegr.f zstein.f zsteqr.f zsycon.f + zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f + zsytri2.f zsytri2x.f zsyswapr.f + zsytrs.f zsytrs2.f + zsyconv.f zsyconvf.f zsyconvf_rook.f + zsytf2_rook.f zsytrf_rook.f zsytrs_rook.f zsytrs_aa.f zsytrs_aa_2stage.f + zsytri_rook.f zsycon_rook.f zsysv_rook.f + zsytf2_rk.f zsytrf_rk.f zsytrf_aa.f zsytrf_aa_2stage.f zsytrs_3.f + zsytri_3.f zsytri_3x.f zsycon_3.f zsysv_rk.f zsysv_aa.f zsysv_aa_2stage.f + ztbcon.f ztbrfs.f ztbtrs.f ztgevc.f ztgex2.f + ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f + ztprfs.f ztptri.f + ztptrs.f ztrcon.f ztrevc.f ztrevc3.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f + ztrsyl.f ztrtrs.f ztzrzf.f zung2l.f + zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f + zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f zunm22.f + zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f + zunmtr.f zupgtr.f + zupmtr.f izmax1.f dzsum1.f zstemr.f + zcgesv.f zcposv.f zlag2c.f clag2z.f zlat2c.f + zhfrk.f ztfttp.f zlanhf.f zpftrf.f 
zpftri.f zpftrs.f ztfsm.f ztftri.f + ztfttr.f ztpttf.f ztpttr.f ztrttf.f ztrttp.f + zgeequb.f zgbequb.f zsyequb.f zpoequb.f zheequb.f + zbbcsd.f zlapmr.f zunbdb.f zunbdb1.f zunbdb2.f zunbdb3.f zunbdb4.f + zunbdb5.f zunbdb6.f zuncsd.f zuncsd2by1.f + zgeqrt.f zgeqrt2.f zgeqrt3.f zgemqrt.f + ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f + ztplqt.f ztplqt2.f ztpmlqt.f + zgelqt.f zgelqt3.f zgemlqt.f + zgetsls.f zgetsqrhrt.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f + zgelq.f zlaswlq.f zlamswlq.f zgemlq.f + zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f + zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f + zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f + zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f + zungtsqr.f zungtsqr_row.f zunhr_col.f + zlatrs3.f ztrsyl3.f zgelst.f) + +set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f + zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f + zla_syrfsx_extended.f zla_syamv.f zla_syrcond_c.f zla_syrcond_x.f + zla_syrpvgrw.f zposvxx.f zporfsx.f zla_porfsx_extended.f + zla_porcond_c.f zla_porcond_x.f zla_porpvgrw.f zgbsvxx.f zgbrfsx.f + zla_gbrfsx_extended.f zla_gbamv.f zla_gbrcond_c.f zla_gbrcond_x.f + zla_gbrpvgrw.f zhesvxx.f zherfsx.f zla_herfsx_extended.f + zla_heamv.f zla_hercond_c.f zla_hercond_x.f zla_herpvgrw.f + zla_lin_berr.f zlarscl2.f zlascl2.f zla_wwaddw.f) + + +if(USE_XBLAS) + set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) +endif() + +list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f + DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f + DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f) +list(APPEND DLASRC DEPRECATED/dgegs.f DEPRECATED/dgegv.f + DEPRECATED/dgeqpf.f DEPRECATED/dgelsx.f DEPRECATED/dggsvd.f + DEPRECATED/dggsvp.f DEPRECATED/dlahrd.f DEPRECATED/dlatzm.f DEPRECATED/dtzrqf.f) +list(APPEND CLASRC DEPRECATED/cgegs.f DEPRECATED/cgegv.f + DEPRECATED/cgeqpf.f DEPRECATED/cgelsx.f 
DEPRECATED/cggsvd.f + DEPRECATED/cggsvp.f DEPRECATED/clahrd.f DEPRECATED/clatzm.f DEPRECATED/ctzrqf.f) +list(APPEND ZLASRC DEPRECATED/zgegs.f DEPRECATED/zgegv.f + DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f + DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f) +message(STATUS "Building deprecated routines") + +set(DSLASRC spotrs.f) + +set(ZCLASRC cpotrs.f) + +set(SCATGEN slatm1.f slaran.f slarnd.f) + +set(SMATGEN slatms.f slatme.f slatmr.f slatmt.f + slagge.f slagsy.f slakf2.f slarge.f slaror.f slarot.f slatm2.f + slatm3.f slatm5.f slatm6.f slatm7.f slahilb.f) + +set(CMATGEN clatms.f clatme.f clatmr.f clatmt.f + clagge.f claghe.f clagsy.f clakf2.f clarge.f claror.f clarot.f + clatm1.f clarnd.f clatm2.f clatm3.f clatm5.f clatm6.f clahilb.f slatm7.f) + +set(DZATGEN dlatm1.f dlaran.f dlarnd.f) + +set(DMATGEN dlatms.f dlatme.f dlatmr.f dlatmt.f + dlagge.f dlagsy.f dlakf2.f dlarge.f dlaror.f dlarot.f dlatm2.f + dlatm3.f dlatm5.f dlatm6.f dlatm7.f dlahilb.f) + +set(ZMATGEN zlatms.f zlatme.f zlatmr.f zlatmt.f + zlagge.f zlaghe.f zlagsy.f zlakf2.f zlarge.f zlaror.f zlarot.f + zlatm1.f zlarnd.f zlatm2.f zlatm3.f zlatm5.f zlatm6.f zlahilb.f dlatm7.f) + +if(BUILD_SINGLE) + set(LA_REL_SRC ${SLASRC} ${DSLASRC} ${ALLAUX} ${SCLAUX}) + set(LA_GEN_SRC ${SMATGEN} ${SCATGEN}) + message(STATUS "Building Single Precision") +endif() +if(BUILD_DOUBLE) + set(LA_REL_SRC ${LA_REL_SRC} ${DLASRC} ${DSLASRC} ${ALLAUX} ${DZLAUX}) + set(LA_GEN_SRC ${LA_GEN_SRC} ${DMATGEN} ${DZATGEN}) + message(STATUS "Building Double Precision") +endif() +if(BUILD_COMPLEX) + set(LA_REL_SRC ${LA_REL_SRC} ${CLASRC} ${ZCLASRC} ${ALLAUX} ${SCLAUX}) + SET(LA_GEN_SRC ${LA_GEN_SRC} ${CMATGEN} ${SCATGEN}) + message(STATUS "Building Single Precision Complex") +endif() +if(BUILD_COMPLEX16) + set(LA_REL_SRC ${LA_REL_SRC} ${ZLASRC} ${ZCLASRC} ${ALLAUX} ${DZLAUX}) + SET(LA_GEN_SRC ${LA_GEN_SRC} ${ZMATGEN} ${DZATGEN}) +# for zlange/zlanhe + if (NOT BUILD_DOUBLE) + set (LA_REL_SRC 
${LA_REL_SRC} dcombssq.f) + endif () + message(STATUS "Building Double Precision Complex") +endif() + +else () + + message (STATUS "c lapack") +set(ALLAUX ilaenv.c ilaenv2stage.c ieeeck.c lsamen.c iparmq.c iparam2stage.c + ilaprec.c ilatrans.c ilauplo.c iladiag.c chla_transtype.c dlaset.c + ../INSTALL/ilaver.c xerbla_array.c + ../INSTALL/slamch.c) + +set(SCLAUX + scombssq.c sbdsvdx.c sstevx.c sstein.c + sbdsdc.c + sbdsqr.c sdisna.c slabad.c slacpy.c sladiv.c slae2.c slaebz.c + slaed0.c slaed1.c slaed2.c slaed3.c slaed4.c slaed5.c slaed6.c + slaed7.c slaed8.c slaed9.c slaeda.c slaev2.c slagtf.c + slagts.c slamrg.c slanst.c + slapy2.c slapy3.c slarnv.c + slarra.c slarrb.c slarrc.c slarrd.c slarre.c slarrf.c slarrj.c + slarrk.c slarrr.c slaneg.c + slartg.c slaruv.c slas2.c slascl.c + slasd0.c slasd1.c slasd2.c slasd3.c slasd4.c slasd5.c slasd6.c + slasd7.c slasd8.c slasda.c slasdq.c slasdt.c + slaset.c slasq1.c slasq2.c slasq3.c slasq4.c slasq5.c slasq6.c + slasr.c slasrt.c slassq.c slasv2.c spttrf.c sstebz.c sstedc.c + ssteqr.c ssterf.c slaisnan.c sisnan.c + slartgp.c slartgs.c slarmm.c + ../INSTALL/second_${TIMER}.c) + +set(DZLAUX + dbdsdc.c + dbdsvdx.c dstevx.c dstein.c + dbdsqr.c ddisna.c dlabad.c dlacpy.c dladiv.c dlae2.c dlaebz.c + dlaed0.c dlaed1.c dlaed2.c dlaed3.c dlaed4.c dlaed5.c dlaed6.c + dlaed7.c dlaed8.c dlaed9.c dlaeda.c dlaev2.c dlagtf.c + dlagts.c dlamrg.c dlanst.c + dlapy2.c dlapy3.c dlarnv.c + dlarra.c dlarrb.c dlarrc.c dlarrd.c dlarre.c dlarrf.c dlarrj.c + dlarrk.c dlarrr.c dlaneg.c + dlartg.c dlaruv.c dlas2.c dlascl.c + dlasd0.c dlasd1.c dlasd2.c dlasd3.c dlasd4.c dlasd5.c dlasd6.c + dlasd7.c dlasd8.c dlasda.c dlasdq.c dlasdt.c + dlasq1.c dlasq2.c dlasq3.c dlasq4.c dlasq5.c dlasq6.c + dlasr.c dlasrt.c dlassq.c dlasv2.c dpttrf.c dstebz.c dstedc.c + dsteqr.c dsterf.c dlaisnan.c disnan.c + dlartgp.c dlartgs.c dlarmm.c + ../INSTALL/dlamch.c ../INSTALL/dsecnd_${TIMER}.c) + +set(SLASRC + sgbbrd.c sgbcon.c sgbequ.c sgbrfs.c sgbsv.c + sgbsvx.c sgbtf2.c 
sgbtrf.c sgbtrs.c sgebak.c sgebal.c sgebd2.c + sgebrd.c sgecon.c sgeequ.c sgees.c sgeesx.c sgeev.c sgeevx.c + sgehd2.c sgehrd.c sgelq2.c sgelqf.c + sgels.c sgelsd.c sgelss.c sgelsy.c sgeql2.c sgeqlf.c + sgeqp3.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c + sgesc2.c sgesdd.c sgesvd.c sgesvdx.c sgesvx.c sgetc2.c + sgetrf2.c sgetri.c + sggbak.c sggbal.c + sgges.c sgges3.c sggesx.c sggev.c sggev3.c sggevx.c + sggglm.c sgghrd.c sgghd3.c sgglse.c sggqrf.c + sggrqf.c sggsvd3.c sggsvp3.c sgtcon.c sgtrfs.c sgtsv.c + sgtsvx.c sgttrf.c sgttrs.c sgtts2.c shgeqz.c + shsein.c shseqr.c slabrd.c slacon.c slacn2.c + slaein.c slaexc.c slag2.c slags2.c slagtm.c slagv2.c slahqr.c + slahr2.c slaic1.c slaln2.c slals0.c slalsa.c slalsd.c + slangb.c slange.c slangt.c slanhs.c slansb.c slansp.c + slansy.c slantb.c slantp.c slantr.c slanv2.c + slapll.c slapmt.c + slaqgb.c slaqge.c slaqp2.c slaqps.c slaqsb.c slaqsp.c slaqsy.c + slaqr0.c slaqr1.c slaqr2.c slaqr3.c slaqr4.c slaqr5.c + slaqtr.c slar1v.c slar2v.c ilaslr.c ilaslc.c + slarf.c slarfb.c slarfb_gett.c slarfg.c slarfgp.c slarft.c slarfx.c slarfy.c slargv.c + slarrv.c slartv.c + slarz.c slarzb.c slarzt.c slasy2.c + slasyf.c slasyf_rook.c slasyf_rk.c slasyf_aa.c + slatbs.c slatdf.c slatps.c slatrd.c slatrs.c slatrz.c + sopgtr.c sopmtr.c sorg2l.c sorg2r.c + sorgbr.c sorghr.c sorgl2.c sorglq.c sorgql.c sorgqr.c sorgr2.c + sorgrq.c sorgtr.c sorm2l.c sorm2r.c sorm22.c + sormbr.c sormhr.c sorml2.c sormlq.c sormql.c sormqr.c sormr2.c + sormr3.c sormrq.c sormrz.c sormtr.c spbcon.c spbequ.c spbrfs.c + spbstf.c spbsv.c spbsvx.c + spbtf2.c spbtrf.c spbtrs.c spocon.c spoequ.c sporfs.c sposv.c + sposvx.c spotrf2.c spotri.c spstrf.c spstf2.c + sppcon.c sppequ.c + spprfs.c sppsv.c sppsvx.c spptrf.c spptri.c spptrs.c sptcon.c + spteqr.c sptrfs.c sptsv.c sptsvx.c spttrs.c sptts2.c srscl.c + ssbev.c ssbevd.c ssbevx.c ssbgst.c ssbgv.c ssbgvd.c ssbgvx.c + ssbtrd.c sspcon.c sspev.c sspevd.c sspevx.c sspgst.c + sspgv.c sspgvd.c 
sspgvx.c ssprfs.c sspsv.c sspsvx.c ssptrd.c + ssptrf.c ssptri.c ssptrs.c sstegr.c sstev.c sstevd.c sstevr.c + ssycon.c ssyev.c ssyevd.c ssyevr.c ssyevx.c ssygs2.c + ssygst.c ssygv.c ssygvd.c ssygvx.c ssyrfs.c ssysv.c ssysvx.c + ssytd2.c ssytf2.c ssytrd.c ssytrf.c ssytri.c ssytri2.c ssytri2x.c + ssyswapr.c ssytrs.c ssytrs2.c + ssyconv.c ssyconvf.c ssyconvf_rook.c + ssysv_aa.c ssysv_aa_2stage.c ssytrf_aa.c ssytrf_aa_2stage.c ssytrs_aa.c ssytrs_aa_2stage.c + ssytf2_rook.c ssytrf_rook.c ssytrs_rook.c + ssytri_rook.c ssycon_rook.c ssysv_rook.c + ssytf2_rk.c ssytrf_rk.c ssytrs_3.c + ssytri_3.c ssytri_3x.c ssycon_3.c ssysv_rk.c + ssysv_aa.c ssytrf_aa.c ssytrs_aa.c + stbcon.c + stbrfs.c stbtrs.c stgevc.c stgex2.c stgexc.c stgsen.c + stgsja.c stgsna.c stgsy2.c stgsyl.c stpcon.c stprfs.c stptri.c + stptrs.c + strcon.c strevc.c strevc3.c strexc.c strrfs.c strsen.c strsna.c strsyl.c + strtrs.c stzrzf.c sstemr.c + slansf.c spftrf.c spftri.c spftrs.c ssfrk.c stfsm.c stftri.c stfttp.c + stfttr.c stpttf.c stpttr.c strttf.c strttp.c + sgejsv.c sgesvj.c sgsvj0.c sgsvj1.c + sgeequb.c ssyequb.c spoequb.c sgbequb.c + sbbcsd.c slapmr.c sorbdb.c sorbdb1.c sorbdb2.c sorbdb3.c sorbdb4.c + sorbdb5.c sorbdb6.c sorcsd.c sorcsd2by1.c + sgeqrt.c sgeqrt2.c sgeqrt3.c sgemqrt.c + stpqrt.c stpqrt2.c stpmqrt.c stprfb.c + sgelqt.c sgelqt3.c sgemlqt.c + sgetsls.c sgetsqrhrt.c sgeqr.c slatsqr.c slamtsqr.c sgemqr.c + sgelq.c slaswlq.c slamswlq.c sgemlq.c + stplqt.c stplqt2.c stpmlqt.c + ssytrd_2stage.c ssytrd_sy2sb.c ssytrd_sb2st.c ssb2st_kernels.c + ssyevd_2stage.c ssyev_2stage.c ssyevx_2stage.c ssyevr_2stage.c + ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c + sgesvdq.c slaorhr_col_getrfnp.c + slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c + slatrs3.c strsyl3.c sgelst.c) + +set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c + sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c + sla_syrfsx_extended.c sla_syamv.c sla_syrcond.c sla_syrpvgrw.c + sposvxx.c 
sporfsx.c sla_porfsx_extended.c sla_porcond.c + sla_porpvgrw.c sgbsvxx.c sgbrfsx.c sla_gbrfsx_extended.c + sla_gbamv.c sla_gbrcond.c sla_gbrpvgrw.c sla_lin_berr.c slarscl2.c + slascl2.c sla_wwaddw.c) + +set(CLASRC + cbdsqr.c cgbbrd.c cgbcon.c cgbequ.c cgbrfs.c cgbsv.c cgbsvx.c + cgbtf2.c cgbtrf.c cgbtrs.c cgebak.c cgebal.c cgebd2.c cgebrd.c + cgecon.c cgeequ.c cgees.c cgeesx.c cgeev.c cgeevx.c + cgehd2.c cgehrd.c cgelq2.c cgelqf.c + cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c + cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c + cgesc2.c cgesdd.c cgesvd.c cgesvdx.c + cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c + cgesvx.c cgetc2.c cgetrf2.c + cgetri.c + cggbak.c cggbal.c + cgges.c cgges3.c cggesx.c cggev.c cggev3.c cggevx.c + cggglm.c cgghrd.c cgghd3.c cgglse.c cggqrf.c cggrqf.c + cggsvd3.c cggsvp3.c + cgtcon.c cgtrfs.c cgtsv.c cgtsvx.c cgttrf.c cgttrs.c cgtts2.c chbev.c + chbevd.c chbevx.c chbgst.c chbgv.c chbgvd.c chbgvx.c chbtrd.c + checon.c cheev.c cheevd.c cheevr.c cheevx.c chegs2.c chegst.c + chegv.c chegvd.c chegvx.c cherfs.c chesv.c chesvx.c chetd2.c + chetf2.c chetrd.c + chetrf.c chetri.c chetri2.c chetri2x.c cheswapr.c + chetrs.c chetrs2.c + chetf2_rook.c chetrf_rook.c chetri_rook.c + chetrs_rook.c checon_rook.c chesv_rook.c + chetf2_rk.c chetrf_rk.c chetri_3.c chetri_3x.c + chetrs_3.c checon_3.c chesv_rk.c + chesv_aa.c chesv_aa_2stage.c chetrf_aa.c chetrf_aa_2stage.c chetrs_aa.c chetrs_aa_2stage.c + chgeqz.c chpcon.c chpev.c chpevd.c + chpevx.c chpgst.c chpgv.c chpgvd.c chpgvx.c chprfs.c chpsv.c + chpsvx.c + chptrd.c chptrf.c chptri.c chptrs.c chsein.c chseqr.c clabrd.c + clacgv.c clacon.c clacn2.c clacp2.c clacpy.c clacrm.c clacrt.c cladiv.c + claed0.c claed7.c claed8.c + claein.c claesy.c claev2.c clags2.c clagtm.c + clahef.c clahef_rook.c clahef_rk.c clahef_aa.c clahqr.c + clahr2.c claic1.c clals0.c clalsa.c clalsd.c clangb.c clange.c clangt.c + clanhb.c clanhe.c + clanhp.c clanhs.c clanht.c clansb.c clansp.c clansy.c clantb.c 
+ clantp.c clantr.c clapll.c clapmt.c clarcm.c claqgb.c claqge.c + claqhb.c claqhe.c claqhp.c claqp2.c claqps.c claqsb.c + claqr0.c claqr1.c claqr2.c claqr3.c claqr4.c claqr5.c + claqsp.c claqsy.c clar1v.c clar2v.c ilaclr.c ilaclc.c + clarf.c clarfb.c clarfb_gett.c clarfg.c clarfgp.c clarft.c + clarfx.c clarfy.c clargv.c clarnv.c clarrv.c clartg.c clartv.c + clarz.c clarzb.c clarzt.c clascl.c claset.c clasr.c classq.c + clasyf.c clasyf_rook.c clasyf_rk.c clasyf_aa.c + clatbs.c clatdf.c clatps.c clatrd.c clatrs.c clatrz.c + cpbcon.c cpbequ.c cpbrfs.c cpbstf.c cpbsv.c + cpbsvx.c cpbtf2.c cpbtrf.c cpbtrs.c cpocon.c cpoequ.c cporfs.c + cposv.c cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c + cppcon.c cppequ.c cpprfs.c cppsv.c cppsvx.c cpptrf.c cpptri.c cpptrs.c + cptcon.c cpteqr.c cptrfs.c cptsv.c cptsvx.c cpttrf.c cpttrs.c cptts2.c + crot.c cspcon.c csprfs.c cspsv.c + cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c + cstegr.c cstein.c csteqr.c csycon.c + csyrfs.c csysv.c csysvx.c csytf2.c csytrf.c csytri.c + csytri2.c csytri2x.c csyswapr.c + csytrs.c csytrs2.c + csyconv.c csyconvf.c csyconvf_rook.c + csytf2_rook.c csytrf_rook.c csytrs_rook.c + csytri_rook.c csycon_rook.c csysv_rook.c + csytf2_rk.c csytrf_rk.c csytrf_aa.c csytrf_aa_2stage.c csytrs_3.c csytrs_aa.c csytrs_aa_2stage.c + csytri_3.c csytri_3x.c csycon_3.c csysv_rk.c csysv_aa.c csysv_aa_2stage.c + ctbcon.c ctbrfs.c ctbtrs.c ctgevc.c ctgex2.c + ctgexc.c ctgsen.c ctgsja.c ctgsna.c ctgsy2.c ctgsyl.c ctpcon.c + ctprfs.c ctptri.c + ctptrs.c ctrcon.c ctrevc.c ctrevc3.c ctrexc.c ctrrfs.c ctrsen.c ctrsna.c + ctrsyl.c ctrtrs.c ctzrzf.c cung2l.c cung2r.c + cungbr.c cunghr.c cungl2.c cunglq.c cungql.c cungqr.c cungr2.c + cungrq.c cungtr.c cunm2l.c cunm2r.c cunmbr.c cunmhr.c cunml2.c cunm22.c + cunmlq.c cunmql.c cunmqr.c cunmr2.c cunmr3.c cunmrq.c cunmrz.c + cunmtr.c cupgtr.c cupmtr.c icmax1.c scsum1.c cstemr.c + chfrk.c ctfttp.c clanhf.c cpftrf.c cpftri.c cpftrs.c ctfsm.c ctftri.c + ctfttr.c ctpttf.c ctpttr.c 
ctrttf.c ctrttp.c + cgeequb.c cgbequb.c csyequb.c cpoequb.c cheequb.c + cbbcsd.c clapmr.c cunbdb.c cunbdb1.c cunbdb2.c cunbdb3.c cunbdb4.c + cunbdb5.c cunbdb6.c cuncsd.c cuncsd2by1.c + cgeqrt.c cgeqrt2.c cgeqrt3.c cgemqrt.c + ctpqrt.c ctpqrt2.c ctpmqrt.c ctprfb.c + cgelqt.c cgelqt3.c cgemlqt.c + cgetsls.c cgetsqrhrt.c cgeqr.c clatsqr.c clamtsqr.c cgemqr.c + cgelq.c claswlq.c clamswlq.c cgemlq.c + ctplqt.c ctplqt2.c ctpmlqt.c + chetrd_2stage.c chetrd_he2hb.c chetrd_hb2st.c chb2st_kernels.c + cheevd_2stage.c cheev_2stage.c cheevx_2stage.c cheevr_2stage.c + chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c + cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c + cungtsqr.c cungtsqr_row.c cunhr_col.c + clatrs3.c ctrsyl3.c cgelst.c) + +set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c + cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c + csysvxx.c csyrfsx.c cla_syrfsx_extended.c cla_syamv.c + cla_syrcond_c.c cla_syrcond_x.c cla_syrpvgrw.c + cposvxx.c cporfsx.c cla_porfsx_extended.c + cla_porcond_c.c cla_porcond_x.c cla_porpvgrw.c + cgbsvxx.c cgbrfsx.c cla_gbrfsx_extended.c cla_gbamv.c + cla_gbrcond_c.c cla_gbrcond_x.c cla_gbrpvgrw.c + chesvxx.c cherfsx.c cla_herfsx_extended.c cla_heamv.c + cla_hercond_c.c cla_hercond_x.c cla_herpvgrw.c + cla_lin_berr.c clarscl2.c clascl2.c cla_wwaddw.c) + +set(DLASRC + dgbbrd.c dgbcon.c dgbequ.c dgbrfs.c dgbsv.c + dgbsvx.c dgbtf2.c dgbtrf.c dgbtrs.c dgebak.c dgebal.c dgebd2.c + dgebrd.c dgecon.c dgeequ.c dgees.c dgeesx.c dgeev.c dgeevx.c + dgehd2.c dgehrd.c dgelq2.c dgelqf.c + dgels.c dgelsd.c dgelss.c dgelsy.c dgeql2.c dgeqlf.c + dgeqp3.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c + dgesc2.c dgesdd.c dgesvd.c dgesvdx.c dgesvx.c dgetc2.c + dgetrf2.c dgetri.c + dggbak.c dggbal.c + dgges.c dgges3.c dggesx.c dggev.c dggev3.c dggevx.c + dggglm.c dgghrd.c dgghd3.c dgglse.c dggqrf.c + dggrqf.c dggsvd3.c dggsvp3.c dgtcon.c dgtrfs.c dgtsv.c + dgtsvx.c dgttrf.c dgttrs.c dgtts2.c dhgeqz.c + 
dhsein.c dhseqr.c dlabrd.c dlacon.c dlacn2.c + dlaein.c dlaexc.c dlag2.c dlags2.c dlagtm.c dlagv2.c dlahqr.c + dlahr2.c dlaic1.c dlaln2.c dlals0.c dlalsa.c dlalsd.c + dlangb.c dlange.c dlangt.c dlanhs.c dlansb.c dlansp.c + dlansy.c dlantb.c dlantp.c dlantr.c dlanv2.c + dlapll.c dlapmt.c + dlaqgb.c dlaqge.c dlaqp2.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c + dlaqr0.c dlaqr1.c dlaqr2.c dlaqr3.c dlaqr4.c dlaqr5.c + dlaqtr.c dlar1v.c dlar2v.c iladlr.c iladlc.c + dlarf.c dlarfb.c dlarfb_gett.c dlarfg.c dlarfgp.c dlarft.c dlarfx.c dlarfy.c + dlargv.c dlarrv.c dlartv.c + dlarz.c dlarzb.c dlarzt.c dlasy2.c + dlasyf.c dlasyf_rook.c dlasyf_rk.c dlasyf_aa.c + dlatbs.c dlatdf.c dlatps.c dlatrd.c dlatrs.c dlatrz.c + dopgtr.c dopmtr.c dorg2l.c dorg2r.c + dorgbr.c dorghr.c dorgl2.c dorglq.c dorgql.c dorgqr.c dorgr2.c + dorgrq.c dorgtr.c dorm2l.c dorm2r.c dorm22.c + dormbr.c dormhr.c dorml2.c dormlq.c dormql.c dormqr.c dormr2.c + dormr3.c dormrq.c dormrz.c dormtr.c dpbcon.c dpbequ.c dpbrfs.c + dpbstf.c dpbsv.c dpbsvx.c + dpbtf2.c dpbtrf.c dpbtrs.c dpocon.c dpoequ.c dporfs.c dposv.c + dposvx.c dpotrf2.c dpotri.c dpotrs.c dpstrf.c dpstf2.c + dppcon.c dppequ.c + dpprfs.c dppsv.c dppsvx.c dpptrf.c dpptri.c dpptrs.c dptcon.c + dpteqr.c dptrfs.c dptsv.c dptsvx.c dpttrs.c dptts2.c drscl.c + dsbev.c dsbevd.c dsbevx.c dsbgst.c dsbgv.c dsbgvd.c dsbgvx.c + dsbtrd.c dspcon.c dspev.c dspevd.c dspevx.c dspgst.c + dspgv.c dspgvd.c dspgvx.c dsprfs.c dspsv.c dspsvx.c dsptrd.c + dsptrf.c dsptri.c dsptrs.c dstegr.c dstev.c dstevd.c dstevr.c + dsycon.c dsyev.c dsyevd.c dsyevr.c + dsyevx.c dsygs2.c dsygst.c dsygv.c dsygvd.c dsygvx.c dsyrfs.c + dsysv.c dsysvx.c + dsytd2.c dsytf2.c dsytrd.c dsytrf.c dsytri.c dsytrs.c dsytrs2.c + dsytri2.c dsytri2x.c dsyswapr.c + dsyconv.c dsyconvf.c dsyconvf_rook.c + dsytf2_rook.c dsytrf_rook.c dsytrs_rook.c + dsytri_rook.c dsycon_rook.c dsysv_rook.c + dsytf2_rk.c dsytrf_rk.c dsytrs_3.c + dsytri_3.c dsytri_3x.c dsycon_3.c dsysv_rk.c + dsysv_aa.c dsysv_aa_2stage.c dsytrf_aa.c 
dsytrf_aa_2stage.c dsytrs_aa.c dsytrs_aa_2stage.c + dtbcon.c + dtbrfs.c dtbtrs.c dtgevc.c dtgex2.c dtgexc.c dtgsen.c + dtgsja.c dtgsna.c dtgsy2.c dtgsyl.c dtpcon.c dtprfs.c dtptri.c + dtptrs.c + dtrcon.c dtrevc.c dtrevc3.c dtrexc.c dtrrfs.c dtrsen.c dtrsna.c dtrsyl.c + dtrtrs.c dtzrzf.c dstemr.c + dsgesv.c dsposv.c dlag2s.c slag2d.c dlat2s.c + dlansf.c dpftrf.c dpftri.c dpftrs.c dsfrk.c dtfsm.c dtftri.c dtfttp.c + dtfttr.c dtpttf.c dtpttr.c dtrttf.c dtrttp.c + dgejsv.c dgesvj.c dgsvj0.c dgsvj1.c + dgeequb.c dsyequb.c dpoequb.c dgbequb.c + dbbcsd.c dlapmr.c dorbdb.c dorbdb1.c dorbdb2.c dorbdb3.c dorbdb4.c + dorbdb5.c dorbdb6.c dorcsd.c dorcsd2by1.c + dgeqrt.c dgeqrt2.c dgeqrt3.c dgemqrt.c + dtpqrt.c dtpqrt2.c dtpmqrt.c dtprfb.c + dgelqt.c dgelqt3.c dgemlqt.c + dgetsls.c dgetsqrhrt.c dgeqr.c dlatsqr.c dlamtsqr.c dgemqr.c + dgelq.c dlaswlq.c dlamswlq.c dgemlq.c + dtplqt.c dtplqt2.c dtpmlqt.c + dsytrd_2stage.c dsytrd_sy2sb.c dsytrd_sb2st.c dsb2st_kernels.c + dsyevd_2stage.c dsyev_2stage.c dsyevx_2stage.c dsyevr_2stage.c + dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c + dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c + dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c + dlatrs3.c dtrsyl3.c dgelst.c) + +set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c + dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c + dla_syrfsx_extended.c dla_syamv.c dla_syrcond.c dla_syrpvgrw.c + dposvxx.c dporfsx.c dla_porfsx_extended.c dla_porcond.c + dla_porpvgrw.c dgbsvxx.c dgbrfsx.c dla_gbrfsx_extended.c + dla_gbamv.c dla_gbrcond.c dla_gbrpvgrw.c dla_lin_berr.c dlarscl2.c + dlascl2.c dla_wwaddw.c) + +set(ZLASRC + zbdsqr.c zgbbrd.c zgbcon.c zgbequ.c zgbrfs.c zgbsv.c zgbsvx.c + zgbtf2.c zgbtrf.c zgbtrs.c zgebak.c zgebal.c zgebd2.c zgebrd.c + zgecon.c zgeequ.c zgees.c zgeesx.c zgeev.c zgeevx.c + zgehd2.c zgehrd.c zgelq2.c zgelqf.c + zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c + zgeqr2.c zgeqr2p.c zgeqrf.c zgeqrfp.c zgerfs.c zgerq2.c zgerqf.c 
+ zgesc2.c zgesdd.c zgesvd.c zgesvdx.c zgesvx.c + zgesvj.c zgejsv.c zgsvj0.c zgsvj1.c + zgetc2.c zgetrf2.c + zgetri.c + zggbak.c zggbal.c + zgges.c zgges3.c zggesx.c zggev.c zggev3.c zggevx.c + zggglm.c zgghrd.c zgghd3.c zgglse.c zggqrf.c zggrqf.c + zggsvd3.c zggsvp3.c + zgtcon.c zgtrfs.c zgtsv.c zgtsvx.c zgttrf.c zgttrs.c zgtts2.c zhbev.c + zhbevd.c zhbevx.c zhbgst.c zhbgv.c zhbgvd.c zhbgvx.c zhbtrd.c + zhecon.c zheev.c zheevd.c zheevr.c zheevx.c zhegs2.c zhegst.c + zhegv.c zhegvd.c zhegvx.c zherfs.c zhesv.c zhesvx.c zhetd2.c + zhetf2.c zhetrd.c + zhetrf.c zhetri.c zhetri2.c zhetri2x.c zheswapr.c + zhetrs.c zhetrs2.c + zhetf2_rook.c zhetrf_rook.c zhetri_rook.c + zhetrs_rook.c zhecon_rook.c zhesv_rook.c + zhetf2_rk.c zhetrf_rk.c zhetri_3.c zhetri_3x.c + zhetrs_3.c zhecon_3.c zhesv_rk.c + zhesv_aa.c zhesv_aa_2stage.c zhetrf_aa.c zhetrf_aa_2stage.c zhetrs_aa.c zhetrs_aa_2stage.c + zhgeqz.c zhpcon.c zhpev.c zhpevd.c + zhpevx.c zhpgst.c zhpgv.c zhpgvd.c zhpgvx.c zhprfs.c zhpsv.c + zhpsvx.c + zhptrd.c zhptrf.c zhptri.c zhptrs.c zhsein.c zhseqr.c zlabrd.c + zlacgv.c zlacon.c zlacn2.c zlacp2.c zlacpy.c zlacrm.c zlacrt.c zladiv.c + zlaed0.c zlaed7.c zlaed8.c + zlaein.c zlaesy.c zlaev2.c zlags2.c zlagtm.c + zlahef.c zlahef_rook.c zlahef_rk.c zlahef_aa.c zlahqr.c + zlahr2.c zlaic1.c zlals0.c zlalsa.c zlalsd.c zlangb.c zlange.c + zlangt.c zlanhb.c + zlanhe.c + zlanhp.c zlanhs.c zlanht.c zlansb.c zlansp.c zlansy.c zlantb.c + zlantp.c zlantr.c zlapll.c zlapmt.c zlaqgb.c zlaqge.c + zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqps.c zlaqsb.c + zlaqr0.c zlaqr1.c zlaqr2.c zlaqr3.c zlaqr4.c zlaqr5.c + zlaqsp.c zlaqsy.c zlar1v.c zlar2v.c ilazlr.c ilazlc.c + zlarcm.c zlarf.c zlarfb.c zlarfb_gett.c + zlarfg.c zlarfgp.c zlarft.c + zlarfx.c zlarfy.c zlargv.c zlarnv.c zlarrv.c zlartg.c zlartv.c + zlarz.c zlarzb.c zlarzt.c zlascl.c zlaset.c zlasr.c + zlassq.c zlasyf.c zlasyf_rook.c zlasyf_rk.c zlasyf_aa.c + zlatbs.c zlatdf.c zlatps.c zlatrd.c zlatrs.c zlatrz.c + zpbcon.c zpbequ.c zpbrfs.c 
zpbstf.c zpbsv.c + zpbsvx.c zpbtf2.c zpbtrf.c zpbtrs.c zpocon.c zpoequ.c zporfs.c + zposv.c zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c + zppcon.c zppequ.c zpprfs.c zppsv.c zppsvx.c zpptrf.c zpptri.c zpptrs.c + zptcon.c zpteqr.c zptrfs.c zptsv.c zptsvx.c zpttrf.c zpttrs.c zptts2.c + zrot.c zspcon.c zsprfs.c zspsv.c + zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c + zstegr.c zstein.c zsteqr.c zsycon.c + zsyrfs.c zsysv.c zsysvx.c zsytf2.c zsytrf.c zsytri.c + zsytri2.c zsytri2x.c zsyswapr.c + zsytrs.c zsytrs2.c + zsyconv.c zsyconvf.c zsyconvf_rook.c + zsytf2_rook.c zsytrf_rook.c zsytrs_rook.c zsytrs_aa.c zsytrs_aa_2stage.c + zsytri_rook.c zsycon_rook.c zsysv_rook.c + zsytf2_rk.c zsytrf_rk.c zsytrf_aa.c zsytrf_aa_2stage.c zsytrs_3.c + zsytri_3.c zsytri_3x.c zsycon_3.c zsysv_rk.c zsysv_aa.c zsysv_aa_2stage.c + ztbcon.c ztbrfs.c ztbtrs.c ztgevc.c ztgex2.c + ztgexc.c ztgsen.c ztgsja.c ztgsna.c ztgsy2.c ztgsyl.c ztpcon.c + ztprfs.c ztptri.c + ztptrs.c ztrcon.c ztrevc.c ztrevc3.c ztrexc.c ztrrfs.c ztrsen.c ztrsna.c + ztrsyl.c ztrtrs.c ztzrzf.c zung2l.c + zung2r.c zungbr.c zunghr.c zungl2.c zunglq.c zungql.c zungqr.c zungr2.c + zungrq.c zungtr.c zunm2l.c zunm2r.c zunmbr.c zunmhr.c zunml2.c zunm22.c + zunmlq.c zunmql.c zunmqr.c zunmr2.c zunmr3.c zunmrq.c zunmrz.c + zunmtr.c zupgtr.c + zupmtr.c izmax1.c dzsum1.c zstemr.c + zcgesv.c zcposv.c zlag2c.c clag2z.c zlat2c.c + zhfrk.c ztfttp.c zlanhf.c zpftrf.c zpftri.c zpftrs.c ztfsm.c ztftri.c + ztfttr.c ztpttf.c ztpttr.c ztrttf.c ztrttp.c + zgeequb.c zgbequb.c zsyequb.c zpoequb.c zheequb.c + zbbcsd.c zlapmr.c zunbdb.c zunbdb1.c zunbdb2.c zunbdb3.c zunbdb4.c + zunbdb5.c zunbdb6.c zuncsd.c zuncsd2by1.c + zgeqrt.c zgeqrt2.c zgeqrt3.c zgemqrt.c + ztpqrt.c ztpqrt2.c ztpmqrt.c ztprfb.c + ztplqt.c ztplqt2.c ztpmlqt.c + zgelqt.c zgelqt3.c zgemlqt.c + zgetsls.c zgetsqrhrt.c zgeqr.c zlatsqr.c zlamtsqr.c zgemqr.c + zgelq.c zlaswlq.c zlamswlq.c zgemlq.c + zhetrd_2stage.c zhetrd_he2hb.c zhetrd_hb2st.c zhb2st_kernels.c + 
zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c + zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c + zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c + zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c) + +set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c + zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c + zla_syrfsx_extended.c zla_syamv.c zla_syrcond_c.c zla_syrcond_x.c + zla_syrpvgrw.c zposvxx.c zporfsx.c zla_porfsx_extended.c + zla_porcond_c.c zla_porcond_x.c zla_porpvgrw.c zgbsvxx.c zgbrfsx.c + zla_gbrfsx_extended.c zla_gbamv.c zla_gbrcond_c.c zla_gbrcond_x.c + zla_gbrpvgrw.c zhesvxx.c zherfsx.c zla_herfsx_extended.c + zla_heamv.c zla_hercond_c.c zla_hercond_x.c zla_herpvgrw.c + zla_lin_berr.c zlarscl2.c zlascl2.c zla_wwaddw.c) + + +if(USE_XBLAS) + set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) +endif() + +list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c + DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c + DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c) +list(APPEND DLASRC DEPRECATED/dgegs.c DEPRECATED/dgegv.c + DEPRECATED/dgeqpf.c DEPRECATED/dgelsx.c DEPRECATED/dggsvd.c + DEPRECATED/dggsvp.c DEPRECATED/dlahrd.c DEPRECATED/dlatzm.c DEPRECATED/dtzrqf.c) +list(APPEND CLASRC DEPRECATED/cgegs.c DEPRECATED/cgegv.c + DEPRECATED/cgeqpf.c DEPRECATED/cgelsx.c DEPRECATED/cggsvd.c + DEPRECATED/cggsvp.c DEPRECATED/clahrd.c DEPRECATED/clatzm.c DEPRECATED/ctzrqf.c) +list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c + DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c + DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c) +message(STATUS "Building deprecated routines") + +set(DSLASRC spotrs.c) + +set(ZCLASRC cpotrs.c) + +set(SCATGEN slatm1.c slaran.c slarnd.c) + +set(SMATGEN slatms.c slatme.c slatmr.c slatmt.c + slagge.c slagsy.c slakf2.c slarge.c slaror.c slarot.c slatm2.c + slatm3.c 
slatm5.c slatm6.c slatm7.c slahilb.c) + +set(CMATGEN clatms.c clatme.c clatmr.c clatmt.c + clagge.c claghe.c clagsy.c clakf2.c clarge.c claror.c clarot.c + clatm1.c clarnd.c clatm2.c clatm3.c clatm5.c clatm6.c clahilb.c slatm7.c) + +set(DZATGEN dlatm1.c dlaran.c dlarnd.c) + +set(DMATGEN dlatms.c dlatme.c dlatmr.c dlatmt.c + dlagge.c dlagsy.c dlakf2.c dlarge.c dlaror.c dlarot.c dlatm2.c + dlatm3.c dlatm5.c dlatm6.c dlatm7.c dlahilb.c) + +set(ZMATGEN zlatms.c zlatme.c zlatmr.c zlatmt.c + zlagge.c zlaghe.c zlagsy.c zlakf2.c zlarge.c zlaror.c zlarot.c + zlatm1.c zlarnd.c zlatm2.c zlatm3.c zlatm5.c zlatm6.c zlahilb.c dlatm7.c) + +if(BUILD_SINGLE) + set(LA_REL_SRC ${SLASRC} ${DSLASRC} ${ALLAUX} ${SCLAUX}) + set(LA_GEN_SRC ${SMATGEN} ${SCATGEN}) + message(STATUS "Building Single Precision") +endif() +if(BUILD_DOUBLE) + set(LA_REL_SRC ${LA_REL_SRC} ${DLASRC} ${DSLASRC} ${ALLAUX} ${DZLAUX}) + set(LA_GEN_SRC ${LA_GEN_SRC} ${DMATGEN} ${DZATGEN}) + message(STATUS "Building Double Precision") +endif() +if(BUILD_COMPLEX) + set(LA_REL_SRC ${LA_REL_SRC} ${CLASRC} ${ZCLASRC} ${ALLAUX} ${SCLAUX}) + SET(LA_GEN_SRC ${LA_GEN_SRC} ${CMATGEN} ${SCATGEN}) + message(STATUS "Building Single Precision Complex") +endif() +if(BUILD_COMPLEX16) + set(LA_REL_SRC ${LA_REL_SRC} ${ZLASRC} ${ZCLASRC} ${ALLAUX} ${DZLAUX}) + SET(LA_GEN_SRC ${LA_GEN_SRC} ${ZMATGEN} ${DZATGEN}) +# for zlange/zlanhe + if (NOT BUILD_DOUBLE) + set (LA_REL_SRC ${LA_REL_SRC} dcombssq.c) + endif () + message(STATUS "Building Double Precision Complex") +endif() + +endif() + +# add lapack-netlib folder to the sources +set(LA_SOURCES "") +foreach (LA_FILE ${LA_REL_SRC}) + list(APPEND LA_SOURCES "${NETLIB_LAPACK_DIR}/SRC/${LA_FILE}") +endforeach () +foreach (LA_FILE ${LA_GEN_SRC}) + list(APPEND LA_SOURCES "${NETLIB_LAPACK_DIR}/TESTING/MATGEN/${LA_FILE}") +endforeach () + +if (NOT C_LAPACK) + set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") + if (${F_COMPILER} STREQUAL "GFORTRAN") + 
set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS} -fno-tree-vectorize") + endif() +else () + set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") +endif () diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index 340ea6d6cb..3a93521973 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -318,6 +318,8 @@ set(CSRC lapacke_clacn2.c lapacke_clag2z.c lapacke_clag2z_work.c + lapacke_clangb.c + lapacke_clangb_work.c lapacke_clange.c lapacke_clange_work.c lapacke_clanhe.c @@ -803,6 +805,8 @@ set(DSRC lapacke_dlag2s_work.c lapacke_dlamch.c lapacke_dlamch_work.c + lapacke_dlangb.c + lapacke_dlangb_work.c lapacke_dlange.c lapacke_dlange_work.c lapacke_dlansy.c @@ -1381,6 +1385,8 @@ set(SSRC lapacke_slag2d_work.c lapacke_slamch.c lapacke_slamch_work.c + lapacke_slangb.c + lapacke_slangb_work.c lapacke_slange.c lapacke_slange_work.c lapacke_slansy.c @@ -2089,6 +2095,8 @@ set(ZSRC lapacke_zlacrm_work.c lapacke_zlag2c.c lapacke_zlag2c_work.c + lapacke_zlangb.c + lapacke_zlangb_work.c lapacke_zlange.c lapacke_zlange_work.c lapacke_zlanhe.c @@ -2481,6 +2489,8 @@ set(Utils_SRC lapacke_ctp_nancheck.c lapacke_dtr_trans.c lapacke_str_trans.c lapacke_ztp_trans.c lapacke_ctp_trans.c lapacke_lsame.c lapacke_xerbla.c lapacke_ztr_nancheck.c lapacke_ctr_nancheck.c lapacke_make_complex_double.c lapacke_z_nancheck.c lapacke_ztr_trans.c + lapacke_ctz_nancheck.c lapacke_ctz_trans.c lapacke_dtz_nancheck.c lapacke_dtz_trans.c + lapacke_stz_nancheck.c lapacke_stz_trans.c lapacke_ztz_nancheck.c lapacke_ztz_trans.c ) set(LAPACKE_REL_SRC "") diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index 0bd49f9968..7e120af866 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -2,7 +2,7 @@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@ libsuffix=@SUFFIX64_UNDERSCORE@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ -openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ 
DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ +openblas_config=USE_64BITINT=@INTERFACE64@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: @OPENBLAS_VERSION@ diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index f32162f04d..a33acbbd47 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -823,6 +823,41 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(CGEMM3M_UNROLL_N 4) set(ZGEMM3M_UNROLL_M 4) set(ZGEMM3M_UNROLL_N 4) + elseif ("${TCORE}" STREQUAL "ARMV5") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t32\n" + "#define L2_SIZE\t512488\n" + "#define L2_LINESIZE\t32\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define L2_ASSOCIATIVE\t4\n") + set(SGEMM_UNROLL_M 2) + set(SGEMM_UNROLL_N 2) + set(DGEMM_UNROLL_M 2) + set(DGEMM_UNROLL_N 2) + set(CGEMM_UNROLL_M 2) + set(CGEMM_UNROLL_N 2) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 2) + elseif ("${TCORE}" STREQUAL "ARMV6") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t32\n" + "#define L2_SIZE\t512488\n" + "#define L2_LINESIZE\t32\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define L2_ASSOCIATIVE\t4\n" + "#define HAVE_VFP\n") + set(SGEMM_UNROLL_M 4) + set(SGEMM_UNROLL_N 2) + set(DGEMM_UNROLL_M 4) + set(DGEMM_UNROLL_N 2) + set(CGEMM_UNROLL_M 2) + set(CGEMM_UNROLL_N 2) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 2) elseif ("${TCORE}" STREQUAL "ARMV7") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE\t65536\n" @@ -886,7 +921,11 @@ else () 
set(SGEMM_UNROLL_M 8) set(SGEMM_UNROLL_N 8) endif () +if ("${TCORE}" STREQUAL "CORTEXA53") + set(DGEMM_UNROLL_M 4) +else () set(DGEMM_UNROLL_M 8) +endif () set(DGEMM_UNROLL_N 4) set(CGEMM_UNROLL_M 8) set(CGEMM_UNROLL_N 4) @@ -1319,16 +1358,25 @@ else(NOT CMAKE_CROSSCOMPILING) set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build") set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") - file(MAKE_DIRECTORY ${GETARCH_DIR}) - configure_file(${TARGET_CONF_TEMP} ${GETARCH_DIR}/${TARGET_CONF} COPYONLY) + file(MAKE_DIRECTORY "${GETARCH_DIR}") + configure_file("${TARGET_CONF_TEMP}" "${GETARCH_DIR}/${TARGET_CONF}" COPYONLY) if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") - try_compile(GETARCH_RESULT ${GETARCH_DIR} - SOURCES ${GETARCH_SRC} - COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I"${GETARCH_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" - OUTPUT_VARIABLE GETARCH_LOG - COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} - ) - + if (CMAKE_ASM_COMPILER_ID STREQUAL "") + try_compile(GETARCH_RESULT "${GETARCH_DIR}" + SOURCES ${GETARCH_SRC} + CMAKE_FLAGS "-DCMAKE_ASM_COMPILER=${CMAKE_C_COMPILER}" + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I"${GETARCH_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" + OUTPUT_VARIABLE GETARCH_LOG + COPY_FILE "${PROJECT_BINARY_DIR}/${GETARCH_BIN}" + ) + else() + try_compile(GETARCH_RESULT "${GETARCH_DIR}" + SOURCES ${GETARCH_SRC} + COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I"${GETARCH_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" + OUTPUT_VARIABLE GETARCH_LOG + COPY_FILE "${PROJECT_BINARY_DIR}/${GETARCH_BIN}" + ) + endif() if (NOT ${GETARCH_RESULT}) MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") endif () @@ -1357,19 +1405,19 @@ execute_process(COMMAND "${PROJECT_BINARY_DIR}/${GETARCH_BIN}" 1 OUTPUT_VARIABLE message(STATUS "GETARCH results:\n${GETARCH_MAKE_OUT}") # append config data from getarch to the TARGET file and read in CMake vars - file(APPEND ${TARGET_CONF_TEMP} 
${GETARCH_CONF_OUT}) + file(APPEND "${TARGET_CONF_TEMP}" ${GETARCH_CONF_OUT}) ParseGetArchVars(${GETARCH_MAKE_OUT}) set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build") set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") - file(MAKE_DIRECTORY ${GETARCH2_DIR}) - configure_file(${TARGET_CONF_TEMP} ${GETARCH2_DIR}/${TARGET_CONF} COPYONLY) + file(MAKE_DIRECTORY "${GETARCH2_DIR}") + configure_file("${TARGET_CONF_TEMP}" "${GETARCH2_DIR}/${TARGET_CONF}" COPYONLY) if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") - try_compile(GETARCH2_RESULT ${GETARCH2_DIR} - SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c + try_compile(GETARCH2_RESULT "${GETARCH2_DIR}" + SOURCES "${PROJECT_SOURCE_DIR}/getarch_2nd.c" COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I"${GETARCH2_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" OUTPUT_VARIABLE GETARCH2_LOG - COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} + COPY_FILE "${PROJECT_BINARY_DIR}/${GETARCH2_BIN}" ) if (NOT ${GETARCH2_RESULT}) @@ -1382,9 +1430,9 @@ execute_process(COMMAND "${PROJECT_BINARY_DIR}/${GETARCH2_BIN}" 0 OUTPUT_VARIABL execute_process(COMMAND "${PROJECT_BINARY_DIR}/${GETARCH2_BIN}" 1 OUTPUT_VARIABLE GETARCH2_CONF_OUT) # append config data from getarch_2nd to the TARGET file and read in CMake vars - file(APPEND ${TARGET_CONF_TEMP} ${GETARCH2_CONF_OUT}) + file(APPEND "${TARGET_CONF_TEMP}" ${GETARCH2_CONF_OUT}) - configure_file(${TARGET_CONF_TEMP} ${TARGET_CONF_DIR}/${TARGET_CONF} COPYONLY) + configure_file("${TARGET_CONF_TEMP}" "${TARGET_CONF_DIR}/${TARGET_CONF}" COPYONLY) ParseGetArchVars(${GETARCH2_MAKE_OUT}) diff --git a/cmake/system.cmake b/cmake/system.cmake index a9fc0f4b7f..631e7fe69f 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -172,9 +172,9 @@ if (DEFINED TARGET) endif() elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 8.99) - set (KERNEL_DEFINITIONS 
"${KERNEL_DEFINITIONS} -march=cooperlake") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake -exhaustive-register-search") else() - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512 -exhaustive-register-search") endif() endif() endif() @@ -188,23 +188,45 @@ if (DEFINED TARGET) endif() elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 12.0) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids -exhaustive-register-search") else() - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512 -exhaustive-register-search") endif() endif() endif() if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -exhaustive-register-search") + endif() + endif() + + if (((${TARGET} STREQUAL ZEN) AND HAVE_AVX512VL) AND NOT NO_AVX512) + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 12.99) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=znver4") + else() + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 15.99) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=znver4") + else() + set (KERNEL_DEFINITIONS 
"${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -exhaustive-register-search") + endif() endif() - if (${TARGET} STREQUAL HASWELL AND NOT NO_AVX2) + + if ((${TARGET} STREQUAL HASWELL OR (${TARGET} STREQUAL ZEN AND NOT HAVE_AVX512VL)) AND NOT NO_AVX2) if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2 -mfma") endif() endif() if (DEFINED HAVE_AVX) diff --git a/common.h b/common.h index 00d1d0baf6..4eeeb8d552 100644 --- a/common.h +++ b/common.h @@ -90,7 +90,7 @@ extern "C" { #endif #include -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_QNX) #include #include #endif @@ -107,7 +107,7 @@ extern "C" { #endif #endif -#ifdef OS_HAIKU +#if defined(OS_HAIKU) || defined(OS_QNX) #define NO_SYSV_IPC #endif @@ -387,6 +387,10 @@ typedef int blasint; #endif */ +#ifdef __EMSCRIPTEN__ +#define YIELDING +#endif + #ifndef YIELDING #define YIELDING sched_yield() #endif diff --git a/common_alpha.h b/common_alpha.h index f1ea8ff94e..021eb93ae2 100644 --- a/common_alpha.h +++ b/common_alpha.h @@ -43,7 +43,7 @@ #define MB asm("mb") #define WMB asm("wmb") -#define RMB asm("rmb") +#define RMB asm("mb") static void __inline blas_lock(unsigned long *address){ #ifndef __DECC diff --git a/common_macro.h b/common_macro.h index d2fa822c2e..3226d0f11b 100644 --- a/common_macro.h +++ b/common_macro.h @@ -2612,7 +2612,7 @@ #ifndef ASSEMBLER #if !defined(DYNAMIC_ARCH) \ && (defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) \ - || defined(ARCH_LOONGARCH64) || defined(ARCH_E2K)) + || 
defined(ARCH_LOONGARCH64) || defined(ARCH_E2K) || defined(ARCH_ALPHA)) extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_b; extern BLASLONG sbgemm_p; diff --git a/common_mips64.h b/common_mips64.h index 287459e7d4..006cf33e41 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -86,7 +86,9 @@ static inline unsigned int rpcc(void){ //__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory"); //ret=tmp; __asm__ __volatile__(".set push \n" +#if !defined(__mips_isa_rev) || __mips_isa_rev < 2 ".set mips32r2\n" +#endif "rdhwr %0, $2\n" ".set pop": "=r"(ret):: "memory"); @@ -99,7 +101,9 @@ static inline unsigned int rpcc(void){ static inline int WhereAmI(void){ int ret=0; __asm__ __volatile__(".set push \n" +#if !defined(__mips_isa_rev) || __mips_isa_rev < 2 ".set mips32r2\n" +#endif "rdhwr %0, $0\n" ".set pop": "=r"(ret):: "memory"); return ret; @@ -197,9 +201,15 @@ static inline int blas_quickdivide(blasint x, blasint y){ #if defined(ASSEMBLER) && !defined(NEEDPARAM) +#if defined(__mips_isa_rev) && __mips_isa_rev >= 6 +#define ASSEMBLER_ARCH mips64r6 +#else +#define ASSEMBLER_ARCH mips64 +#endif + #define PROLOGUE \ .text ;\ - .set mips64 ;\ + .set ASSEMBLER_ARCH ;\ .align 5 ;\ .globl REALNAME ;\ .ent REALNAME ;\ diff --git a/common_param.h b/common_param.h index 31fba9059e..1854570f14 100644 --- a/common_param.h +++ b/common_param.h @@ -47,9 +47,10 @@ typedef struct { int dtb_entries; int offsetA, offsetB, align; -#ifdef BUILD_BFLOAT16 +#if BUILD_BFLOAT16 == 1 int sbgemm_p, sbgemm_q, sbgemm_r; int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; + int sbgemm_align_k; void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); @@ -160,51 +161,59 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); #endif #endif -#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE == 1) || (BUILD_DOUBLE == 1) || (BUILD_COMPLEX == 1) || (BUILD_COMPLEX16 == 1) int 
sgemm_p, sgemm_q, sgemm_r; int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; #endif int exclusive_cache; -#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE == 1) || (BUILD_COMPLEX == 1) float (*samax_k) (BLASLONG, float *, BLASLONG); float (*samin_k) (BLASLONG, float *, BLASLONG); float (*smax_k) (BLASLONG, float *, BLASLONG); float (*smin_k) (BLASLONG, float *, BLASLONG); +#endif +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE ==1) || (BUILD_COMPLEX==1) BLASLONG (*isamax_k)(BLASLONG, float *, BLASLONG); +#endif +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) BLASLONG (*isamin_k)(BLASLONG, float *, BLASLONG); BLASLONG (*ismax_k) (BLASLONG, float *, BLASLONG); BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); float (*snrm2_k) (BLASLONG, float *, BLASLONG); float (*sasum_k) (BLASLONG, float *, BLASLONG); #endif -#ifdef BUILD_SINGLE +#if (BUILD_SINGLE==1) float (*ssum_k) (BLASLONG, float *, BLASLONG); #endif -#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); //double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); - +#endif +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +#endif +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1) int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +#endif +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, 
BLASLONG, float *, BLASLONG, float *); int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); #endif -#ifdef BUILD_SINGLE +#if (BUILD_SINGLE==1) int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); #endif -#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) #ifdef ARCH_X86_64 void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); @@ -219,7 +228,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); #endif -#ifdef BUILD_SINGLE +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) #ifdef SMALL_MATRIX_OPT int (*sgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); @@ -255,7 +264,8 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - +#endif +#if (BUILD_SINGLE==1) int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ 
-287,12 +297,12 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); #endif -#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) int dgemm_p, dgemm_q, dgemm_r; int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn; #endif -#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) double (*damax_k) (BLASLONG, double *, BLASLONG); double (*damin_k) (BLASLONG, double *, BLASLONG); double (*dmax_k) (BLASLONG, double *, BLASLONG); @@ -301,23 +311,21 @@ BLASLONG (*idamax_k)(BLASLONG, double *, BLASLONG); BLASLONG (*idamin_k)(BLASLONG, double *, BLASLONG); BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG); BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); -#endif -#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) double (*dnrm2_k) (BLASLONG, double *, BLASLONG); double (*dasum_k) (BLASLONG, double *, BLASLONG); #endif -#ifdef BUILD_DOUBLE +#if (BUILD_DOUBLE==1) double (*dsum_k) (BLASLONG, double *, BLASLONG); #endif -#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_DOUBLE) +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); #endif -#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); @@ -325,13 +333,13 @@ BLASLONG (*idmin_k) (BLASLONG, 
double *, BLASLONG); int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); #endif -#ifdef BUILD_DOUBLE +#if (BUILD_DOUBLE==1) int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); #endif -#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); @@ -340,7 +348,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); #endif -#ifdef BUILD_DOUBLE +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) #ifdef SMALL_MATRIX_OPT int (*dgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta); @@ -354,6 +362,8 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); int (*dgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); #endif +#endif +#if (BUILD_DOUBLE==1) int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int 
(*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); @@ -500,23 +510,25 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); #endif -#ifdef BUILD_COMPLEX +#if (BUILD_COMPLEX==1) int cgemm_p, cgemm_q, cgemm_r; int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn; - + float (*camax_k) (BLASLONG, float *, BLASLONG); float (*camin_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG); + +BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG); BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); float (*cnrm2_k) (BLASLONG, float *, BLASLONG); float (*casum_k) (BLASLONG, float *, BLASLONG); float (*csum_k) (BLASLONG, float *, BLASLONG); + int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); + int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -710,7 +722,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); #endif -#ifdef BUILD_COMPLEX16 +#if (BUILD_COMPLEX16 == 1) int zgemm_p, zgemm_q, zgemm_r; int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn; @@ -1092,34 +1104,34 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); void (*init)(void); int snum_opt, 
dnum_opt, qnum_opt; -#ifdef BUILD_SINGLE +#if (BUILD_SINGLE==1) int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG); #endif -#ifdef BUILD_DOUBLE +#if (BUILD_DOUBLE==1) int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG); #endif -#ifdef BUILD_COMPLEX +#if (BUILD_COMPLEX==1) int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, float*, BLASLONG); #endif -#ifdef BUILD_COMPLEX16 +#if (BUILD_COMPLEX16==1) int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, BLASLONG); #endif -#ifdef BUILD_SINGLE +#if (BUILD_SINGLE==1) int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); #endif -#ifdef BUILD_DOUBLE +#if (BUILD_DOUBLE==1) int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); #endif -#ifdef BUILD_COMPLEX +#if (BUILD_COMPLEX==1) int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); @@ -1131,7 +1143,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); #endif -#ifdef BUILD_COMPLEX16 +#if (BUILD_COMPLEX16==1) int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, 
double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); @@ -1143,21 +1155,21 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); #endif -#ifdef BUILD_SINGLE +#if (BUILD_SINGLE==1) int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); #endif -#ifdef BUILD_DOUBLE +#if (BUILD_DOUBLE==1) int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); #endif -#ifdef BUILD_COMPLEX +#if (BUILD_COMPLEX==1) int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); @@ -1169,7 +1181,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); #endif -#ifdef BUILD_COMPLEX16 +#if (BUILD_COMPLEX16==1) int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); @@ -1181,16 +1193,16 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); #endif -#ifdef BUILD_SINGLE +#if 
(BUILD_SINGLE==1) int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); #endif -#ifdef BUILD_DOUBLE +#if (BUILD_DOUBLE==1) int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); #endif -#ifdef BUILD_COMPLEX +#if (BUILD_COMPLEX==1) int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); #endif -#ifdef BUILD_COMPLEX16 +#if (BUILD_COMPLEX16==1) int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); #endif } gotoblas_t; @@ -1206,7 +1218,7 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 gotoblas -> exclusive_cache -#ifdef BUILD_BFLOAT16 +#if (BUILD_BFLOAT16==1) #define SBGEMM_P gotoblas -> sbgemm_p #define SBGEMM_Q gotoblas -> sbgemm_q #define SBGEMM_R gotoblas -> sbgemm_r @@ -1215,7 +1227,7 @@ extern gotoblas_t *gotoblas; #define SBGEMM_UNROLL_MN gotoblas -> sbgemm_unroll_mn #endif -#if defined (BUILD_SINGLE) +#if (BUILD_SINGLE==1) #define SGEMM_P gotoblas -> sgemm_p #define SGEMM_Q gotoblas -> sgemm_q #define SGEMM_R gotoblas -> sgemm_r @@ -1224,13 +1236,21 @@ extern gotoblas_t *gotoblas; #define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn #endif -#if defined (BUILD_DOUBLE) +#if (BUILD_DOUBLE==1) #define DGEMM_P gotoblas -> dgemm_p #define DGEMM_Q gotoblas -> dgemm_q #define DGEMM_R gotoblas -> dgemm_r #define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m #define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n #define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn +#if (BUILD_SINGLE != 1) +#define SGEMM_P gotoblas -> sgemm_p +#define SGEMM_Q gotoblas -> sgemm_q +#define SGEMM_R 1024 +#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m +#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n +#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn +#endif #endif #define QGEMM_P gotoblas -> qgemm_p @@ -1240,14 +1260,14 @@ extern gotoblas_t *gotoblas; #define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n #define 
QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn -#ifdef BUILD_COMPLEX +#if (BUILD_COMPLEX==1) #define CGEMM_P gotoblas -> cgemm_p #define CGEMM_Q gotoblas -> cgemm_q #define CGEMM_R gotoblas -> cgemm_r #define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m #define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n #define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn -#ifndef BUILD_SINGLE +#if (BUILD_SINGLE != 1) #define SGEMM_P gotoblas -> sgemm_p #define SGEMM_Q gotoblas -> sgemm_q #define SGEMM_R 1024 @@ -1257,14 +1277,14 @@ extern gotoblas_t *gotoblas; #endif #endif -#ifdef BUILD_COMPLEX16 +#if (BUILD_COMPLEX16==1) #define ZGEMM_P gotoblas -> zgemm_p #define ZGEMM_Q gotoblas -> zgemm_q #define ZGEMM_R gotoblas -> zgemm_r #define ZGEMM_UNROLL_M gotoblas -> zgemm_unroll_m #define ZGEMM_UNROLL_N gotoblas -> zgemm_unroll_n #define ZGEMM_UNROLL_MN gotoblas -> zgemm_unroll_mn -#ifndef BUILD_DOUBLE +#if (BUILD_DOUBLE != 1) #define DGEMM_P gotoblas -> dgemm_p #define DGEMM_Q gotoblas -> dgemm_q #define DGEMM_R 1024 @@ -1272,6 +1292,14 @@ extern gotoblas_t *gotoblas; #define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n #define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn #endif +#if (BUILD_COMPLEX != 1) +#define CGEMM_P gotoblas -> cgemm_p +#define CGEMM_Q gotoblas -> cgemm_q +#define CGEMM_R gotoblas -> cgemm_r +#define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m +#define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n +#define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn +#endif #endif #define XGEMM_P gotoblas -> xgemm_p @@ -1318,7 +1346,7 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 0 #endif -#ifdef BUILD_BFLOAT16 +#if (BUILD_BFLOAT16 == 1) #define SBGEMM_P SBGEMM_DEFAULT_P #define SBGEMM_Q SBGEMM_DEFAULT_Q #define SBGEMM_R SBGEMM_DEFAULT_R diff --git a/common_thread.h b/common_thread.h index a18df0d78b..05e1d5489d 100644 --- a/common_thread.h +++ b/common_thread.h @@ -53,6 +53,7 @@ extern void goto_set_num_threads(int nthreads); /* Global Parameter */ extern int blas_cpu_number; extern int 
blas_num_threads; +extern int blas_num_threads_set; extern int blas_omp_linked; #define BLAS_LEGACY 0x8000U @@ -139,7 +140,11 @@ extern int blas_server_avail; static __inline int num_cpu_avail(int level) { #ifdef USE_OPENMP - int openmp_nthreads=omp_get_max_threads(); +int openmp_nthreads; + if (blas_num_threads_set == 0) + openmp_nthreads=omp_get_max_threads(); + else + openmp_nthreads=blas_cpu_number; #endif #ifndef USE_OPENMP diff --git a/cpuid_alpha.c b/cpuid_alpha.c index 58dccdefc8..e0e019af21 100644 --- a/cpuid_alpha.c +++ b/cpuid_alpha.c @@ -59,6 +59,11 @@ void get_subarchitecture(void){ printf("ev%d", implver() + 4); } + +void get_corename(void){ + printf("EV%d", implver() + 4); +} + void get_subdirname(void){ printf("alpha"); } diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 89ec186325..1080ea9740 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -202,10 +202,14 @@ int detect(void) return CPU_CORTEXA510; else if (strstr(cpu_part, "0xd47")) return CPU_CORTEXA710; + else if (strstr(cpu_part, "0xd4d")) //A715 + return CPU_CORTEXA710; else if (strstr(cpu_part, "0xd44")) return CPU_CORTEXX1; else if (strstr(cpu_part, "0xd4c")) return CPU_CORTEXX2; + else if (strstr(cpu_part, "0xd4e")) //X3 + return CPU_CORTEXX2; } // Qualcomm else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00")) diff --git a/cpuid_mips.c b/cpuid_mips.c index d787e7120d..77567a2e5f 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -165,7 +165,9 @@ void get_cpuconfig(void){ }else{ printf("#define UNKNOWN\n"); } - if (!get_feature("msa")) printf("#define NO_MSA\n"); +#ifndef NO_MSA + if (get_feature("msa")) printf("#define HAVE_MSA\n"); +#endif } void get_libname(void){ diff --git a/cpuid_mips64.c b/cpuid_mips64.c index 8753ee3f09..8895cb1578 100644 --- a/cpuid_mips64.c +++ b/cpuid_mips64.c @@ -70,16 +70,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ -#define CPU_UNKNOWN 0 -#define CPU_SICORTEX 1 -#define CPU_LOONGSON3R3 2 -#define CPU_LOONGSON3R4 3 -#define CPU_I6400 4 -#define CPU_P6600 5 -#define CPU_I6500 6 +#define CPU_UNKNOWN 0 +#define CPU_MIPS64_GENERIC 1 +#define CPU_SICORTEX 2 +#define CPU_LOONGSON3R3 3 +#define CPU_LOONGSON3R4 4 +#define CPU_I6400 5 +#define CPU_P6600 6 +#define CPU_I6500 7 static char *cpuname[] = { "UNKNOWN", + "MIPS64_GENERIC" "SICORTEX", "LOONGSON3R3", "LOONGSON3R4", @@ -113,8 +115,11 @@ int detect(void){ return CPU_SICORTEX; } } + + return CPU_MIPS64_GENERIC; +#else + return CPU_UNKNOWN; #endif - return CPU_UNKNOWN; } char *get_corename(void){ @@ -136,9 +141,11 @@ void get_subarchitecture(void){ printf("P6600"); }else if(detect()==CPU_I6500){ printf("I6500"); - }else{ + }else if(detect()==CPU_SICORTEX){ printf("SICORTEX"); - } + }else{ + printf("MIPS64_GENERIC"); + } } void get_subdirname(void){ @@ -201,7 +208,9 @@ void get_cpuconfig(void){ printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 8\n"); } - if (!get_feature("msa")) printf("#define NO_MSA\n"); +#ifndef NO_MSA + if (get_feature("msa")) printf("#define HAVE_MSA\n"); +#endif } void get_libname(void){ @@ -215,8 +224,8 @@ void get_libname(void){ printf("p6600\n"); }else if(detect()==CPU_I6500) { printf("i6500\n"); - }else{ - printf("mips64\n"); + }else { + printf("mips64_generic\n"); } } diff --git a/cpuid_x86.c b/cpuid_x86.c index 4ac1de0472..4afa931f06 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1544,6 +1544,17 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; + case 11: //family 6 exmodel 11 + switch (model) { + case 7: // Raptor Lake + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; } break; case 0x7: @@ -2334,6 +2345,18 @@ int get_coretype(void){ return CORE_NEHALEM; } + case 11: + switch (model) { + case 7: // Raptor Lake 
+#ifndef NO_AVX2 + if(support_avx2()) + return CORE_HASWELL; +#endif + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; + } case 15: if (model <= 0x2) return CORE_NORTHWOOD; else return CORE_PRESCOTT; diff --git a/ctest.c b/ctest.c index df628b1d42..2ccae8dcc1 100644 --- a/ctest.c +++ b/ctest.c @@ -173,3 +173,8 @@ HAVE_C11 ARCH_E2K #endif +#if defined(__EMSCRIPTEN__) +ARCH_RISCV64 +OS_WINDOWS +#endif + diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index e779fb168d..91338b73b6 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -40,7 +40,7 @@ else() c_${float_char}blas1.c) endif() target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) - if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat1 m) endif() add_test(NAME "x${float_char}cblat1" @@ -65,7 +65,7 @@ else() constant.c) endif() target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) - if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat2 m) endif() add_test(NAME "x${float_char}cblat2" @@ -90,7 +90,7 @@ else() constant.c) endif() target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) - if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat3 m) endif() add_test(NAME "x${float_char}cblat3" diff --git a/ctest/Makefile b/ctest/Makefile index 236913c345..0fb2450d28 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -237,7 +237,7 @@ endif ifeq ($(BUILD_DOUBLE),1) # Double 
real -ifeq ($(NOFORTRAN),0) +ifeq ($(NOFORTRAN), $(filter 0 2,$(NOFORTRAN))) xdcblat1: $(dtestl1o) c_dblat1.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xdcblat1 c_dblat1.o $(dtestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xdcblat2: $(dtestl2o) c_dblat2.o $(TOPDIR)/$(LIBNAME) @@ -256,7 +256,7 @@ endif ifeq ($(BUILD_COMPLEX),1) # Single complex -ifeq ($(NOFORTRAN),0) +ifeq ($(NOFORTRAN), $(filter 0 2,$(NOFORTRAN))) xccblat1: $(ctestl1o) c_cblat1.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat1 c_cblat1.o $(ctestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME) @@ -278,7 +278,7 @@ endif ifeq ($(BUILD_COMPLEX16),1) # Double complex -ifeq ($(NOFORTRAN),0) +ifeq ($(NOFORTRAN), $(filter 0 2,$(NOFORTRAN))) xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat1 c_zblat1.o $(ztestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME) diff --git a/ctest/c_sblat1c.c b/ctest/c_sblat1c.c index 4993d31bb7..57e4707a97 100644 --- a/ctest/c_sblat1c.c +++ b/ctest/c_sblat1c.c @@ -969,7 +969,7 @@ real *sfac; 1.17 }; /* Local variables */ - extern /* Subroutine */ srottest_(); + extern /* Subroutine */ void srottest_(); static integer i__, k, ksize; extern /* Subroutine */ int stest_(), srotmtest_(); static integer ki, kn; diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 4a8e193be2..b7328876b4 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -304,6 +304,15 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M; } + BLASLONG pad_min_l = min_l; +#if defined(HALF) +#if defined(DYNAMIC_ARCH) + pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1); +#else + pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);; +#endif +#endif + /* First, we have to move data A to L2 cache */ min_i = m_to - m_from; l1stride = 1; @@ -350,7 +359,7 @@ int 
CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, START_RPCC(); OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, - sb + min_l * (jjs - js) * COMPSIZE * l1stride); + sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride); STOP_RPCC(outercost); @@ -358,10 +367,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) KERNEL_OPERATION(min_i, min_jj, min_l, alpha, - sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); + sa, sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); #else KERNEL_OPERATION(min_i, min_jj, min_l, (void *)&xalpha, - sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); + sa, sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); #endif STOP_RPCC(kernelcost); diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index dfc7107b87..02b60b50d9 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -324,6 +324,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } else { if (min_l > GEMM_Q) min_l = (min_l + 1) / 2; } + + BLASLONG pad_min_l = min_l; + +#if defined(HALF) +#if defined(DYNAMIC_ARCH) + pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1); +#else + pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);; +#endif +#endif /* Determine step size in m * Note: We are currently on the first step in m @@ -382,13 +392,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Copy part of local region of B into workspace */ START_RPCC(); OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, - buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride); + buffer[bufferside] + pad_min_l * (jjs - js) * COMPSIZE * l1stride); STOP_RPCC(copy_B); /* Apply kernel with local region of A and part of local region of B */ START_RPCC(); KERNEL_OPERATION(min_i, min_jj, 
min_l, alpha, - sa, buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride, + sa, buffer[bufferside] + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); STOP_RPCC(kernel); diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 9cfd825ecb..051513f272 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -470,9 +470,13 @@ blas_queue_t *tscq; #endif #ifdef CONSISTENT_FPCSR +#ifdef __aarch64__ + __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode)); +#else __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); #endif +#endif #ifdef MONITOR main_status[cpu] = MAIN_RUNNING1; @@ -746,9 +750,13 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ queue -> position = pos; #ifdef CONSISTENT_FPCSR +#ifdef __aarch64__ + __asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue -> sse_mode)); +#else __asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode)); __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode)); #endif +#endif #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 1a5fd06a32..2e0c0f38c1 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -69,6 +69,8 @@ int blas_server_avail = 0; +extern int openblas_omp_adaptive_env(); + static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; #ifdef HAVE_C11 static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; @@ -98,6 +100,8 @@ static void adjust_thread_buffers() { void goto_set_num_threads(int num_threads) { + blas_num_threads_set = 1; + if (num_threads < 0) blas_num_threads_set = 0; if (num_threads < 1) num_threads = blas_num_threads; if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; @@ -108,8 +112,6 @@ void goto_set_num_threads(int num_threads) { blas_cpu_number = num_threads; - 
omp_set_num_threads(blas_cpu_number); - adjust_thread_buffers(); #if defined(ARCH_MIPS64) //set parameters for different number of threads. @@ -282,8 +284,12 @@ static void exec_threads(blas_queue_t *queue, int buf_index){ sb = queue -> sb; #ifdef CONSISTENT_FPCSR +#ifdef __aarch64__ + __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode)); +#else __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); +#endif #endif if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { @@ -381,8 +387,12 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ #ifdef CONSISTENT_FPCSR for (i = 0; i < num; i ++) { +#ifdef __aarch64__ + __asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue[i].sse_mode)); +#else __asm__ __volatile__ ("fnstcw %0" : "=m" (queue[i].x87_mode)); __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue[i].sse_mode)); +#endif } #endif diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 33b58f1340..afa33ccccd 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -278,12 +278,15 @@ static DWORD WINAPI blas_thread_server(void *arg){ } else #endif if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ +#ifdef BUILD_DOUBLE sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - +#endif } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { +#ifdef BUILD_SINGLE sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif } else { /* Other types in future */ } @@ -295,11 +298,15 @@ static DWORD WINAPI blas_thread_server(void *arg){ } else #endif if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ +#ifdef BUILD_COMPLEX16 sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif } else if ((queue -> mode & BLAS_PREC) == 
BLAS_SINGLE) { +#ifdef BUILD_COMPLEX sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif } else { /* Other types in future */ } diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 9a693b06f0..f619309834 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -1018,7 +1018,7 @@ static gotoblas_t *force_coretype(char *coretype){ char message[128]; //char mname[20]; - for ( i=1 ; i <= 24; i++) + for ( i=1 ; i <= 25; i++) { if (!strncasecmp(coretype,corename[i],20)) { diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index d88b1da404..0f47b287cc 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -125,8 +125,13 @@ extern gotoblas_t gotoblas_THUNDERX2T99; extern gotoblas_t gotoblas_TSV110; extern gotoblas_t gotoblas_EMAG8180; extern gotoblas_t gotoblas_NEOVERSEN1; +#ifndef NO_SVE extern gotoblas_t gotoblas_NEOVERSEV1; extern gotoblas_t gotoblas_NEOVERSEN2; +#else +#define gotoblas_NEOVERSEV1 gotoblas_ARMV8 +#define gotoblas_NEOVERSEN2 gotoblas_ARMV8 +#endif extern gotoblas_t gotoblas_THUNDERX3T110; extern gotoblas_t gotoblas_CORTEXA55; #endif @@ -237,7 +242,7 @@ static gotoblas_t *get_coretype(void) { p = (char *) NULL ; infile = fopen("/sys/devices/system/cpu/cpu0/regs/identification/midr_el1","r"); if (!infile) return NULL; - fgets(buffer, sizeof(buffer), infile); + (void)fgets(buffer, sizeof(buffer), infile); midr_el1=strtoul(buffer,NULL,16); fclose(infile); #else @@ -274,10 +279,12 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_CORTEXA73; case 0xd0c: // Neoverse N1 return &gotoblas_NEOVERSEN1; +#ifndef NO_SVE case 0xd49: return &gotoblas_NEOVERSEN2; case 0xd40: return &gotoblas_NEOVERSEV1; +#endif case 0xd05: // Cortex A55 return &gotoblas_CORTEXA55; } diff --git a/driver/others/dynamic_mips64.c b/driver/others/dynamic_mips64.c index 9fd19d7397..7fc347b0c0 100644 --- 
a/driver/others/dynamic_mips64.c +++ b/driver/others/dynamic_mips64.c @@ -38,22 +38,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" +#if (defined OS_LINUX || defined OS_ANDROID) +#include +#include + +#ifndef HWCAP_LOONGSON_CPUCFG +#define HWCAP_LOONGSON_CPUCFG (1 << 14) +#endif +#endif + +#ifdef DYNAMIC_LIST +extern gotoblas_t gotoblas_MIPS64_GENERIC; +#ifdef DYN_LOONGSON3R3 extern gotoblas_t gotoblas_LOONGSON3R3; +#else +#define gotoblas_LOONGSON3R3 gotoblas_MIPS64_GENERIC +#endif +#ifdef DYN_LOONGSON3R4 extern gotoblas_t gotoblas_LOONGSON3R4; +#else +#define gotoblas_LOONGSON3R4 gotoblas_MIPS64_GENERIC +#endif +#else +extern gotoblas_t gotoblas_LOONGSON3R3; +extern gotoblas_t gotoblas_LOONGSON3R4; +extern gotoblas_t gotoblas_MIPS64_GENERIC; +#endif extern void openblas_warning(int verbose, const char * msg); -#define NUM_CORETYPES 2 +#define NUM_CORETYPES 3 static char *corename[] = { + "MIPS64_GENERIC", "loongson3r3", "loongson3r4", "UNKNOWN" }; char *gotoblas_corename(void) { - if (gotoblas == &gotoblas_LOONGSON3R3) return corename[0]; - if (gotoblas == &gotoblas_LOONGSON3R4) return corename[1]; + if (gotoblas == &gotoblas_MIPS64_GENERIC) return corename[0]; + if (gotoblas == &gotoblas_LOONGSON3R3) return corename[1]; + if (gotoblas == &gotoblas_LOONGSON3R4) return corename[2]; return corename[NUM_CORETYPES]; } @@ -73,77 +99,32 @@ static gotoblas_t *force_coretype(char *coretype) { switch (found) { - case 0: return (&gotoblas_LOONGSON3R3); - case 1: return (&gotoblas_LOONGSON3R4); + case 0: return (&gotoblas_MIPS64_GENERIC); + case 1: return (&gotoblas_LOONGSON3R3); + case 2: return (&gotoblas_LOONGSON3R4); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); return NULL; } +#if (defined OS_LINUX || defined OS_ANDROID) #define MMI_MASK 0x00000010 #define MSA_MASK 0x00000020 -int fd[2]; -int support_cpucfg; - -static void handler(int signum) -{ - close(fd[1]); - 
exit(1); -} - -/* Brief : Function to check if cpucfg supported on loongson - * Return: 1 supported - * 0 not supported - */ -static int cpucfg_test(void) { - pid_t pid; - int status = 0; - - support_cpucfg = 0; - pipe(fd); - pid = fork(); - if (pid == 0) { /* Subprocess */ - struct sigaction act; - close(fd[0]); - /* Set signal action for SIGILL. */ - act.sa_handler = handler; - sigaction(SIGILL,&act,NULL); - - /* Execute cpucfg in subprocess. */ - __asm__ volatile( - ".insn \n\t" - ".word (0xc8080118) \n\t" - ::: - ); - support_cpucfg = 1; - write(fd[1],&support_cpucfg,sizeof(support_cpucfg)); - close(fd[1]); - exit(0); - } else if (pid > 0){ /* Parent process*/ - close(fd[1]); - if ((waitpid(pid,&status,0) <= 0) || - (read(fd[0],&support_cpucfg,sizeof(support_cpucfg)) <= 0)) - support_cpucfg = 0; - close(fd[0]); - } else { - support_cpucfg = 0; - } - - return support_cpucfg; -} - static gotoblas_t *get_coretype_from_cpucfg(void) { int flag = 0; __asm__ volatile( + ".set push \n\t" + ".set noat \n\t" ".insn \n\t" - "dli $8, 0x01 \n\t" - ".word (0xc9084918) \n\t" - "usw $9, 0x00(%0) \n\t" + "dli $1, 0x01 \n\t" + ".word (0xc8080118) \n\t" + "move %0, $1 \n\t" + ".set pop \n\t" + : "=r"(flag) + : : - : "r"(&flag) - : "memory" ); if (flag & MSA_MASK) return (&gotoblas_LOONGSON3R4); @@ -153,7 +134,7 @@ static gotoblas_t *get_coretype_from_cpucfg(void) { } static gotoblas_t *get_coretype_from_cpuinfo(void) { -#ifdef linux +#ifdef __linux FILE *infile; char buffer[512], *p; @@ -176,17 +157,19 @@ static gotoblas_t *get_coretype_from_cpuinfo(void) { return NULL; } #endif - return NULL; + return NULL; } +#endif static gotoblas_t *get_coretype(void) { - int ret = 0; - - ret = cpucfg_test(); - if (ret == 1) - return get_coretype_from_cpucfg(); - else - return get_coretype_from_cpuinfo(); +#if (!defined OS_LINUX && !defined OS_ANDROID) + return NULL; +#else + if (!(getauxval(AT_HWCAP) & HWCAP_LOONGSON_CPUCFG)) + return get_coretype_from_cpucfg(); + else + return 
get_coretype_from_cpuinfo(); +#endif } void gotoblas_dynamic_init(void) { @@ -208,9 +191,9 @@ void gotoblas_dynamic_init(void) { if (gotoblas == NULL) { - snprintf(coremsg, 128, "Falling back to loongson3r3 core\n"); + snprintf(coremsg, 128, "Falling back to MIPS64_GENERIC\n"); openblas_warning(1, coremsg); - gotoblas = &gotoblas_LOONGSON3R3; + gotoblas = &gotoblas_MIPS64_GENERIC; } if (gotoblas && gotoblas->init) { diff --git a/driver/others/init.c b/driver/others/init.c index cc3145a623..cd10e8d369 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -823,6 +823,8 @@ void gotoblas_affinity_init(void) { if (numprocs == 0) numprocs = readenv_atoi("OMP_NUM_THREADS"); + if (numprocs == 0) numprocs = readenv_atoi("OPENBLAS_DEFAULT_NUM_THREADS"); + numnodes = 1; if (numprocs == 1) { diff --git a/driver/others/memory.c b/driver/others/memory.c index a9b4650eb4..4493b7d711 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -249,8 +249,11 @@ int get_num_procs(void) { #if defined(USE_OPENMP) #if _OPENMP >= 201511 + int i,n; + n = 0; ret = omp_get_num_places(); - if (ret >0 ) nums = ret; + if (ret > 0) for (i=0; i 0) nums = n; #endif return (nums > 0 ? nums : 2); #endif @@ -419,6 +422,8 @@ This value is equal or large than blas_cpu_number. This means some threads are s */ int blas_num_threads = 0; +int blas_num_threads_set = 0; + int goto_get_num_procs (void) { return blas_cpu_number; } @@ -1820,8 +1825,11 @@ int get_num_procs(void) { #if defined(USE_OPENMP) /* if (omp_get_proc_bind() != omp_proc_bind_false) */ #if _OPENMP >= 201511 + int i,n; + n = 0; ret = omp_get_num_places(); - if (ret >0 ) nums = ret; + if (ret > 0) for (i=0;i 0) nums = n; #endif return (nums > 0 ? nums :2); #endif @@ -1988,6 +1996,8 @@ This value is equal or large than blas_cpu_number. 
This means some threads are s */ int blas_num_threads = 0; +int blas_num_threads_set = 0; + int goto_get_num_procs (void) { return blas_cpu_number; } diff --git a/driver/others/memory_qalloc.c b/driver/others/memory_qalloc.c index 6174d9b75e..0b38d1887c 100644 --- a/driver/others/memory_qalloc.c +++ b/driver/others/memory_qalloc.c @@ -283,6 +283,7 @@ The numbers of threads in the thread pool. This value is equal or large than blas_cpu_number. This means some threads are sleep. */ int blas_num_threads = 0; +int blas_num_threads_set = 0; int goto_get_num_procs (void) { return blas_cpu_number; diff --git a/driver/others/openblas_env.c b/driver/others/openblas_env.c index ef91a08e6e..35b2270d44 100644 --- a/driver/others/openblas_env.c +++ b/driver/others/openblas_env.c @@ -67,10 +67,16 @@ void openblas_read_env() { openblas_env_thread_timeout=(unsigned int)ret; ret=0; - if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p); + if (readenv(p,"OPENBLAS_DEFAULT_NUM_THREADS")) ret = atoi(p); if(ret<0) ret=0; openblas_env_openblas_num_threads=ret; + ret=0; + if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p); + if(ret<0) ret=0; + if(ret != 0 || openblas_env_openblas_num_threads == 0) + openblas_env_openblas_num_threads=ret; + ret=0; if (readenv(p,"GOTO_NUM_THREADS")) ret = atoi(p); if(ret<0) ret=0; diff --git a/exports/gensymbol b/exports/gensymbol index 83222a2157..f05de626f7 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -4000,6 +4000,22 @@ case "$p1" in no_underscore_objs="$no_underscore_objs $misc_common_objs" printf 'int main(void){\n' + for obj in $underscore_objs; do + [ "$obj" != "xerbla" ] && printf 'extern void %s%s%s%s();\n' \ + "$symbolprefix" "$obj" "$bu" "$symbolsuffix" + done + + for obj in $need_2underscore_objs; do + printf 'extern void %s%s%s%s%s();\n' \ + "$symbolprefix" "$obj" "$bu" "$bu" "$symbolsuffix" + done + + for obj in $no_underscore_objs; do + printf 'extern void %s%s%s();\n' \ + "$symbolprefix" "$obj" "$symbolsuffix" + done + + 
printf '\n' for obj in $underscore_objs; do [ "$obj" != "xerbla" ] && printf '%s%s%s%s();\n' \ "$symbolprefix" "$obj" "$bu" "$symbolsuffix" diff --git a/exports/gensymbol.pl b/exports/gensymbol.pl index ac62bc058e..e38a3cc893 100644 --- a/exports/gensymbol.pl +++ b/exports/gensymbol.pl @@ -3955,6 +3955,18 @@ @no_underscore_objs = (@no_underscore_objs, @misc_common_objs); print "int main(void){\n"; + foreach $objs (@underscore_objs) { + print "extern void ", $symbolprefix, $objs, $bu, $symbolsuffix, "();\n" if $objs ne "xerbla"; + } + + foreach $objs (@need_2underscore_objs) { + print "extern void ", $symbolprefix, $objs, $bu, $bu, $symbolsuffix, "();\n"; + } + + foreach $objs (@no_underscore_objs) { + print "extern void ", $symbolprefix, $objs, $symbolsuffix, "();\n"; + } + foreach $objs (@underscore_objs) { print $symbolprefix, $objs, $bu, $symbolsuffix, "();\n" if $objs ne "xerbla"; } diff --git a/f_check b/f_check index bb13e16403..d071e016e1 100755 --- a/f_check +++ b/f_check @@ -82,10 +82,10 @@ else vendor=FUJITSU openmp='-Kopenmp' ;; - *Cray*) + *Hewlett*) vendor=CRAY openmp='-fopenmp' - ;; + ;; *GNU*|*GCC*) v="${data#*GCC: *\) }" @@ -102,7 +102,7 @@ else vendor=FLANG openmp='-fopenmp' ;; - *ifx*) + *ifort*|*ifx*) vendor=INTEL openmp='-fopenmp' ;; @@ -117,6 +117,10 @@ else esac fi ;; + *Cray*) + vendor=CRAY + openmp='-fopenmp' + ;; *g95*) vendor=G95 openmp='' diff --git a/f_check.pl b/f_check.pl index cfc7331c21..889aedd050 100644 --- a/f_check.pl +++ b/f_check.pl @@ -76,11 +76,6 @@ $vendor = FUJITSU; $openmp = "-Kopenmp"; - } elsif ($data =~ /Cray/) { - - $vendor = CRAY; - $openmp = "-fopenmp"; - } elsif ($data =~ /GNU/ || $data =~ /GCC/ ) { $data =~ s/\(+.*?\)+//g; @@ -95,7 +90,7 @@ if ($compiler =~ /flang/) { $vendor = FLANG; $openmp = "-fopenmp"; - } elsif ($compiler =~ /ifx/) { + } elsif ($compiler =~ /ifort/ || $compiler =~ /ifx/) { $vendor = INTEL; $openmp = "-fopenmp"; } elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) { @@ -106,6 +101,10 @@ $openmp = 
""; } } + } elsif ($data =~ /Cray/) { + + $vendor = CRAY; + $openmp = "-fopenmp"; } diff --git a/getarch.c b/getarch.c index 7761551ea9..937a8db68c 100644 --- a/getarch.c +++ b/getarch.c @@ -131,6 +131,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_PPC440 */ /* #define FORCE_PPC440FP2 */ /* #define FORCE_CELL */ +/* #define FORCE_MIPS64_GENERIC */ /* #define FORCE_SICORTEX */ /* #define FORCE_LOONGSON3R3 */ /* #define FORCE_LOONGSON3R4 */ @@ -146,6 +147,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_SPARCV7 */ /* #define FORCE_ZARCH_GENERIC */ /* #define FORCE_Z13 */ +/* #define FORCE_EV4 */ +/* #define FORCE_EV5 */ +/* #define FORCE_EV6 */ /* #define FORCE_GENERIC */ #ifdef FORCE_P2 @@ -915,6 +919,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "CELL" #endif +#ifdef FORCE_MIPS64_GENERIC +#define FORCE +#define ARCHITECTURE "MIPS" +#define SUBARCHITECTURE "MIPS64_GENERIC" +#define SUBDIRNAME "mips64" +#define ARCHCONFIG "-DMIPS64_GENERIC " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "mips64_generic" +#define CORENAME "MIPS64_GENERIC" +#else +#endif + #ifdef FORCE_SICORTEX #define FORCE #define ARCHITECTURE "MIPS" @@ -951,7 +969,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DLOONGSON3R4 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 -DHAVE_MSA" #define LIBNAME "loongson3r4" #define CORENAME "LOONGSON3R4" #else @@ -965,7 +983,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ARCHCONFIG "-DLOONGSON3R5 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA" #define LIBNAME "loongson3r5" #define CORENAME "LOONGSON3R5" #else @@ -979,7 +997,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DLOONGSON2K1000 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA" #define LIBNAME "loongson2k1000" #define CORENAME "LOONGSON2K1000" #else @@ -993,7 +1011,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DLOONGSONGENERIC " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA" #define LIBNAME "loongsongeneric" #define CORENAME "LOONGSONGENERIC" #else @@ -1007,7 +1025,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DI6400 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DHAVE_MSA " #define LIBNAME "i6400" #define CORENAME "I6400" #else @@ -1035,7 +1053,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ARCHCONFIG "-DP5600 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA" + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8" #define LIBNAME "p5600" #define CORENAME "P5600" #else @@ -1049,7 +1067,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DMIPS1004K " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA" + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8" #define LIBNAME "mips1004K" #define CORENAME "MIPS1004K" #else @@ -1063,7 +1081,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DMIPS24K " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=32768 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA" + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8" #define LIBNAME "mips24K" #define CORENAME "MIPS24K" #else @@ -1077,7 +1095,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DI6500 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DHAVE_MSA" #define LIBNAME "i6500" #define CORENAME "I6500" #else @@ -1392,7 +1410,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ - "-march=armv8.4-a -mtune=neoverse-v1" + "-march=armv8.4-a+sve -mtune=neoverse-v1" #define LIBNAME "neoversev1" #define CORENAME "NEOVERSEV1" #endif @@ -1601,6 +1619,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "Z14" #endif +#ifdef FORCE_EV4 +#define FORCE +#define ARCHITECTURE "ALPHA" +#define SUBARCHITECTURE "ev4" +#define ARCHCONFIG "-DEV4 " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=2097152 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=8192 " +#define LIBNAME "ev4" +#define CORENAME "EV4" +#endif + +#ifdef FORCE_EV5 +#define FORCE +#define ARCHITECTURE "ALPHA" +#define SUBARCHITECTURE "ev5" +#define ARCHCONFIG "-DEV5 " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=2097152 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=8192 " +#define LIBNAME "ev5" +#define CORENAME "EV5" +#endif + +#ifdef FORCE_EV6 +#define FORCE +#define ARCHITECTURE "ALPHA" +#define SUBARCHITECTURE "ev6" +#define ARCHCONFIG "-DEV6 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=4194304 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=8192 " +#define LIBNAME "ev6" +#define CORENAME "EV6" +#endif + #ifdef FORCE_C910V #define FORCE #define ARCHITECTURE "RISCV64" @@ -1777,7 +1831,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("CORE=%s\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) 
printf("CORE=%s\n", get_corename()); #endif #endif diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 0b2998237d..4e082928bb 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -53,7 +53,7 @@ set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES # these do not have separate 'z' sources set(BLAS3_SOURCES gemm.c symm.c - trsm.c syrk.c syr2k.c + trsm.c syrk.c syr2k.c gemmt.c ) set(BLAS3_MANGLED_SOURCES @@ -189,7 +189,16 @@ if (NOT DEFINED NO_LAPACK) ) GenerateNamedObjects("${LAPACK_SOURCES}") + if (NOT RELAPACK_REPLACE) GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" 0 "" "" 0 3) + else () + GenerateNamedObjects("lapack/getrs.c" "" "" 0 "" "" 0 3) + GenerateNamedObjects("lapack/getf2.c" "" "" 0 "" "" 0 3) + GenerateNamedObjects("lapack/potf2.c" "" "" 0 "" "" 0 3) + GenerateNamedObjects("lapack/laswp.c" "" "" 0 "" "" 0 3) + GenerateNamedObjects("lapack/lauu2.c" "" "" 0 "" "" 0 3) + GenerateNamedObjects("lapack/trti2.c" "" "" 0 "" "" 0 3) + endif() endif () if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) diff --git a/interface/Makefile b/interface/Makefile index abdac96e18..3db4b2b6d4 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -44,12 +44,12 @@ SBLAS3OBJS = \ sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \ strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \ somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ - sgeadd.$(SUFFIX) + sgeadd.$(SUFFIX) sgemmt.$(SUFFIX) ifeq ($(BUILD_BFLOAT16),1) SBBLAS1OBJS = sbdot.$(SUFFIX) SBBLAS2OBJS = sbgemv.$(SUFFIX) -SBBLAS3OBJS = sbgemm.$(SUFFIX) +SBBLAS3OBJS = sbgemm.$(SUFFIX) sbgemmt.$(SUFFIX) SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) endif @@ -76,7 +76,7 @@ DBLAS3OBJS = \ dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \ dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \ domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)\ - dgeadd.$(SUFFIX) + dgeadd.$(SUFFIX) dgemmt.$(SUFFIX) CBLAS1OBJS = \ caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ @@ -92,8 +92,9 @@ 
CBLAS2OBJS = \ cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \ ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) \ csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \ - csbmv.$(SUFFIX) \ - cspr2.$(SUFFIX) \ + csbmv.$(SUFFIX) cspmv.$(SUFFIX) \ + cspr.$(SUFFIX) cspr2.$(SUFFIX) \ + csymv.$(SUFFIX) csyr.$(SUFFIX) \ ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \ ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \ chemv.$(SUFFIX) chbmv.$(SUFFIX) \ @@ -105,7 +106,7 @@ CBLAS3OBJS = \ ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \ chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \ comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)\ - cgeadd.$(SUFFIX) + cgeadd.$(SUFFIX) cgemmt.$(SUFFIX) ZBLAS1OBJS = \ zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ @@ -121,8 +122,9 @@ ZBLAS2OBJS = \ zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \ ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) \ zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \ - zsbmv.$(SUFFIX) \ - zspr2.$(SUFFIX) \ + zsbmv.$(SUFFIX) zspmv.$(SUFFIX) \ + zspr.$(SUFFIX) zspr2.$(SUFFIX) \ + zsymv.$(SUFFIX) zsyr.$(SUFFIX) \ ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \ ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \ zhemv.$(SUFFIX) zhbmv.$(SUFFIX) \ @@ -134,7 +136,7 @@ ZBLAS3OBJS = \ ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \ zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \ zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)\ - zgeadd.$(SUFFIX) + zgeadd.$(SUFFIX) zgemmt.$(SUFFIX) ifeq ($(SUPPORT_GEMM3M), 1) @@ -281,12 +283,12 @@ CSBLAS2OBJS = \ CSBLAS3OBJS = \ cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \ cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ - cblas_sgeadd.$(SUFFIX) + cblas_sgeadd.$(SUFFIX) cblas_sgemmt.$(SUFFIX) ifeq ($(BUILD_BFLOAT16),1) CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX) CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX) -CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) +CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) cblas_sbgemmt.$(SUFFIX) CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) endif @@ -306,7 +308,7 
@@ CDBLAS2OBJS = \ CDBLAS3OBJS += \ cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \ cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \ - cblas_dgeadd.$(SUFFIX) + cblas_dgeadd.$(SUFFIX) cblas_dgemmt.$(SUFFIX) CCBLAS1OBJS = \ cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ @@ -331,7 +333,7 @@ CCBLAS3OBJS = \ cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\ - cblas_cgeadd.$(SUFFIX) + cblas_cgeadd.$(SUFFIX) cblas_cgemmt.$(SUFFIX) CXERBLAOBJ = \ cblas_xerbla.$(SUFFIX) @@ -362,7 +364,7 @@ CZBLAS3OBJS = \ cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \ cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\ cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \ - cblas_zgeadd.$(SUFFIX) + cblas_zgeadd.$(SUFFIX) cblas_zgemmt.$(SUFFIX) ifeq ($(SUPPORT_GEMM3M), 1) @@ -1300,6 +1302,8 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c ifeq ($(BUILD_BFLOAT16),1) sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) +sbgemmt.$(SUFFIX) sbgemm.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) endif sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h @@ -1320,6 +1324,24 @@ zgemm.$(SUFFIX) zgemm.$(PSUFFIX) : gemm.c ../param.h xgemm.$(SUFFIX) xgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) +sgemmt.$(SUFFIX) sgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgemmt.$(SUFFIX) dgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +qgemmt.$(SUFFIX) qgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgemmt.$(SUFFIX) cgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgemmt.$(SUFFIX) zgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +xgemmt.$(SUFFIX) 
xgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1907,6 +1929,23 @@ cblas_cgemm.$(SUFFIX) cblas_cgemm.$(PSUFFIX) : gemm.c ../param.h cblas_zgemm.$(SUFFIX) cblas_zgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +ifeq ($(BUILD_BFLOAT16),1) +cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +endif + +cblas_dgemmt.$(SUFFIX) cblas_dgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_cgemmt.$(SUFFIX) cblas_cgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zgemmt.$(SUFFIX) cblas_zgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + cblas_ssymm.$(SUFFIX) cblas_ssymm.$(PSUFFIX) : symm.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) diff --git a/interface/gemmt.c b/interface/gemmt.c new file mode 100644 index 0000000000..3eed1dfe49 --- /dev/null +++ b/interface/gemmt.c @@ -0,0 +1,589 @@ +/*********************************************************************/ +/* Copyright 2022, The OpenBLAS Project. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef COMPLEX +#define SMP_THRESHOLD_MIN 65536.0 +#ifdef XDOUBLE +#define ERROR_NAME "QGEMT " +#elif defined(DOUBLE) +#define ERROR_NAME "DGEMT " +#elif defined(BFLOAT16) +#define ERROR_NAME "SBGEMT " +#else +#define ERROR_NAME "SGEMT " +#endif +#else +#define SMP_THRESHOLD_MIN 8192.0 +#ifdef XDOUBLE +#define ERROR_NAME "XGEMT " +#elif defined(DOUBLE) +#define ERROR_NAME "ZGEMT " +#else +#define ERROR_NAME "CGEMT " +#endif +#endif + +#ifndef GEMM_MULTITHREAD_THRESHOLD +#define GEMM_MULTITHREAD_THRESHOLD 4 +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANSA, char *TRANSB, + blasint * M, blasint * N, blasint * K, + FLOAT * Alpha, + IFLOAT * a, blasint * ldA, + IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) +{ + + blasint m, n, k; + blasint lda, ldb, ldc; + int transa, transb, uplo; + blasint info; + + char transA, transB, Uplo; + IFLOAT *buffer; + IFLOAT *aa, *bb; + FLOAT *cc; +#if defined(COMPLEX) 
+ FLOAT alpha_r, alpha_i, beta_r, beta_i; +#else + FLOAT alpha, beta; +#endif + + PRINT_DEBUG_NAME; + + m = *M; + n = *N; + k = *K; + +#if defined(COMPLEX) + FLOAT *alpha = Alpha; + alpha_r = *(Alpha + 0); + alpha_i = *(Alpha + 1); + + beta_r = *(Beta + 0); + beta_i = *(Beta + 1); +#else + alpha = *Alpha; + beta = *Beta; +#endif + + lda = *ldA; + ldb = *ldB; + ldc = *ldC; + + transA = *TRANSA; + transB = *TRANSB; + Uplo = *UPLO; + TOUPPER(transA); + TOUPPER(transB); + TOUPPER(Uplo); + + transa = -1; + transb = -1; + uplo = -1; + + if (transA == 'N') + transa = 0; + if (transA == 'T') + transa = 1; +#ifndef COMPLEX + if (transA == 'R') + transa = 0; + if (transA == 'C') + transa = 1; +#else + if (transA == 'R') + transa = 2; + if (transA == 'C') + transa = 3; +#endif + + if (transB == 'N') + transb = 0; + if (transB == 'T') + transb = 1; +#ifndef COMPLEX + if (transB == 'R') + transb = 0; + if (transB == 'C') + transb = 1; +#else + if (transB == 'R') + transb = 2; + if (transB == 'C') + transb = 3; +#endif + + if (Uplo == 'U') + uplo = 0; + if (Uplo == 'L') + uplo = 1; + + info = 0; + + if (uplo < 0) + info = 14; + if (ldc < m) + info = 13; + if (k < 0) + info = 5; + if (n < 0) + info = 4; + if (m < 0) + info = 3; + if (transb < 0) + info = 2; + if (transa < 0) + info = 1; + + if (info) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, + blasint N, blasint k, +#ifndef COMPLEX + FLOAT alpha, + IFLOAT * A, blasint LDA, + IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc) +{ +#else + void *valpha, + void *va, blasint LDA, + void *vb, blasint LDB, void *vbeta, void *vc, blasint ldc) +{ + FLOAT *alpha = (FLOAT *) valpha; + FLOAT *beta = (FLOAT *) vbeta; + FLOAT *A = (FLOAT *) va; + FLOAT *B = (FLOAT *) vb; + FLOAT *c = (FLOAT *) vc; +#endif + FLOAT *aa, *bb, *cc; + + int transa, transb, uplo; + 
blasint info; + blasint m, n, lda, ldb; + FLOAT *a, *b; + XFLOAT *buffer; + + PRINT_DEBUG_CNAME; + + transa = -1; + transb = -1; + info = 0; + + if (order == CblasColMajor) { + + if (TransA == CblasNoTrans) + transa = 0; + if (TransA == CblasTrans) + transa = 1; +#ifndef COMPLEX + if (TransA == CblasConjNoTrans) + transa = 0; + if (TransA == CblasConjTrans) + transa = 1; +#else + if (TransA == CblasConjNoTrans) + transa = 2; + if (TransA == CblasConjTrans) + transa = 3; +#endif + if (TransB == CblasNoTrans) + transb = 0; + if (TransB == CblasTrans) + transb = 1; +#ifndef COMPLEX + if (TransB == CblasConjNoTrans) + transb = 0; + if (TransB == CblasConjTrans) + transb = 1; +#else + if (TransB == CblasConjNoTrans) + transb = 2; + if (TransB == CblasConjTrans) + transb = 3; +#endif + + m = M; + n = N; + + a = (void *)A; + b = (void *)B; + lda = LDA; + ldb = LDB; + + info = -1; + + if (ldc < m) + info = 13; + if (k < 0) + info = 5; + if (n < 0) + info = 4; + if (m < 0) + info = 3; + if (transb < 0) + info = 2; + if (transa < 0) + info = 1; + } + + if (order == CblasRowMajor) { + m = N; + n = M; + + a = (void *)B; + b = (void *)A; + + lda = LDB; + ldb = LDA; + + if (TransB == CblasNoTrans) + transa = 0; + if (TransB == CblasTrans) + transa = 1; +#ifndef COMPLEX + if (TransB == CblasConjNoTrans) + transa = 0; + if (TransB == CblasConjTrans) + transa = 1; +#else + if (TransB == CblasConjNoTrans) + transa = 2; + if (TransB == CblasConjTrans) + transa = 3; +#endif + if (TransA == CblasNoTrans) + transb = 0; + if (TransA == CblasTrans) + transb = 1; +#ifndef COMPLEX + if (TransA == CblasConjNoTrans) + transb = 0; + if (TransA == CblasConjTrans) + transb = 1; +#else + if (TransA == CblasConjNoTrans) + transb = 2; + if (TransA == CblasConjTrans) + transb = 3; +#endif + + info = -1; + + if (ldc < m) + info = 13; + if (k < 0) + info = 5; + if (n < 0) + info = 4; + if (m < 0) + info = 3; + if (transb < 0) + info = 2; + if (transa < 0) + info = 1; + + } + + uplo = -1; + if (Uplo == 
CblasUpper) + uplo = 0; + if (Uplo == CblasLower) + uplo = 1; + if (uplo < 0) + info = 14; + + if (info >= 0) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } +#if defined(COMPLEX) + FLOAT alpha_r = *(alpha + 0); + FLOAT alpha_i = *(alpha + 1); + + FLOAT beta_r = *(beta + 0); + FLOAT beta_i = *(beta + 1); +#endif + +#endif + int buffer_size; + blasint l; + blasint i, j; + +#ifdef SMP + int nthreads; +#endif + +#if defined(COMPLEX) + +#ifdef SMP + static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT *, FLOAT *, + BLASLONG, FLOAT *, BLASLONG, FLOAT *, + BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xgemv_thread_n, xgemv_thread_t, xgemv_thread_r, xgemv_thread_c, + xgemv_thread_o, xgemv_thread_u, xgemv_thread_s, + xgemv_thread_d, +#elif defined DOUBLE + zgemv_thread_n, zgemv_thread_t, zgemv_thread_r, zgemv_thread_c, + zgemv_thread_o, zgemv_thread_u, zgemv_thread_s, + zgemv_thread_d, +#else + cgemv_thread_n, cgemv_thread_t, cgemv_thread_r, cgemv_thread_c, + cgemv_thread_o, cgemv_thread_u, cgemv_thread_s, + cgemv_thread_d, +#endif + }; +#endif + + int (*gemv[]) (BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, + BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, + FLOAT *) = { + GEMV_N, GEMV_T, GEMV_R, GEMV_C, GEMV_O, GEMV_U, GEMV_S, GEMV_D,}; + +#else + +#ifdef SMP + static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, FLOAT *, + BLASLONG, FLOAT *, BLASLONG, FLOAT *, + BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qgemv_thread_n, qgemv_thread_t, +#elif defined DOUBLE + dgemv_thread_n, dgemv_thread_t, +#else + sgemv_thread_n, sgemv_thread_t, +#endif + }; +#endif + int (*gemv[]) (BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, + FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + GEMV_N, GEMV_T,}; + +#endif + + if ((m == 0) || (n == 0)) + return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + const blasint incb = (transb == 0) ? 
1 : ldb; + + if (uplo == 1) { + for (i = 0; i < n; i++) { + j = n - i; + + l = j; +#if defined(COMPLEX) + aa = a + i * 2; + bb = b + i * ldb * 2; + if (transa) { + l = k; + aa = a + lda * i * 2; + bb = b + i * 2; + } + cc = c + i * 2 * ldc + i * 2; +#else + aa = a + i; + bb = b + i * ldb; + if (transa) { + l = k; + aa = a + lda * i; + bb = b + i; + } + cc = c + i * ldc + i; +#endif + +#if defined(COMPLEX) + if (beta_r != ONE || beta_i != ZERO) + SCAL_K(l, 0, 0, beta_r, beta_i, cc, 1, NULL, 0, + NULL, 0); + + if (alpha_r == ZERO && alpha_i == ZERO) + return; +#else + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); + + if (alpha == ZERO) + continue; +#endif + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, FLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + +#if defined(COMPLEX) + (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, + aa, lda, bb, incb, cc, 1, + buffer); +#else + (gemv[(int)transa]) (j, k, 0, alpha, aa, lda, + bb, incb, cc, 1, buffer); +#endif +#ifdef SMP + } else { + + (gemv_thread[(int)transa]) (j, k, alpha, aa, + lda, bb, incb, cc, + 1, buffer, + nthreads); + + } +#endif + + STACK_FREE(buffer); + } + } else { + + for (i = 0; i < n; i++) { + j = i + 1; + + l = j; +#if defined COMPLEX + bb = b + i * ldb * 2; + if (transa) { + l = k; + bb = b + i * 2; + } + cc = c + i * 2 * ldc; +#else + bb = b + i * ldb; + if (transa) { + l = k; + bb = b + i; + } + cc = c + i * ldc; +#endif + +#if defined(COMPLEX) + if (beta_r != ONE || beta_i != ZERO) + SCAL_K(l, 0, 0, beta_r, beta_i, cc, 1, NULL, 0, + NULL, 0); + + if (alpha_r == ZERO && alpha_i == ZERO) + return; +#else + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 
0); + + if (alpha == ZERO) + continue; +#endif + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, FLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + +#if defined(COMPLEX) + (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, + a, lda, bb, incb, cc, 1, + buffer); +#else + (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb, + incb, cc, 1, buffer); +#endif + +#ifdef SMP + } else { + + (gemv_thread[(int)transa]) (j, k, alpha, a, lda, + bb, incb, cc, 1, + buffer, nthreads); + + } +#endif + + STACK_FREE(buffer); + } + } + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, + args.m * args.k + args.k * args.n + + args.m * args.n, 2 * args.m * args.n * args.k); + + IDEBUG_END; + + return; +} diff --git a/interface/symm.c b/interface/symm.c index 0e29a5f482..3e65e69b13 100644 --- a/interface/symm.c +++ b/interface/symm.c @@ -44,6 +44,7 @@ #endif #ifndef COMPLEX +#define SMP_THRESHOLD_MIN 65536. #ifdef XDOUBLE #define ERROR_NAME "QSYMM " #elif defined(DOUBLE) @@ -52,6 +53,7 @@ #define ERROR_NAME "SSYMM " #endif #else +#define SMP_THRESHOLD_MIN 8192. 
#ifndef GEMM3M #ifndef HEMM #ifdef XDOUBLE @@ -91,6 +93,10 @@ #endif #endif +#ifndef GEMM_MULTITHREAD_THRESHOLD +#define GEMM_MULTITHREAD_THRESHOLD 4 +#endif + #ifdef SMP #ifndef COMPLEX @@ -159,7 +165,9 @@ void NAME(char *SIDE, char *UPLO, #if defined(SMP) && !defined(NO_AFFINITY) int nodes; #endif - +# if defined(SMP) + int MN; +#endif blasint info; int side; int uplo; @@ -255,6 +263,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, #if defined(SMP) && !defined(NO_AFFINITY) int nodes; #endif +#if defined(SMP) + int MN; +#endif PRINT_DEBUG_CNAME; @@ -375,15 +386,18 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, #ifdef SMP args.common = NULL; - args.nthreads = num_cpu_avail(3); - + MN = 2.* (double) args.m * (double)args.m * (double) args.n; + if (MN <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) { + args.nthreads = 1; + } else { + args.nthreads = num_cpu_avail(3); + } if (args.nthreads == 1) { #endif (symm[(side << 1) | uplo ])(&args, NULL, NULL, sa, sb, 0); #ifdef SMP - } else { #ifndef NO_AFFINITY diff --git a/interface/symv.c b/interface/symv.c index 07bd200227..1f23ce4ee5 100644 --- a/interface/symv.c +++ b/interface/symv.c @@ -180,6 +180,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP + if (n <200) + nthreads=1; + else nthreads = num_cpu_avail(2); if (nthreads == 1) { diff --git a/interface/syr2k.c b/interface/syr2k.c index a72330c0bd..47df7f89f0 100644 --- a/interface/syr2k.c +++ b/interface/syr2k.c @@ -368,6 +368,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr mode |= (uplo << BLAS_UPLO_SHIFT); args.common = NULL; + if (args.n*args.k <1000) + args.nthreads =1 ; + else args.nthreads = num_cpu_avail(3); if (args.nthreads == 1) { diff --git a/interface/syrk.c b/interface/syrk.c index edb113d6cf..3b056aec8b 100644 --- a/interface/syrk.c +++ 
b/interface/syrk.c @@ -44,6 +44,7 @@ #endif #ifndef COMPLEX +#define SMP_THRESHOLD_MIN 109944. #ifdef XDOUBLE #define ERROR_NAME "QSYRK " #elif defined(DOUBLE) @@ -52,6 +53,7 @@ #define ERROR_NAME "SSYRK " #endif #else +#define SMP_THRESHOLD_MIN 14824. #ifndef HEMM #ifdef XDOUBLE #define ERROR_NAME "XSYRK " @@ -71,6 +73,10 @@ #endif #endif +#ifndef GEMM_MULTITHREAD_THRESHOLD +#define GEMM_MULTITHREAD_THRESHOLD 4 +#endif + static int (*syrk[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { #ifndef HEMM SYRK_UN, SYRK_UC, SYRK_LN, SYRK_LC, @@ -101,6 +107,7 @@ void NAME(char *UPLO, char *TRANS, FLOAT *sa, *sb; #ifdef SMP + int NNK; #ifdef USE_SIMPLE_THREADED_LEVEL3 #ifndef COMPLEX #ifdef XDOUBLE @@ -225,6 +232,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr FLOAT *sa, *sb; #ifdef SMP +int NNK; + #ifdef USE_SIMPLE_THREADED_LEVEL3 #ifndef COMPLEX #ifdef XDOUBLE @@ -354,18 +363,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr #endif args.common = NULL; -#ifndef COMPLEX -#ifdef DOUBLE - if (args.n < 100) -#else - if (args.n < 200) -#endif -#else - if (args.n < 65) -#endif + + NNK = (double)(args.n+1)*(double)args.n*(double)args.k; + if (NNK <= (SMP_THRESHOLD_MIN * GEMM_MULTITHREAD_THRESHOLD)) { args.nthreads = 1; - else + } else { args.nthreads = num_cpu_avail(3); + } if (args.nthreads == 1) { #endif @@ -373,7 +377,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr (syrk[(uplo << 1) | trans ])(&args, NULL, NULL, sa, sb, 0); #ifdef SMP - } else { #ifndef USE_SIMPLE_THREADED_LEVEL3 diff --git a/interface/zsyr.c b/interface/zsyr.c index 54fb8a4e9d..8bc9ac1777 100644 --- a/interface/zsyr.c +++ b/interface/zsyr.c @@ -181,7 +181,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1], x, 1, a, 1, NULL, 0); } - a += lda; + a += lda * 2; } } else { for (i = 0; i < n; i++){ 
@@ -191,7 +191,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1], x + i * 2, 1, a, 1, NULL, 0); } - a += 2 + lda; + a += 2 + lda * 2; } } return; diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index ddc7beb984..60314eedb5 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -237,8 +237,74 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) if (DGEMMOTCOPY) GenerateNamedObjects("${KERNELDIR}/${DGEMMOTCOPY}" "DOUBLE" "${DGEMMOTCOPYOBJ}" false "" "" true "DOUBLE") endif () - GenerateNamedObjects("${KERNELDIR}/${DGEMM_BETA}" "" "gemm_beta" false "" "" false "DOUBLE") - endif () + GenerateNamedObjects("${KERNELDIR}/${DGEMM_BETA}" "" "gemm_beta" false "" "" false "DOUBLE") + GenerateNamedObjects("generic/neg_tcopy_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "DOUBLE") + GenerateNamedObjects("generic/laswp_ncopy_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "DOUBLE") + if (SMALL_MATRIX_OPT) + if (NOT DEFINED DGEMM_SMALL_M_PERMIT) + set(DGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c) + endif () + if (NOT DEFINED DGEMM_SMALL_K_NN) + set(DGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + if (NOT DEFINED DGEMM_SMALL_K_NT) + set(DGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + if (NOT DEFINED DGEMM_SMALL_K_TN) + set(DGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + if (NOT DEFINED DGEMM_SMALL_K_TT) + set(DGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + if (NOT DEFINED DGEMM_SMALL_K_B0_NN) + set(DGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + if (NOT DEFINED DGEMM_SMALL_K_B0_NT) + set(DGEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + if (NOT DEFINED DGEMM_SMALL_K_B0_TN) + set(DGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + if (NOT DEFINED 
DGEMM_SMALL_K_B0_TT) + set(DGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NN}" "NN" "gemm_small_kernel_nn" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NN}" "NR" "gemm_small_kernel_nr" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NN}" "RN" "gemm_small_kernel_rn" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NN}" "RR" "gemm_small_kernel_rr" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NT}" "NT" "gemm_small_kernel_nt" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NT}" "NC" "gemm_small_kernel_nc" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NT}" "RT" "gemm_small_kernel_rt" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NT}" "RC" "gemm_small_kernel_rc" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TN}" "TN" "gemm_small_kernel_tn" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TN}" "TR" "gemm_small_kernel_tr" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TN}" "CN" "gemm_small_kernel_cn" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TN}" "CR" "gemm_small_kernel_cr" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TT}" "TT" "gemm_small_kernel_tt" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TT}" "TC" "gemm_small_kernel_tc" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TT}" "CT" "gemm_small_kernel_ct" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TT}" "CC" 
"gemm_small_kernel_cc" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NN}" "NN;B0" "gemm_small_kernel_b0_nn" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NN}" "NR;B0" "gemm_small_kernel_b0_nr" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NN}" "RN;B0" "gemm_small_kernel_b0_rn" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NN}" "RR;B0" "gemm_small_kernel_b0_rr" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NT}" "NT;B0" "gemm_small_kernel_b0_nt" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NT}" "NC;B0" "gemm_small_kernel_b0_nc" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NT}" "RT;B0" "gemm_small_kernel_b0_rt" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NT}" "RC;B0" "gemm_small_kernel_b0_rc" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TN}" "TN;B0" "gemm_small_kernel_b0_tn" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TN}" "TR;B0" "gemm_small_kernel_b0_tr" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TN}" "CN;B0" "gemm_small_kernel_b0_cn" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TN}" "CR;B0" "gemm_small_kernel_b0_cr" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TT}" "TT;B0" "gemm_small_kernel_b0_tt" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TT}" "TC;B0" "gemm_small_kernel_b0_tc" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TT}" "CT;B0" "gemm_small_kernel_b0_ct" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TT}" "CC;B0" "gemm_small_kernel_b0_cc" false "" 
"" false "DOUBLE") + endif () + + endif () if ((BUILD_DOUBLE OR BUILD_COMPLEX) AND NOT BUILD_SINGLE) GenerateNamedObjects("${KERNELDIR}/${SGEMMKERNEL}" "" "gemm_kernel" false "" "" false "SINGLE") if (SGEMMINCOPY) @@ -825,7 +891,7 @@ endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type}) endforeach () - if (BUILD_DOUBLE AND NOT BUILD_SINGLE) + if ((BUILD_DOUBLE OR BUILD_COMPLEX) AND NOT BUILD_SINGLE) GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false "SINGLE") @@ -849,6 +915,45 @@ endif () GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false "SINGLE") GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false "SINGLE") GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false "SINGLE") + if (SMALL_MATRIX_OPT) + if (NOT DEFINED SGEMM_SMALL_M_PERMIT) + set(SGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c) + endif () + if (NOT DEFINED SGEMM_SMALL_K_NN) + set(SGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + if (NOT DEFINED SGEMM_SMALL_K_NT) + set(SGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + if (NOT DEFINED SGEMM_SMALL_K_TN) + set(SGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + if (NOT DEFINED SGEMM_SMALL_K_TT) + set(SGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + if (NOT DEFINED SGEMM_SMALL_K_B0_NN) + set(SGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + if (NOT DEFINED SGEMM_SMALL_K_B0_NT) + set(SGEMM_SMALL_K_B0_NT 
../generic/gemm_small_matrix_kernel_nt.c) + endif () + if (NOT DEFINED SGEMM_SMALL_K_B0_TN) + set(SGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + if (NOT DEFINED SGEMM_SMALL_K_B0_TT) + set(SGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + GenerateNamedObjects("${KERNELDIR}/${SGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "SINGLE") + endif () + endif () # Makefile.LA @@ -878,25 +983,25 @@ endif () endforeach() if (BUILD_COMPLEX AND NOT BUILD_SINGLE) if (NOT DEFINED SNEG_TCOPY) - set(SNEG_TCOPY ../generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c) + set(SNEG_TCOPY ../generic/neg_tcopy_${SGEMM_UNROLL_M}.c) endif () if (NOT DEFINED SLASWP_NCOPY) - set(SLASWP_NCOPY ../generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c) + set(SLASWP_NCOPY ../generic/laswp_ncopy_${SGEMM_UNROLL_N}.c) endif () GenerateNamedObjects("${KERNELDIR}/${SNEG_TCOPY}" "" "neg_tcopy" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SLASWP_NCOPY}" "" "laswp_ncopy" false "" "" false "SINGLE") endif() 
if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) if (NOT DEFINED DNEG_TCOPY) - set(DNEG_TCOPY ../generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c) + set(DNEG_TCOPY ../generic/neg_tcopy_${DGEMM_UNROLL_M}.c) endif () if (NOT DEFINED DLASWP_NCOPY) - set(DLASWP_NCOPY ../generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c) + set(DLASWP_NCOPY ../generic/laswp_ncopy_${DGEMM_UNROLL_N}.c) endif () - GenerateNamedObjects("${KERNELDIR}/${DNEG_TCOPY}_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" "" false "DOUBLE") - GenerateNamedObjects("${KERNELDIR}/${DLASWP_NCOPY}_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DNEG_TCOPY}" "" "neg_tcopy" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DLASWP_NCOPY}" "" "laswp_ncopy" false "" "" false "DOUBLE") endif() endif() @@ -979,10 +1084,117 @@ endif () endif () if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) + GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DASUMKERNEL}" "" "asum_k" false "" "" false "DOUBLE") + if (DEFINED DMAXKERNEL) + GenerateNamedObjects("${KERNELDIR}/${DMAXKERNEL}" "" "max_k" false "" "" false "DOUBLE") + endif () + if (DEFINED DMINKERNEL) + GenerateNamedObjects("${KERNELDIR}/${DMINKERNEL}" "USE_MIN" "min_k" false "" "" false "DOUBLE") + endif () + if (DEFINED IDMINKERNEL) + GenerateNamedObjects("${KERNELDIR}/${IDMINKERNEL}" "USE_MIN" "i*min_k" false "" "" false "DOUBLE") + endif () + if (DEFINED IDMAXKERNEL) + GenerateNamedObjects("${KERNELDIR}/${IDMAXKERNEL}" "" "i*max_k" false "" "" false "DOUBLE") + endif () + GenerateNamedObjects("${KERNELDIR}/${IDAMAXKERNEL}" "USE_ABS" "i*amax_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${IDAMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k" false "" "" false "DOUBLE") + 
GenerateNamedObjects("${KERNELDIR}/${DSCALKERNEL}" "" "scal_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") + + GenerateNamedObjects("${KERNELDIR}/${DGEMVNKERNEL}" "" "gemv_n" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "" "gemm_kernel" false "" "" false "DOUBLE") + if (DGEMMINCOPY) + GenerateNamedObjects("${KERNELDIR}/${DGEMMINCOPY}" "DOUBLE" "${DGEMMINCOPYOBJ}" false "" "" true "DOUBLE") + endif () + if (DGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${DGEMMITCOPY}" "DOUBLE" "${DGEMMITCOPYOBJ}" false "" "" true "DOUBLE") + endif () + if (DGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${DGEMMONCOPY}" "DOUBLE" "${DGEMMONCOPYOBJ}" false "" "" true "DOUBLE") + endif () + if (DGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${DGEMMOTCOPY}" "DOUBLE" "${DGEMMOTCOPYOBJ}" false "" "" true "DOUBLE") + endif () + GenerateNamedObjects("${KERNELDIR}/${DGEMM_BETA}" "" "gemm_beta" false "" "" false "DOUBLE") + GenerateNamedObjects("generic/neg_tcopy_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "DOUBLE") GenerateNamedObjects("generic/laswp_ncopy_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "DOUBLE") + if (SMALL_MATRIX_OPT) + if (NOT DEFINED DGEMM_SMALL_M_PERMIT) + set(DGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c) + endif () + if (NOT DEFINED 
DGEMM_SMALL_K_NN) + set(DGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + if (NOT DEFINED DGEMM_SMALL_K_NT) + set(DGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + if (NOT DEFINED DGEMM_SMALL_K_TN) + set(DGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + if (NOT DEFINED DGEMM_SMALL_K_TT) + set(DGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + if (NOT DEFINED DGEMM_SMALL_K_B0_NN) + set(DGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + if (NOT DEFINED DGEMM_SMALL_K_B0_NT) + set(DGEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + if (NOT DEFINED DGEMM_SMALL_K_B0_TN) + set(DGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + if (NOT DEFINED DGEMM_SMALL_K_B0_TT) + set(DGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NN}" "NN" "gemm_small_kernel_nn" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NN}" "NR" "gemm_small_kernel_nr" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NN}" "RN" "gemm_small_kernel_rn" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NN}" "RR" "gemm_small_kernel_rr" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NT}" "NT" "gemm_small_kernel_nt" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NT}" "NC" "gemm_small_kernel_nc" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NT}" "RT" "gemm_small_kernel_rt" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NT}" "RC" "gemm_small_kernel_rc" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TN}" "TN" 
"gemm_small_kernel_tn" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TN}" "TR" "gemm_small_kernel_tr" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TN}" "CN" "gemm_small_kernel_cn" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TN}" "CR" "gemm_small_kernel_cr" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TT}" "TT" "gemm_small_kernel_tt" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TT}" "TC" "gemm_small_kernel_tc" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TT}" "CT" "gemm_small_kernel_ct" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TT}" "CC" "gemm_small_kernel_cc" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NN}" "NN;B0" "gemm_small_kernel_b0_nn" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NN}" "NR;B0" "gemm_small_kernel_b0_nr" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NN}" "RN;B0" "gemm_small_kernel_b0_rn" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NN}" "RR;B0" "gemm_small_kernel_b0_rr" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NT}" "NT;B0" "gemm_small_kernel_b0_nt" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NT}" "NC;B0" "gemm_small_kernel_b0_nc" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NT}" "RT;B0" "gemm_small_kernel_b0_rt" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NT}" "RC;B0" "gemm_small_kernel_b0_rc" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TN}" "TN;B0" "gemm_small_kernel_b0_tn" false "" "" false "DOUBLE") + 
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TN}" "TR;B0" "gemm_small_kernel_b0_tr" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TN}" "CN;B0" "gemm_small_kernel_b0_cn" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TN}" "CR;B0" "gemm_small_kernel_b0_cr" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TT}" "TT;B0" "gemm_small_kernel_b0_tt" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TT}" "TC;B0" "gemm_small_kernel_b0_tc" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TT}" "CT;B0" "gemm_small_kernel_b0_ct" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TT}" "CC;B0" "gemm_small_kernel_b0_cc" false "" "" false "DOUBLE") + endif () endif () - if (BUILD_COMPLEX16 AND NOT BUILD_COMPLEX) + if (BUILD_COMPLEX16 AND NOT BUILD_SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SSCALKERNEL}" "" "scal_k" false "" "" false "SINGLE") + endif() + if (BUILD_COMPLEX160 AND NOT BUILD_COMPLEX) GenerateNamedObjects("${KERNELDIR}/${CAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "COMPLEX") GenerateNamedObjects("${KERNELDIR}/${CAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "COMPLEX") if (DEFINED CMAXKERNEL) @@ -1046,7 +1258,69 @@ endif () if (CGEMMOTCOPY) GenerateNamedObjects("${KERNELDIR}/${CGEMMOTCOPY}" "COMPLEX" "${CGEMMOTCOPYOBJ}" false "" "" true "COMPLEX") endif () - GenerateNamedObjects("${KERNELDIR}/${CGEMM_BETA}" "" "gemm_beta" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_BETA}" "" "gemm_beta" false "" "" false "COMPLEX") + if (SMALL_MATRIX_OPT) + if (NOT DEFINED CGEMM_SMALL_M_PERMIT) + set(CGEMM_SMALL_M_PERMIT ../generic/zgemm_small_matrix_permit) + endif () + if (NOT DEFINED CGEMM_SMALL_K_NN) + set(CGEMM_SMALL_K_NN ../generic/zgemm_small_matrix_kernel_nn) + endif () + if (NOT DEFINED CGEMM_SMALL_K_NT) + 
set(CGEMM_SMALL_K_NT ../generic/zgemm_small_matrix_kernel_nt) + endif () + if (NOT DEFINED CGEMM_SMALL_K_TN) + set(CGEMM_SMALL_K_TN ../generic/zgemm_small_matrix_kernel_tn) + endif () + if (NOT DEFINED CGEMM_SMALL_K_TT) + set(CGEMM_SMALL_K_TT ../generic/zgemm_small_matrix_kernel_tt) + endif () + if (NOT DEFINED CGEMM_SMALL_K_B0_NN) + set(CGEMM_SMALL_K_B0_NN ../generic/zgemm_small_matrix_kernel_nn) + endif () + if (NOT DEFINED CGEMM_SMALL_K_B0_NT) + set(CGEMM_SMALL_K_B0_NT ../generic/zgemm_small_matrix_kernel_nt) + endif () + if (NOT DEFINED CGEMM_SMALL_K_B0_TN) + set(CGEMM_SMALL_K_B0_TN ../generic/zgemm_small_matrix_kernel_tn) + endif () + if (NOT DEFINED CGEMM_SMALL_K_B0_TT) + set(CGEMM_SMALL_K_B0_TT ../generic/zgemm_small_matrix_kernel_tt) + endif () + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_M_PERMIT}.c" "" "gemm_small_matrix_permit" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_NN}.c" "NN" "gemm_small_kernel_nn" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_NN}.c" "NR" "gemm_small_kernel_nr" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_NN}.c" "RN" "gemm_small_kernel_rn" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_NN}.c" "RR" "gemm_small_kernel_rr" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_NT}.c" "NT" "gemm_small_kernel_nt" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_NT}.c" "NC" "gemm_small_kernel_nc" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_NT}.c" "RT" "gemm_small_kernel_rt" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_NT}.c" "RC" "gemm_small_kernel_rc" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_TN}.c" "TN" "gemm_small_kernel_tn" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_TN}.c" 
"TR" "gemm_small_kernel_tr" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_TN}.c" "CN" "gemm_small_kernel_cn" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_TN}.c" "CR" "gemm_small_kernel_cr" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_TT}.c" "TT" "gemm_small_kernel_tt" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_TT}.c" "TC" "gemm_small_kernel_tc" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_TT}.c" "CT" "gemm_small_kernel_ct" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_TT}.c" "CC" "gemm_small_kernel_cc" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_NN}.c" "NN;B0" "gemm_small_kernel_b0_nn" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_NN}.c" "NR;B0" "gemm_small_kernel_b0_nr" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_NN}.c" "RN;B0" "gemm_small_kernel_b0_rn" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_NN}.c" "RR;B0" "gemm_small_kernel_b0_rr" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_NT}.c" "NT;B0" "gemm_small_kernel_b0_nt" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_NT}.c" "NC;B0" "gemm_small_kernel_b0_nc" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_NT}.c" "RT;B0" "gemm_small_kernel_b0_rt" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_NT}.c" "RC;B0" "gemm_small_kernel_b0_rc" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_TN}.c" "TN;B0" "gemm_small_kernel_b0_tn" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_TN}.c" "TR;B0" "gemm_small_kernel_b0_tr" false "" 
"" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_TN}.c" "CN;B0" "gemm_small_kernel_b0_cn" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_TN}.c" "CR;B0" "gemm_small_kernel_b0_cr" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_TT}.c" "TT;B0" "gemm_small_kernel_b0_tt" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_TT}.c" "TC;B0" "gemm_small_kernel_b0_tc" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_TT}.c" "CT;B0" "gemm_small_kernel_b0_ct" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_TT}.c" "CC;B0" "gemm_small_kernel_b0_cc" false "" "" false "COMPLEX") + endif () GenerateNamedObjects("generic/ztrsm_uncopy_${CGEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" ${TSUFFIX} false "COMPLEX") GenerateNamedObjects("generic/ztrsm_uncopy_${CGEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" ${TSUFFIX} false "COMPLEX") GenerateNamedObjects("generic/ztrsm_uncopy_${CGEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" ${TSUFFIX} false "COMPLEX") diff --git a/kernel/Makefile b/kernel/Makefile index cbe4cde6e3..977886044a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -23,7 +23,7 @@ ifeq ($(C_COMPILER), CLANG) # Any clang posing as gcc 4.2 should be new enough (3.4 or later) GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2) ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) - AVX2OPT = -mavx2 + AVX2OPT = -mavx2 -mfma endif endif ifdef NO_AVX2 @@ -73,6 +73,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX) endif else ifeq ($(TARGET_CORE), HASWELL) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) +else ifeq ($(TARGET_CORE), ZEN) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) else ifeq ($(TARGET_CORE), LOONGSON3R4) override CFLAGS += -DBUILD_KERNEL 
-DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) else diff --git a/kernel/Makefile.L2 b/kernel/Makefile.L2 index ac53c29c34..0332ba722e 100644 --- a/kernel/Makefile.L2 +++ b/kernel/Makefile.L2 @@ -207,9 +207,12 @@ ifneq "$(or $(BUILD_SINGLE), $(BUILD_DOUBLE), $(BUILD_COMPLEX))" "" SBLASOBJS += \ sgemv_n$(TSUFFIX).$(SUFFIX) sgemv_t$(TSUFFIX).$(SUFFIX) endif +ifneq "$(or $(BUILD_SINGLE), $(BUILD_DOUBLE))" "" +SBLASOBJS += \ + ssymv_U$(TSUFFIX).$(SUFFIX) ssymv_L$(TSUFFIX).$(SUFFIX) +endif ifeq ($(BUILD_SINGLE),1) SBLASOBJS += \ - ssymv_U$(TSUFFIX).$(SUFFIX) ssymv_L$(TSUFFIX).$(SUFFIX) \ sger_k$(TSUFFIX).$(SUFFIX) endif ifeq ($(BUILD_DOUBLE),1) @@ -359,8 +362,7 @@ $(KDIR)xgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNE $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@ -ifeq ($(BUILD_SINGLE),1) - +ifneq "$(or (BUILD_SINGLE),$(BUILD_DOUBLE))" "" $(KDIR)ssymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_U_KERNEL) $(SSYMV_U_PARAM) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $@ diff --git a/kernel/alpha/amax.S b/kernel/alpha/amax.S index e528adc072..88635e8ec7 100644 --- a/kernel/alpha/amax.S +++ b/kernel/alpha/amax.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define N $16 #define X $17 diff --git a/kernel/alpha/asum.S b/kernel/alpha/asum.S index b312d064b9..54725b5cc6 100644 --- a/kernel/alpha/asum.S +++ b/kernel/alpha/asum.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define PREFETCHSIZE 88 diff --git a/kernel/alpha/axpy.S b/kernel/alpha/axpy.S index 1007b063b6..403b89df14 100644 --- a/kernel/alpha/axpy.S +++ b/kernel/alpha/axpy.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define PREFETCHSIZE 40 diff --git a/kernel/alpha/cabs.S b/kernel/alpha/cabs.S index 5fa27af53e..79b92836b7 100644 --- a/kernel/alpha/cabs.S +++ b/kernel/alpha/cabs.S @@ -38,7 +38,7 @@ #define ASSEMBLER 
#include "common.h" -#include "version.h" + .set noat .set noreorder diff --git a/kernel/alpha/cnrm2.S b/kernel/alpha/cnrm2.S index bd1ab8782d..445eaa7ea6 100644 --- a/kernel/alpha/cnrm2.S +++ b/kernel/alpha/cnrm2.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define PREFETCH_SIZE 80 diff --git a/kernel/alpha/copy.S b/kernel/alpha/copy.S index 749039c9ea..315a02b1e7 100644 --- a/kernel/alpha/copy.S +++ b/kernel/alpha/copy.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define N $16 #define X $17 diff --git a/kernel/alpha/cscal.S b/kernel/alpha/cscal.S index bba3137a9b..a09306a1c5 100644 --- a/kernel/alpha/cscal.S +++ b/kernel/alpha/cscal.S @@ -42,7 +42,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + .globl NAME .ent NAME diff --git a/kernel/alpha/dnrm2.S b/kernel/alpha/dnrm2.S index 0dfb64924b..c71a8e3c97 100644 --- a/kernel/alpha/dnrm2.S +++ b/kernel/alpha/dnrm2.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define PREFETCH_SIZE 80 diff --git a/kernel/alpha/dot.S b/kernel/alpha/dot.S index 330196c78f..fe84c719f0 100644 --- a/kernel/alpha/dot.S +++ b/kernel/alpha/dot.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define PREFETCHSIZE 88 diff --git a/kernel/alpha/gemm_beta.S b/kernel/alpha/gemm_beta.S index 44b2fada16..e234a3216f 100644 --- a/kernel/alpha/gemm_beta.S +++ b/kernel/alpha/gemm_beta.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + .set noat .set noreorder diff --git a/kernel/alpha/gemm_kernel_4x4.S b/kernel/alpha/gemm_kernel_4x4.S index c55d817df4..8fda1ab5a4 100644 --- a/kernel/alpha/gemm_kernel_4x4.S +++ b/kernel/alpha/gemm_kernel_4x4.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #if !defined(EV4) && !defined(EV5) && !defined(EV6) #error "Architecture is not specified." 
diff --git a/kernel/alpha/gemv_n.S b/kernel/alpha/gemv_n.S index 3e9d1d7fb3..0fcd5b8655 100644 --- a/kernel/alpha/gemv_n.S +++ b/kernel/alpha/gemv_n.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define STACKSIZE 64 #define PREFETCHSIZE 32 diff --git a/kernel/alpha/gemv_t.S b/kernel/alpha/gemv_t.S index ea95546e87..f9432486f9 100644 --- a/kernel/alpha/gemv_t.S +++ b/kernel/alpha/gemv_t.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define STACKSIZE 64 #define PREFETCHSIZE 32 diff --git a/kernel/alpha/iamax.S b/kernel/alpha/iamax.S index 2be5d5d08c..384df07e6b 100644 --- a/kernel/alpha/iamax.S +++ b/kernel/alpha/iamax.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define N $16 #define X $17 diff --git a/kernel/alpha/imax.S b/kernel/alpha/imax.S index d8958c86a2..7857510757 100644 --- a/kernel/alpha/imax.S +++ b/kernel/alpha/imax.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define N $16 #define X $17 diff --git a/kernel/alpha/izamax.S b/kernel/alpha/izamax.S index c932581aee..d85b909e1c 100644 --- a/kernel/alpha/izamax.S +++ b/kernel/alpha/izamax.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define N $16 #define X $17 diff --git a/kernel/alpha/lsame.S b/kernel/alpha/lsame.S index 082f790829..b1a7d5b706 100644 --- a/kernel/alpha/lsame.S +++ b/kernel/alpha/lsame.S @@ -36,7 +36,7 @@ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ -#include "version.h" + .set noat .set noreorder diff --git a/kernel/alpha/max.S b/kernel/alpha/max.S index af1b8fb850..935f277184 100644 --- a/kernel/alpha/max.S +++ b/kernel/alpha/max.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define N $16 #define X $17 diff --git a/kernel/alpha/rot.S b/kernel/alpha/rot.S index d1656d7e3e..7a0991015d 100644 --- a/kernel/alpha/rot.S +++ b/kernel/alpha/rot.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define N $16 #define X $17 diff --git a/kernel/alpha/scal.S b/kernel/alpha/scal.S index 2d95801c83..db69595208 100644 --- a/kernel/alpha/scal.S +++ b/kernel/alpha/scal.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define PREFETCHSIZE 88 diff --git a/kernel/alpha/snrm2.S b/kernel/alpha/snrm2.S index 0dfb64924b..c71a8e3c97 100644 --- a/kernel/alpha/snrm2.S +++ b/kernel/alpha/snrm2.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define PREFETCH_SIZE 80 diff --git a/kernel/alpha/sum.S b/kernel/alpha/sum.S index 3902817a70..adc4ca5a11 100644 --- a/kernel/alpha/sum.S +++ b/kernel/alpha/sum.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define PREFETCHSIZE 88 diff --git a/kernel/alpha/swap.S b/kernel/alpha/swap.S index 9e21990c44..34e58a72ad 100644 --- a/kernel/alpha/swap.S +++ b/kernel/alpha/swap.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + PROLOGUE PROFCODE diff --git a/kernel/alpha/trsm_kernel_4x4_LN.S b/kernel/alpha/trsm_kernel_4x4_LN.S index 600b4e255d..be5062244f 100644 --- a/kernel/alpha/trsm_kernel_4x4_LN.S +++ b/kernel/alpha/trsm_kernel_4x4_LN.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #if !defined(EV4) && !defined(EV5) && !defined(EV6) #error "Architecture is not specified." 
diff --git a/kernel/alpha/trsm_kernel_4x4_LT.S b/kernel/alpha/trsm_kernel_4x4_LT.S index 81436d0342..dfc7e98aaf 100644 --- a/kernel/alpha/trsm_kernel_4x4_LT.S +++ b/kernel/alpha/trsm_kernel_4x4_LT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #if !defined(EV4) && !defined(EV5) && !defined(EV6) #error "Architecture is not specified." diff --git a/kernel/alpha/trsm_kernel_4x4_RT.S b/kernel/alpha/trsm_kernel_4x4_RT.S index 71d6c43fad..d77ccc61bd 100644 --- a/kernel/alpha/trsm_kernel_4x4_RT.S +++ b/kernel/alpha/trsm_kernel_4x4_RT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #if !defined(EV4) && !defined(EV5) && !defined(EV6) #error "Architecture is not specified." diff --git a/kernel/alpha/zamax.S b/kernel/alpha/zamax.S index f1ea18d2df..96502a7a93 100644 --- a/kernel/alpha/zamax.S +++ b/kernel/alpha/zamax.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define N $16 #define X $17 diff --git a/kernel/alpha/zasum.S b/kernel/alpha/zasum.S index 67ed785846..37a1c234a7 100644 --- a/kernel/alpha/zasum.S +++ b/kernel/alpha/zasum.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define PREFETCHSIZE 88 diff --git a/kernel/alpha/zaxpy.S b/kernel/alpha/zaxpy.S index 1416769a15..1494c7fc02 100644 --- a/kernel/alpha/zaxpy.S +++ b/kernel/alpha/zaxpy.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define PREFETCHSIZE 40 diff --git a/kernel/alpha/zdot.S b/kernel/alpha/zdot.S index 78dcae6681..724526407f 100644 --- a/kernel/alpha/zdot.S +++ b/kernel/alpha/zdot.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define PREFETCHSIZE 88 diff --git a/kernel/alpha/zgemm_beta.S b/kernel/alpha/zgemm_beta.S index f7ca347f13..fcabe48d09 100644 --- a/kernel/alpha/zgemm_beta.S +++ b/kernel/alpha/zgemm_beta.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + .set 
noat .set noreorder diff --git a/kernel/alpha/zgemm_kernel_2x2.S b/kernel/alpha/zgemm_kernel_2x2.S index 67ba6d1087..e56a3e10d6 100644 --- a/kernel/alpha/zgemm_kernel_2x2.S +++ b/kernel/alpha/zgemm_kernel_2x2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #if !defined(EV4) && !defined(EV5) && !defined(EV6) #error "Architecture is not specified." diff --git a/kernel/alpha/zgemv_n.S b/kernel/alpha/zgemv_n.S index fd602a3eb2..2ebb918d57 100644 --- a/kernel/alpha/zgemv_n.S +++ b/kernel/alpha/zgemv_n.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define STACKSIZE 64 #define PREFETCHSIZE 32 diff --git a/kernel/alpha/zgemv_t.S b/kernel/alpha/zgemv_t.S index bac56eb3fe..96d8caa274 100644 --- a/kernel/alpha/zgemv_t.S +++ b/kernel/alpha/zgemv_t.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define STACKSIZE 64 #define PREFETCHSIZE 32 diff --git a/kernel/alpha/znrm2.S b/kernel/alpha/znrm2.S index bd1ab8782d..445eaa7ea6 100644 --- a/kernel/alpha/znrm2.S +++ b/kernel/alpha/znrm2.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define PREFETCH_SIZE 80 diff --git a/kernel/alpha/zrot.S b/kernel/alpha/zrot.S index afcdf12b4d..61fe4f3d9c 100644 --- a/kernel/alpha/zrot.S +++ b/kernel/alpha/zrot.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define N $16 #define X $17 diff --git a/kernel/alpha/zscal.S b/kernel/alpha/zscal.S index 1a2ac10b32..bed3033f8f 100644 --- a/kernel/alpha/zscal.S +++ b/kernel/alpha/zscal.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define PREFETCHSIZE 88 diff --git a/kernel/alpha/zsum.S b/kernel/alpha/zsum.S index 1ad0eb137d..5c51bbc6f2 100644 --- a/kernel/alpha/zsum.S +++ b/kernel/alpha/zsum.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #define PREFETCHSIZE 88 diff --git a/kernel/alpha/zswap.S 
b/kernel/alpha/zswap.S index a12a2c7a73..02be94115a 100644 --- a/kernel/alpha/zswap.S +++ b/kernel/alpha/zswap.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + PROLOGUE PROFCODE diff --git a/kernel/alpha/ztrsm_kernel_2x2_LN.S b/kernel/alpha/ztrsm_kernel_2x2_LN.S index dcbe4e2365..44d46daa72 100644 --- a/kernel/alpha/ztrsm_kernel_2x2_LN.S +++ b/kernel/alpha/ztrsm_kernel_2x2_LN.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #if !defined(EV4) && !defined(EV5) && !defined(EV6) #error "Architecture is not specified." diff --git a/kernel/alpha/ztrsm_kernel_2x2_LT.S b/kernel/alpha/ztrsm_kernel_2x2_LT.S index e0c82026e6..f17987faf2 100644 --- a/kernel/alpha/ztrsm_kernel_2x2_LT.S +++ b/kernel/alpha/ztrsm_kernel_2x2_LT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #if !defined(EV4) && !defined(EV5) && !defined(EV6) #error "Architecture is not specified." diff --git a/kernel/alpha/ztrsm_kernel_2x2_RT.S b/kernel/alpha/ztrsm_kernel_2x2_RT.S index e890f599d6..90b56c9542 100644 --- a/kernel/alpha/ztrsm_kernel_2x2_RT.S +++ b/kernel/alpha/ztrsm_kernel_2x2_RT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" -#include "version.h" + #if !defined(EV4) && !defined(EV5) && !defined(EV6) #error "Architecture is not specified." 
diff --git a/kernel/arm64/KERNEL.NEOVERSEN1 b/kernel/arm64/KERNEL.NEOVERSEN1 index ea010db426..9a5938459b 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN1 +++ b/kernel/arm64/KERNEL.NEOVERSEN1 @@ -96,8 +96,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c CNRM2KERNEL = scnrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 index 07a94a0435..cabacad46e 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN2 +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -96,8 +96,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c CNRM2KERNEL = scnrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S @@ -190,10 +190,10 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) SBGEMM_BETA = sbgemm_beta_neoversen2.c SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversen2.c -SBGEMMINCOPY = sbgemm_ncopy_neoversen2.c -SBGEMMITCOPY = sbgemm_tcopy_neoversen2.c -SBGEMMONCOPY = sbgemm_ncopy_neoversen2.c -SBGEMMOTCOPY = sbgemm_tcopy_neoversen2.c +SBGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_M)_neoversen2.c +SBGEMMITCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversen2.c +SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversen2.c +SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_N)_neoversen2.c SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 index ea010db426..9a5938459b 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -96,8 +96,8 @@ DNRM2KERNEL = 
dznrm2_thunderx2t99.c CNRM2KERNEL = scnrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S diff --git a/kernel/arm64/KERNEL.THUNDERX2T99 b/kernel/arm64/KERNEL.THUNDERX2T99 index a20d0d4a6d..41cedc8519 100644 --- a/kernel/arm64/KERNEL.THUNDERX2T99 +++ b/kernel/arm64/KERNEL.THUNDERX2T99 @@ -161,8 +161,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S diff --git a/kernel/arm64/KERNEL.THUNDERX3T110 b/kernel/arm64/KERNEL.THUNDERX3T110 index a20d0d4a6d..41cedc8519 100644 --- a/kernel/arm64/KERNEL.THUNDERX3T110 +++ b/kernel/arm64/KERNEL.THUNDERX3T110 @@ -161,8 +161,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S diff --git a/kernel/arm64/dgemm_kernel_sve_v2x8.S b/kernel/arm64/dgemm_kernel_sve_v2x8.S index 023d5ba92b..d978a3315b 100644 --- a/kernel/arm64/dgemm_kernel_sve_v2x8.S +++ b/kernel/arm64/dgemm_kernel_sve_v2x8.S @@ -189,20 +189,16 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. 
*/ ld1rd z11.d, p0/z, [pB, 24] fmla z24.d, p0/m, z0.d, z12.d fmla z25.d, p0/m, z1.d, z12.d - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] ld1rd z12.d, p0/z, [pB, 32] fmla z26.d, p0/m, z0.d, z13.d fmla z27.d, p0/m, z1.d, z13.d - prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] ld1rd z13.d, p0/z, [pB, 40] fmla z28.d, p0/m, z0.d, z14.d fmla z29.d, p0/m, z1.d, z14.d ld1rd z14.d, p0/z, [pB, 48] fmla z30.d, p0/m, z0.d, z15.d fmla z31.d, p0/m, z1.d, z15.d - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] ld1rd z15.d, p0/z, [pB, 56] - prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] add pB, pB, 64 .endm @@ -227,19 +223,15 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ ld1rd z11.d, p0/z, [pB, 24] fmla z24.d, p0/m, z0.d, z12.d fmla z25.d, p0/m, z1.d, z12.d - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] ld1rd z12.d, p0/z, [pB, 32] fmla z26.d, p0/m, z0.d, z13.d fmla z27.d, p0/m, z1.d, z13.d - prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] ld1rd z13.d, p0/z, [pB, 40] fmla z28.d, p0/m, z0.d, z14.d fmla z29.d, p0/m, z1.d, z14.d ld1rd z14.d, p0/z, [pB, 48] fmla z30.d, p0/m, z0.d, z15.d - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] fmla z31.d, p0/m, z1.d, z15.d - prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] ld1rd z15.d, p0/z, [pB, 56] add pB, pB, 64 @@ -265,7 +257,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ ld1rd z11.d, p0/z, [pB, 24] fmla z24.d, p0/m, z2.d, z12.d fmla z25.d, p0/m, z3.d, z12.d - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] ld1rd z12.d, p0/z, [pB, 32] fmla z26.d, p0/m, z2.d, z13.d fmla z27.d, p0/m, z3.d, z13.d @@ -291,7 +282,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ fmla z23.d, p0/m, z3.d, z11.d fmla z24.d, p0/m, z2.d, z12.d fmla z25.d, p0/m, z3.d, z12.d - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla z26.d, p0/m, z2.d, z13.d fmla z27.d, p0/m, z3.d, z13.d fmla z28.d, p0/m, z2.d, z14.d @@ -322,25 +312,21 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. 
*/ fmla z18.d, p0/m, z0.d, z9.d fmla z19.d, p0/m, z1.d, z9.d fmla z20.d, p0/m, z0.d, z10.d - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] fmla z21.d, p0/m, z1.d, z10.d fmla z22.d, p0/m, z0.d, z11.d fmla z23.d, p0/m, z1.d, z11.d fmla z24.d, p0/m, z0.d, z12.d - prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] fmla z25.d, p0/m, z1.d, z12.d fmla z26.d, p0/m, z0.d, z13.d fmla z27.d, p0/m, z1.d, z13.d fmla z28.d, p0/m, z0.d, z14.d fmla z29.d, p0/m, z1.d, z14.d - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla z30.d, p0/m, z0.d, z15.d fmla z31.d, p0/m, z1.d, z15.d .endm .macro SAVEv2x8 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC ld1d z8.d, p0/z, [pCRow0] @@ -349,7 +335,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ fmla z9.d, p0/m, z17.d, alphaZ st1d z8.d, p0, [pCRow0] st1d z9.d, p0, [pCRow0, #1, mul vl] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1d z10.d, p0/z, [pCRow1] @@ -358,7 +343,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ fmla z11.d, p0/m, z19.d, alphaZ st1d z10.d, p0, [pCRow1] st1d z11.d, p0, [pCRow1, #1, mul vl] - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1d z12.d, p0/z, [pCRow2] @@ -367,7 +351,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ fmla z13.d, p0/m, z21.d, alphaZ st1d z12.d, p0, [pCRow2] st1d z13.d, p0, [pCRow2, #1, mul vl] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1d z14.d, p0/z, [pCRow1] @@ -376,7 +359,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ fmla z15.d, p0/m, z23.d, alphaZ st1d z14.d, p0, [pCRow1] st1d z15.d, p0, [pCRow1, #1, mul vl] - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1d z8.d, p0/z, [pCRow2] @@ -385,7 +367,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. 
*/ fmla z9.d, p0/m, z25.d, alphaZ st1d z8.d, p0, [pCRow2] st1d z9.d, p0, [pCRow2, #1, mul vl] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1d z10.d, p0/z, [pCRow1] @@ -394,7 +375,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ fmla z11.d, p0/m, z27.d, alphaZ st1d z10.d, p0, [pCRow1] st1d z11.d, p0, [pCRow1, #1, mul vl] - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1d z12.d, p0/z, [pCRow2] @@ -403,7 +383,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ fmla z13.d, p0/m, z29.d, alphaZ st1d z12.d, p0, [pCRow2] st1d z13.d, p0, [pCRow2, #1, mul vl] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld1d z14.d, p0/z, [pCRow1] ld1d z15.d, p0/z, [pCRow1, #1, mul vl] @@ -443,10 +422,8 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ fmla z16.d, p0/m, z0.d, z8.d fmla z17.d, p0/m, z1.d, z8.d fmla z18.d, p0/m, z0.d, z9.d - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] fmla z19.d, p0/m, z1.d, z9.d fmla z20.d, p0/m, z0.d, z10.d - prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] fmla z21.d, p0/m, z1.d, z10.d fmla z22.d, p0/m, z0.d, z11.d fmla z23.d, p0/m, z1.d, z11.d @@ -454,7 +431,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ .macro SAVEv2x4 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC ld1d z8.d, p0/z, [pCRow0] @@ -463,7 +439,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ fmla z9.d, p0/m, z17.d, alphaZ st1d z8.d, p0, [pCRow0] st1d z9.d, p0, [pCRow0, #1, mul vl] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1d z10.d, p0/z, [pCRow1] @@ -472,7 +447,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. 
*/ fmla z11.d, p0/m, z19.d, alphaZ st1d z10.d, p0, [pCRow1] st1d z11.d, p0, [pCRow1, #1, mul vl] - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1d z12.d, p0/z, [pCRow2] @@ -481,7 +455,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ fmla z13.d, p0/m, z21.d, alphaZ st1d z12.d, p0, [pCRow2] st1d z13.d, p0, [pCRow2, #1, mul vl] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld1d z14.d, p0/z, [pCRow1] ld1d z15.d, p0/z, [pCRow1, #1, mul vl] @@ -514,15 +487,12 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ fmla z16.d, p0/m, z0.d, z8.d fmla z17.d, p0/m, z1.d, z8.d - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] fmla z18.d, p0/m, z0.d, z9.d fmla z19.d, p0/m, z1.d, z9.d - prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] .endm .macro SAVEv2x2 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC ld1d z8.d, p0/z, [pCRow0] @@ -531,7 +501,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ fmla z9.d, p0/m, z17.d, alphaZ st1d z8.d, p0, [pCRow0] st1d z9.d, p0, [pCRow0, #1, mul vl] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld1d z10.d, p0/z, [pCRow1] ld1d z11.d, p0/z, [pCRow1, #1, mul vl] @@ -539,7 +508,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ fmla z11.d, p0/m, z19.d, alphaZ st1d z10.d, p0, [pCRow1] st1d z11.d, p0, [pCRow1, #1, mul vl] - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] @@ -563,12 +531,10 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ fmla z16.d, p0/m, z0.d, z8.d fmla z17.d, p0/m, z1.d, z8.d - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] .endm .macro SAVEv2x1 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC ld1d z8.d, p0/z, [pCRow0] @@ -618,14 +584,12 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. 
*/ fmla z19.d, p1/m, z0.d, z11.d ld1rd z11.d, p0/z, [pB, 24] fmla z20.d, p1/m, z0.d, z12.d - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] ld1rd z12.d, p0/z, [pB, 32] fmla z21.d, p1/m, z0.d, z13.d ld1rd z13.d, p0/z, [pB, 40] fmla z22.d, p1/m, z0.d, z14.d ld1rd z14.d, p0/z, [pB, 48] fmla z23.d, p1/m, z0.d, z15.d - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] ld1rd z15.d, p0/z, [pB, 56] add pB, pB, 64 @@ -644,14 +608,12 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ fmla z19.d, p1/m, z0.d, z11.d ld1rd z11.d, p0/z, [pB, 24] fmla z20.d, p1/m, z0.d, z12.d - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] ld1rd z12.d, p0/z, [pB, 32] fmla z21.d, p1/m, z0.d, z13.d ld1rd z13.d, p0/z, [pB, 40] fmla z22.d, p1/m, z0.d, z14.d ld1rd z14.d, p0/z, [pB, 48] fmla z23.d, p1/m, z0.d, z15.d - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] ld1rd z15.d, p0/z, [pB, 56] add pB, pB, 64 @@ -671,7 +633,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ ld1rd z11.d, p0/z, [pB, 24] fmla z20.d, p1/m, z1.d, z12.d ld1rd z12.d, p0/z, [pB, 32] - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla z21.d, p1/m, z1.d, z13.d ld1rd z13.d, p0/z, [pB, 40] fmla z22.d, p1/m, z1.d, z14.d @@ -688,7 +649,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ fmla z18.d, p1/m, z1.d, z10.d fmla z19.d, p1/m, z1.d, z11.d fmla z20.d, p1/m, z1.d, z12.d - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla z21.d, p1/m, z1.d, z13.d fmla z22.d, p1/m, z1.d, z14.d fmla z23.d, p1/m, z1.d, z15.d @@ -712,11 +672,9 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ fmla z16.d, p1/m, z0.d, z8.d fmla z17.d, p1/m, z0.d, z9.d fmla z18.d, p1/m, z0.d, z10.d - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] fmla z19.d, p1/m, z0.d, z11.d fmla z20.d, p1/m, z0.d, z12.d fmla z21.d, p1/m, z0.d, z13.d - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla z22.d, p1/m, z0.d, z14.d fmla z23.d, p1/m, z0.d, z15.d @@ -725,49 +683,41 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. 
*/ .macro SAVEv1x8 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC ld1d z24.d, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaZ st1d z24.d, p1, [pCRow0] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1d z25.d, p1/z, [pCRow1] fmla z25.d, p1/m, z17.d, alphaZ st1d z25.d, p1, [pCRow1] - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1d z26.d, p1/z, [pCRow2] fmla z26.d, p1/m, z18.d, alphaZ st1d z26.d, p1, [pCRow2] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1d z27.d, p1/z, [pCRow1] fmla z27.d, p1/m, z19.d, alphaZ st1d z27.d, p1, [pCRow1] - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1d z28.d, p1/z, [pCRow2] fmla z28.d, p1/m, z20.d, alphaZ st1d z28.d, p1, [pCRow2] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1d z29.d, p1/z, [pCRow1] fmla z29.d, p1/m, z21.d, alphaZ st1d z29.d, p1, [pCRow1] - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1d z30.d, p1/z, [pCRow2] fmla z30.d, p1/m, z22.d, alphaZ st1d z30.d, p1, [pCRow2] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld1d z31.d, p1/z, [pCRow1] fmla z31.d, p1/m, z23.d, alphaZ @@ -799,7 +749,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ fmla z16.d, p1/m, z0.d, z8.d fmla z17.d, p1/m, z0.d, z9.d - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] fmla z18.d, p1/m, z0.d, z10.d fmla z19.d, p1/m, z0.d, z11.d @@ -807,25 +756,21 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. 
*/ .macro SAVEv1x4 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC ld1d z24.d, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaZ st1d z24.d, p1, [pCRow0] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1d z25.d, p1/z, [pCRow1] fmla z25.d, p1/m, z17.d, alphaZ st1d z25.d, p1, [pCRow1] - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1d z26.d, p1/z, [pCRow2] fmla z26.d, p1/m, z18.d, alphaZ st1d z26.d, p1, [pCRow2] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld1d z27.d, p1/z, [pCRow1] fmla z27.d, p1/m, z19.d, alphaZ @@ -852,20 +797,17 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ add pB, pB, 16 fmla z16.d, p1/m, z0.d, z8.d - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] fmla z17.d, p1/m, z0.d, z9.d .endm .macro SAVEv1x2 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC ld1d z24.d, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaZ st1d z24.d, p1, [pCRow0] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld1d z25.d, p1/z, [pCRow1] fmla z25.d, p1/m, z17.d, alphaZ @@ -890,13 +832,11 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ add pB, pB, 8 fmla z16.d, p1/m, z0.d, z8.d - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] .endm .macro SAVEv1x1 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] ld1d z24.d, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaZ @@ -928,8 +868,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - prfm PLDL1KEEP, [origPB] - prfm PLDL1KEEP, [origPA] fmov alpha, d0 dup alphaZ, alpha @@ -968,7 +906,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. 
*/ /* Until we have at least 2*SVE_LEN iters left in M, we do them with V2*8 kernel */ mul temp, vec_len, origK // generate address of pA2 add pA2, pA1, temp, lsl #3 // pA1 = start of A array - prfm PLDL1KEEP, [pA2] .align 5 .Ldgemm_kernel_L8_Mv2_20: @@ -1057,11 +994,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ bne .Ldgemm_kernel_L8_Mv2_46 .Ldgemm_kernel_L8_Mv2_100: - prfm PLDL1KEEP, [pA1] - prfm PLDL1KEEP, [pA1, #64] - prfm PLDL1KEEP, [pA2] - prfm PLDL1KEEP, [pA2, #64] - prfm PLDL1KEEP, [origPB] SAVEv2x8 mov pA1, pA2 // pA1 = pA2 @@ -1171,9 +1103,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ bne .Ldgemm_kernel_L8_Mv1_46 .Ldgemm_kernel_L8_Mv1_100: - prfm PLDL1KEEP, [pA1] - prfm PLDL1KEEP, [pA1, #64] - prfm PLDL1KEEP, [origPB] SAVEv1x8 @@ -1233,16 +1162,12 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ .align 5 .Ldgemm_kernel_L4_Mv2_22: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv2x4_SUB KERNELv2x4_SUB - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv2x4_SUB KERNELv2x4_SUB - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv2x4_SUB KERNELv2x4_SUB - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv2x4_SUB KERNELv2x4_SUB @@ -1257,18 +1182,12 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ .align 5 .Ldgemm_kernel_L4_Mv2_46: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv2x4_SUB subs counterL, counterL, #1 bne .Ldgemm_kernel_L4_Mv2_46 .Ldgemm_kernel_L4_Mv2_100: - prfm PLDL1KEEP, [pA1] - prfm PLDL1KEEP, [pA1, #64] - prfm PLDL1KEEP, [pA2] - prfm PLDL1KEEP, [pA2, #64] - prfm PLDL1KEEP, [origPB] SAVEv2x4 mov pA1, pA2 // pA1 = pA2 @@ -1304,16 +1223,12 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. 
*/ .align 5 .Ldgemm_kernel_L4_Mv1_22: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x4_SUB KERNELv1x4_SUB - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x4_SUB KERNELv1x4_SUB - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x4_SUB KERNELv1x4_SUB - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x4_SUB KERNELv1x4_SUB @@ -1328,16 +1243,12 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ .align 5 .Ldgemm_kernel_L4_Mv1_46: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x4_SUB subs counterL, counterL, #1 bne .Ldgemm_kernel_L4_Mv1_46 .Ldgemm_kernel_L4_Mv1_100: - prfm PLDL1KEEP, [pA1] - prfm PLDL1KEEP, [pA1, #64] - prfm PLDL1KEEP, [origPB] SAVEv1x4 @@ -1393,12 +1304,10 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ .align 5 .Ldgemm_kernel_L2_Mv2_22: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv2x2_SUB KERNELv2x2_SUB KERNELv2x2_SUB KERNELv2x2_SUB - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv2x2_SUB KERNELv2x2_SUB KERNELv2x2_SUB @@ -1415,18 +1324,12 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ .align 5 .Ldgemm_kernel_L2_Mv2_46: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv2x2_SUB subs counterL, counterL, #1 bne .Ldgemm_kernel_L2_Mv2_46 .Ldgemm_kernel_L2_Mv2_100: - prfm PLDL1KEEP, [pA1] - prfm PLDL1KEEP, [pA1, #64] - prfm PLDL1KEEP, [pA2] - prfm PLDL1KEEP, [pA2, #64] - prfm PLDL1KEEP, [origPB] SAVEv2x2 mov pA1, pA2 // pA1 = pA2 @@ -1463,12 +1366,10 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ .align 5 .Ldgemm_kernel_L2_Mv1_22: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x2_SUB KERNELv1x2_SUB KERNELv1x2_SUB KERNELv1x2_SUB - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x2_SUB KERNELv1x2_SUB KERNELv1x2_SUB @@ -1485,16 +1386,12 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. 
*/ .align 5 .Ldgemm_kernel_L2_Mv1_46: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x2_SUB subs counterL, counterL, #1 bne .Ldgemm_kernel_L2_Mv1_46 .Ldgemm_kernel_L2_Mv1_100: - prfm PLDL1KEEP, [pA1] - prfm PLDL1KEEP, [pA1, #64] - prfm PLDL1KEEP, [origPB] SAVEv1x2 @@ -1550,7 +1447,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ .align 5 .Ldgemm_kernel_L1_Mv2_22: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv2x1_SUB KERNELv2x1_SUB KERNELv2x1_SUB @@ -1571,16 +1467,12 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ .align 5 .Ldgemm_kernel_L1_Mv2_46: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv2x1_SUB subs counterL, counterL, #1 bgt .Ldgemm_kernel_L1_Mv2_46 .Ldgemm_kernel_L1_Mv2_100: - prfm PLDL1KEEP, [pA1] - prfm PLDL1KEEP, [pA1, #64] - prfm PLDL1KEEP, [origPB] SAVEv2x1 mov pA1, pA2 // pA1 = pA2 @@ -1617,7 +1509,6 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ .align 5 .Ldgemm_kernel_L1_Mv1_22: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x1_SUB KERNELv1x1_SUB KERNELv1x1_SUB @@ -1638,16 +1529,12 @@ With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ .align 5 .Ldgemm_kernel_L1_Mv1_46: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x1_SUB subs counterL, counterL, #1 bgt .Ldgemm_kernel_L1_Mv1_46 .Ldgemm_kernel_L1_Mv1_100: - prfm PLDL1KEEP, [pA1] - prfm PLDL1KEEP, [pA1, #64] - prfm PLDL1KEEP, [origPB] SAVEv1x1 diff --git a/kernel/arm64/dot.c b/kernel/arm64/dot.c new file mode 100644 index 0000000000..4607ebc59c --- /dev/null +++ b/kernel/arm64/dot.c @@ -0,0 +1,121 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2022, Arm Ltd +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +// Some compilers will report feature support for SVE without the appropriate +// header available +#ifdef HAVE_SVE +#if defined __has_include +#if __has_include() && __ARM_FEATURE_SVE +#define USE_SVE +#endif +#endif +#endif + +#ifdef USE_SVE +#include "dot_kernel_sve.c" +#endif +#include "dot_kernel_asimd.c" + +#if defined(SMP) +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, + BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, + void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif + +static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + RETURN_TYPE dot = 0.0 ; + + if ( n <= 0 ) return dot; + +#ifdef USE_SVE + if (inc_x == 1 && inc_y == 1) { + return dot_kernel_sve(n, x, y); + } +#endif + + return dot_kernel_asimd(n, x, inc_x, y, inc_y); +} + +#if defined(SMP) +static int dot_thread_function(BLASLONG n, BLASLONG dummy0, + BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) +{ + *(RETURN_TYPE *)result = dot_compute(n, x, inc_x, y, inc_y); + + return 0; +} +#endif + +RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; +#endif + RETURN_TYPE dot = 0.0; + +#if defined(SMP) + if (inc_x == 0 || inc_y == 0 || n <= 10000) + nthreads = 1; + else + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { + dot = dot_compute(n, x, inc_x, y, inc_y); + } else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) * 2]; + RETURN_TYPE *ptr; + +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_REAL; +#else + mode = BLAS_DOUBLE | BLAS_REAL; +#endif + + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, + x, inc_x, y, inc_y, result, 0, + ( void *)dot_thread_function, nthreads); + + ptr = 
(RETURN_TYPE *)result; + for (i = 0; i < nthreads; i++) { + dot = dot + (*ptr); + ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2); + } + } +#else + dot = dot_compute(n, x, inc_x, y, inc_y); +#endif + + return dot; +} diff --git a/kernel/arm64/dot_thunderx2t99.c b/kernel/arm64/dot_kernel_asimd.c similarity index 53% rename from kernel/arm64/dot_thunderx2t99.c rename to kernel/arm64/dot_kernel_asimd.c index 3940acdddc..1288838f87 100644 --- a/kernel/arm64/dot_thunderx2t99.c +++ b/kernel/arm64/dot_kernel_asimd.c @@ -1,5 +1,6 @@ /*************************************************************************** Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2022, Arm Ltd All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -36,25 +37,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define RETURN_TYPE double #endif -#define N "x0" /* vector length */ -#define X "x1" /* "X" vector address */ -#define INC_X "x2" /* "X" stride */ -#define Y "x3" /* "Y" vector address */ -#define INC_Y "x4" /* "Y" stride */ -#define J "x5" /* loop variable */ - #if !defined(DOUBLE) #if !defined(DSDOT) +#define DOT_MOD "s" #define REG0 "wzr" -#define DOTF "s0" #define TMPX "s16" #define TMPY "s24" #define INC_SHIFT "2" #define N_DIV_SHIFT "6" #define N_REM_MASK "63" #else +#define DOT_MOD "d" #define REG0 "xzr" -#define DOTF "d0" #define TMPX "s16" #define TMPX1 "d2" #define TMPY "s24" @@ -64,8 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N_REM_MASK "15" #endif #else +#define DOT_MOD "d" #define REG0 "xzr" -#define DOTF "d0" #define TMPX "d16" #define TMPY "d24" #define INC_SHIFT "3" @@ -73,59 +67,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define N_REM_MASK "31" #endif +#define OUT "%"DOT_MOD"[DOT_]" + #if !defined(DOUBLE) #if !defined(DSDOT) #define KERNEL_F1 \ - " ldr "TMPX", ["X"] \n" \ - " ldr "TMPY", ["Y"] \n" \ - " add "X", "X", "INC_X" \n" \ - " add "Y", "Y", "INC_Y" \n" \ - " fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n" + " ldr "TMPX", [%[X_]] \n" \ + " ldr "TMPY", [%[Y_]] \n" \ + " add %[X_], %[X_], %[INCX_] \n" \ + " add %[Y_], %[Y_], %[INCY_] \n" \ + " fmadd "OUT", "TMPX", "TMPY", "OUT" \n" #define KERNEL_F \ - " ldp q16, q17, ["X"] \n" \ - " ldp q24, q25, ["Y"] \n" \ - " ldp q18, q19, ["X", #32] \n" \ - " ldp q26, q27, ["Y", #32] \n" \ + " ldp q16, q17, [%[X_]] \n" \ + " ldp q24, q25, [%[Y_]] \n" \ + " ldp q18, q19, [%[X_], #32] \n" \ + " ldp q26, q27, [%[Y_], #32] \n" \ " fmla v0.4s, v16.4s, v24.4s \n" \ " fmla v1.4s, v17.4s, v25.4s \n" \ - " ldp q20, q21, ["X", #64] \n" \ - " ldp q28, q29, ["Y", #64] \n" \ + " ldp q20, q21, [%[X_], #64] \n" \ + " ldp q28, q29, [%[Y_], #64] \n" \ " fmla v2.4s, v18.4s, v26.4s \n" \ " fmla v3.4s, v19.4s, v27.4s \n" \ - " ldp q22, q23, ["X", #96] \n" \ - " ldp q30, q31, ["Y", #96] \n" \ - " add "Y", "Y", #128 \n" \ - " add "X", "X", #128 \n" \ + " ldp q22, q23, [%[X_], #96] \n" \ + " ldp q30, q31, [%[Y_], #96] \n" \ + " add %[Y_], %[Y_], #128 \n" \ + " add %[X_], %[X_], #128 \n" \ " fmla v4.4s, v20.4s, v28.4s \n" \ " fmla v5.4s, v21.4s, v29.4s \n" \ - " PRFM PLDL1KEEP, ["X", #896] \n" \ - " PRFM PLDL1KEEP, ["Y", #896] \n" \ - " PRFM PLDL1KEEP, ["X", #896+64] \n" \ - " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ " fmla v6.4s, v22.4s, v30.4s \n" \ " fmla v7.4s, v23.4s, v31.4s \n" \ - " ldp q16, q17, ["X"] \n" \ - " ldp q24, q25, ["Y"] \n" \ - " ldp q18, q19, ["X", #32] \n" \ - " ldp q26, q27, ["Y", #32] \n" \ + " ldp q16, q17, [%[X_]] \n" \ + " ldp q24, q25, [%[Y_]] \n" \ + " ldp q18, q19, [%[X_], #32] \n" 
\ + " ldp q26, q27, [%[Y_], #32] \n" \ " fmla v0.4s, v16.4s, v24.4s \n" \ " fmla v1.4s, v17.4s, v25.4s \n" \ - " ldp q20, q21, ["X", #64] \n" \ - " ldp q28, q29, ["Y", #64] \n" \ + " ldp q20, q21, [%[X_], #64] \n" \ + " ldp q28, q29, [%[Y_], #64] \n" \ " fmla v2.4s, v18.4s, v26.4s \n" \ " fmla v3.4s, v19.4s, v27.4s \n" \ - " ldp q22, q23, ["X", #96] \n" \ - " ldp q30, q31, ["Y", #96] \n" \ - " add "Y", "Y", #128 \n" \ - " add "X", "X", #128 \n" \ + " ldp q22, q23, [%[X_], #96] \n" \ + " ldp q30, q31, [%[Y_], #96] \n" \ + " add %[Y_], %[Y_], #128 \n" \ + " add %[X_], %[X_], #128 \n" \ " fmla v4.4s, v20.4s, v28.4s \n" \ " fmla v5.4s, v21.4s, v29.4s \n" \ - " PRFM PLDL1KEEP, ["X", #896] \n" \ - " PRFM PLDL1KEEP, ["Y", #896] \n" \ - " PRFM PLDL1KEEP, ["X", #896+64] \n" \ - " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ " fmla v6.4s, v22.4s, v30.4s \n" \ " fmla v7.4s, v23.4s, v31.4s \n" @@ -142,19 +138,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else /* !defined(DSDOT) */ #define KERNEL_F1 \ - " ldr "TMPX", ["X"] \n" \ - " ldr "TMPY", ["Y"] \n" \ - " add "X", "X", "INC_X" \n" \ - " add "Y", "Y", "INC_Y" \n" \ + " ldr "TMPX", [%[X_]] \n" \ + " ldr "TMPY", [%[Y_]] \n" \ + " add %[X_], %[X_], %[INCX_] \n" \ + " add %[Y_], %[Y_], %[INCY_] \n" \ " fcvt "TMPX1", "TMPX" \n" \ " fcvt "TMPY1", "TMPY" \n" \ " fmul "TMPX1", "TMPX1", "TMPY1" \n" \ - " fadd "DOTF", "DOTF", "TMPX1" \n" + " fadd "OUT", "OUT", "TMPX1" \n" #define KERNEL_F \ - " ldp q18, q19, ["X"] \n" \ - " ldp q26, q27, ["Y"] \n" \ + " ldp q18, q19, [%[X_]] \n" \ + " ldp q26, q27, [%[Y_]] \n" \ " fcvtl v16.2d, v18.2s \n" \ " fcvtl2 v17.2d, v18.4s \n" \ " fcvtl v18.2d, v19.2s \n" \ @@ -163,8 +159,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
" fcvtl2 v25.2d, v26.4s \n" \ " fcvtl v26.2d, v27.2s \n" \ " fcvtl2 v27.2d, v27.4s \n" \ - " ldp q22, q23, ["X", #32] \n" \ - " ldp q30, q31, ["Y", #32] \n" \ + " ldp q22, q23, [%[X_], #32] \n" \ + " ldp q30, q31, [%[Y_], #32] \n" \ " fcvtl v20.2d, v22.2s \n" \ " fcvtl2 v21.2d, v22.4s \n" \ " fcvtl v22.2d, v23.2s \n" \ @@ -173,16 +169,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " fcvtl2 v29.2d, v30.4s \n" \ " fcvtl v30.2d, v31.2s \n" \ " fcvtl2 v31.2d, v31.4s \n" \ - " PRFM PLDL1KEEP, ["X", #896] \n" \ - " PRFM PLDL1KEEP, ["Y", #896] \n" \ - " PRFM PLDL1KEEP, ["X", #896+64] \n" \ - " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ " fmla v0.2d, v16.2d, v24.2d \n" \ " fmla v1.2d, v17.2d, v25.2d \n" \ " fmla v2.2d, v18.2d, v26.2d \n" \ " fmla v3.2d, v19.2d, v27.2d \n" \ - " add "Y", "Y", #64 \n" \ - " add "X", "X", #64 \n" \ + " add %[Y_], %[Y_], #64 \n" \ + " add %[X_], %[X_], #64 \n" \ " fmla v4.2d, v20.2d, v28.2d \n" \ " fmla v5.2d, v21.2d, v29.2d \n" \ " fmla v6.2d, v22.2d, v30.2d \n" \ @@ -196,60 +192,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
" fadd v0.2d, v0.2d, v2.2d \n" \ " fadd v4.2d, v4.2d, v6.2d \n" \ " fadd v0.2d, v0.2d, v4.2d \n" \ - " faddp "DOTF", v0.2d \n" + " faddp "OUT", v0.2d \n" #endif /* !defined(DSDOT) */ #else /* !defined(DOUBLE) */ #define KERNEL_F1 \ - " ldr "TMPX", ["X"] \n" \ - " ldr "TMPY", ["Y"] \n" \ - " add "X", "X", "INC_X" \n" \ - " add "Y", "Y", "INC_Y" \n" \ - " fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n" + " ldr "TMPX", [%[X_]] \n" \ + " ldr "TMPY", [%[Y_]] \n" \ + " add %[X_], %[X_], %[INCX_] \n" \ + " add %[Y_], %[Y_], %[INCY_] \n" \ + " fmadd "OUT", "TMPX", "TMPY", "OUT" \n" #define KERNEL_F \ - " ldp q16, q17, ["X"] \n" \ - " ldp q24, q25, ["Y"] \n" \ - " ldp q18, q19, ["X", #32] \n" \ - " ldp q26, q27, ["Y", #32] \n" \ + " ldp q16, q17, [%[X_]] \n" \ + " ldp q24, q25, [%[Y_]] \n" \ + " ldp q18, q19, [%[X_], #32] \n" \ + " ldp q26, q27, [%[Y_], #32] \n" \ " fmla v0.2d, v16.2d, v24.2d \n" \ " fmla v1.2d, v17.2d, v25.2d \n" \ - " ldp q20, q21, ["X", #64] \n" \ - " ldp q28, q29, ["Y", #64] \n" \ + " ldp q20, q21, [%[X_], #64] \n" \ + " ldp q28, q29, [%[Y_], #64] \n" \ " fmla v2.2d, v18.2d, v26.2d \n" \ " fmla v3.2d, v19.2d, v27.2d \n" \ - " ldp q22, q23, ["X", #96] \n" \ - " ldp q30, q31, ["Y", #96] \n" \ - " add "Y", "Y", #128 \n" \ - " add "X", "X", #128 \n" \ + " ldp q22, q23, [%[X_], #96] \n" \ + " ldp q30, q31, [%[Y_], #96] \n" \ + " add %[Y_], %[Y_], #128 \n" \ + " add %[X_], %[X_], #128 \n" \ " fmla v4.2d, v20.2d, v28.2d \n" \ " fmla v5.2d, v21.2d, v29.2d \n" \ - " PRFM PLDL1KEEP, ["X", #896] \n" \ - " PRFM PLDL1KEEP, ["Y", #896] \n" \ - " PRFM PLDL1KEEP, ["X", #896+64] \n" \ - " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ " fmla v6.2d, v22.2d, v30.2d \n" \ " fmla v7.2d, v23.2d, v31.2d \n" \ - " ldp q16, q17, ["X"] \n" \ - " ldp q24, q25, ["Y"] \n" \ - " ldp q18, q19, ["X", #32] \n" \ - " ldp q26, q27, ["Y", 
#32] \n" \ + " ldp q16, q17, [%[X_]] \n" \ + " ldp q24, q25, [%[Y_]] \n" \ + " ldp q18, q19, [%[X_], #32] \n" \ + " ldp q26, q27, [%[Y_], #32] \n" \ " fmla v0.2d, v16.2d, v24.2d \n" \ " fmla v1.2d, v17.2d, v25.2d \n" \ - " ldp q20, q21, ["X", #64] \n" \ - " ldp q28, q29, ["Y", #64] \n" \ + " ldp q20, q21, [%[X_], #64] \n" \ + " ldp q28, q29, [%[Y_], #64] \n" \ " fmla v2.2d, v18.2d, v26.2d \n" \ " fmla v3.2d, v19.2d, v27.2d \n" \ - " ldp q22, q23, ["X", #96] \n" \ - " ldp q30, q31, ["Y", #96] \n" \ - " add "Y", "Y", #128 \n" \ - " add "X", "X", #128 \n" \ + " ldp q22, q23, [%[X_], #96] \n" \ + " ldp q30, q31, [%[Y_], #96] \n" \ + " add %[Y_], %[Y_], #128 \n" \ + " add %[X_], %[X_], #128 \n" \ " fmla v4.2d, v20.2d, v28.2d \n" \ " fmla v5.2d, v21.2d, v29.2d \n" \ - " PRFM PLDL1KEEP, ["X", #896] \n" \ - " PRFM PLDL1KEEP, ["Y", #896] \n" \ - " PRFM PLDL1KEEP, ["X", #896+64] \n" \ - " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ " fmla v6.2d, v22.2d, v30.2d \n" \ " fmla v7.2d, v23.2d, v31.2d \n" @@ -261,28 +257,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
" fadd v0.2d, v0.2d, v2.2d \n" \ " fadd v4.2d, v4.2d, v6.2d \n" \ " fadd v0.2d, v0.2d, v4.2d \n" \ - " faddp "DOTF", v0.2d \n" + " faddp "OUT", v0.2d \n" #endif /* !defined(DOUBLE) */ -#if defined(SMP) -extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, - BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, - void *c, BLASLONG ldc, int (*function)(), int nthreads); -#endif - -static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +static RETURN_TYPE dot_kernel_asimd(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - RETURN_TYPE dot = 0.0 ; - - if ( n < 0 ) return dot; + RETURN_TYPE dot = 0.0; + BLASLONG j = 0; __asm__ __volatile__ ( - " mov "N", %[N_] \n" - " mov "X", %[X_] \n" - " mov "INC_X", %[INCX_] \n" - " mov "Y", %[Y_] \n" - " mov "INC_Y", %[INCY_] \n" - " fmov "DOTF", "REG0" \n" + " fmov "OUT", "REG0" \n" " fmov d1, xzr \n" " fmov d2, xzr \n" " fmov d3, xzr \n" @@ -290,42 +274,40 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B " fmov d5, xzr \n" " fmov d6, xzr \n" " fmov d7, xzr \n" - " cmp "N", xzr \n" - " ble 9f //dot_kernel_L999 \n" - " cmp "INC_X", #1 \n" + " cmp %[INCX_], #1 \n" " bne 5f //dot_kernel_S_BEGIN \n" - " cmp "INC_Y", #1 \n" + " cmp %[INCY_], #1 \n" " bne 5f //dot_kernel_S_BEGIN \n" "1: //dot_kernel_F_BEGIN: \n" - " lsl "INC_X", "INC_X", "INC_SHIFT" \n" - " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" - " asr "J", "N", #"N_DIV_SHIFT" \n" - " cmp "J", xzr \n" + " lsl %[INCX_], %[INCX_], "INC_SHIFT" \n" + " lsl %[INCY_], %[INCY_], "INC_SHIFT" \n" + " asr %[J_], %[N_], #"N_DIV_SHIFT" \n" + " cmp %[J_], xzr \n" " beq 3f //dot_kernel_F1 \n" " .align 5 \n" "2: //dot_kernel_F: \n" " "KERNEL_F" \n" - " subs "J", "J", #1 \n" + " subs %[J_], %[J_], #1 \n" " bne 2b //dot_kernel_F \n" " "KERNEL_F_FINALIZE" \n" "3: //dot_kernel_F1: \n" - " ands "J", "N", #"N_REM_MASK" \n" + " ands %[J_], %[N_], #"N_REM_MASK" \n" " ble 9f 
//dot_kernel_L999 \n" "4: //dot_kernel_F10: \n" " "KERNEL_F1" \n" - " subs "J", "J", #1 \n" + " subs %[J_], %[J_], #1 \n" " bne 4b //dot_kernel_F10 \n" " b 9f //dot_kernel_L999 \n" "5: //dot_kernel_S_BEGIN: \n" - " lsl "INC_X", "INC_X", "INC_SHIFT" \n" - " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" - " asr "J", "N", #2 \n" - " cmp "J", xzr \n" + " lsl %[INCX_], %[INCX_], "INC_SHIFT" \n" + " lsl %[INCY_], %[INCY_], "INC_SHIFT" \n" + " asr %[J_], %[N_], #2 \n" + " cmp %[J_], xzr \n" " ble 7f //dot_kernel_S1 \n" "6: //dot_kernel_S4: \n" @@ -333,88 +315,31 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" - " subs "J", "J", #1 \n" + " subs %[J_], %[J_], #1 \n" " bne 6b //dot_kernel_S4 \n" "7: //dot_kernel_S1: \n" - " ands "J", "N", #3 \n" + " ands %[J_], %[N_], #3 \n" " ble 9f //dot_kernel_L999 \n" "8: //dot_kernel_S10: \n" " "KERNEL_F1" \n" - " subs "J", "J", #1 \n" + " subs %[J_], %[J_], #1 \n" " bne 8b //dot_kernel_S10 \n" "9: //dot_kernel_L999: \n" - " str "DOTF", [%[DOT_]] \n" - : - : [DOT_] "r" (&dot), //%0 - [N_] "r" (n), //%1 - [X_] "r" (x), //%2 - [INCX_] "r" (inc_x), //%3 - [Y_] "r" (y), //%4 - [INCY_] "r" (inc_y) //%5 + : [DOT_] "=&w" (dot) + : [N_] "r" (n), + [X_] "r" (x), + [INCX_] "r" (inc_x), + [Y_] "r" (y), + [INCY_] "r" (inc_y), + [J_] "r" (j) : "cc", "memory", - "x0", "x1", "x2", "x3", "x4", "x5", - "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" + "d1", "d2", "d3", "d4", "d5", "d6", "d7" ); return dot; } - -#if defined(SMP) -static int dot_thread_function(BLASLONG n, BLASLONG dummy0, - BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, - BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) -{ - *(RETURN_TYPE *)result = dot_compute(n, x, inc_x, y, inc_y); - - return 0; -} -#endif - -RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ -#if defined(SMP) - int nthreads; - FLOAT dummy_alpha; -#endif - RETURN_TYPE dot = 0.0; - -#if 
defined(SMP) - if (inc_x == 0 || inc_y == 0 || n <= 10000) - nthreads = 1; - else - nthreads = num_cpu_avail(1); - - if (nthreads == 1) { - dot = dot_compute(n, x, inc_x, y, inc_y); - } else { - int mode, i; - char result[MAX_CPU_NUMBER * sizeof(double) * 2]; - RETURN_TYPE *ptr; - -#if !defined(DOUBLE) - mode = BLAS_SINGLE | BLAS_REAL; -#else - mode = BLAS_DOUBLE | BLAS_REAL; -#endif - - blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, - x, inc_x, y, inc_y, result, 0, - ( void *)dot_thread_function, nthreads); - - ptr = (RETURN_TYPE *)result; - for (i = 0; i < nthreads; i++) { - dot = dot + (*ptr); - ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2); - } - } -#else - dot = dot_compute(n, x, inc_x, y, inc_y); -#endif - - return dot; -} diff --git a/kernel/arm64/dot_kernel_sve.c b/kernel/arm64/dot_kernel_sve.c new file mode 100644 index 0000000000..8460e0d5ec --- /dev/null +++ b/kernel/arm64/dot_kernel_sve.c @@ -0,0 +1,66 @@ +/*************************************************************************** +Copyright (c) 2022, Arm Ltd +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#include + +#ifdef DOUBLE +#define SVE_TYPE svfloat64_t +#define SVE_ZERO svdup_f64(0.0) +#define SVE_WHILELT svwhilelt_b64 +#define SVE_ALL svptrue_b64() +#define SVE_WIDTH svcntd() +#else +#define SVE_TYPE svfloat32_t +#define SVE_ZERO svdup_f32(0.0) +#define SVE_WHILELT svwhilelt_b32 +#define SVE_ALL svptrue_b32() +#define SVE_WIDTH svcntw() +#endif + +static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) { + SVE_TYPE acc_a = SVE_ZERO; + SVE_TYPE acc_b = SVE_ZERO; + + BLASLONG sve_width = SVE_WIDTH; + + for (BLASLONG i = 0; i < n; i += sve_width * 2) { + svbool_t pg_a = SVE_WHILELT(i, n); + svbool_t pg_b = SVE_WHILELT(i + sve_width, n); + + SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); + SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); + SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]); + SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]); + + acc_a = svmla_m(pg_a, acc_a, x_vec_a, y_vec_a); + acc_b = svmla_m(pg_b, acc_b, x_vec_b, y_vec_b); + } + + return svaddv(SVE_ALL, acc_a) + svaddv(SVE_ALL, acc_b); +} diff --git a/kernel/arm64/sbgemm_kernel_8x4_neoversen2.c b/kernel/arm64/sbgemm_kernel_8x4_neoversen2.c index 
66e7dd38af..4c1385fbed 100644 --- a/kernel/arm64/sbgemm_kernel_8x4_neoversen2.c +++ b/kernel/arm64/sbgemm_kernel_8x4_neoversen2.c @@ -37,9 +37,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc) { - if (alpha == 1.0f) - return sbgemm_kernel_neoversen2_alpha_one(m, n, k, alpha, A, B, C, ldc); - else - return sbgemm_kernel_neoversen2_alpha(m, n, k, alpha, A, B, C, ldc); - return 0; + if (alpha == 1.0f) + return sbgemm_kernel_neoversen2_alpha_one(m, n, k, alpha, A, B, C, ldc); + else + return sbgemm_kernel_neoversen2_alpha(m, n, k, alpha, A, B, C, ldc); + return 0; } diff --git a/kernel/arm64/sbgemm_kernel_8x4_neoversen2_impl.c b/kernel/arm64/sbgemm_kernel_8x4_neoversen2_impl.c index 7d53b1aa01..26ea7ee61b 100644 --- a/kernel/arm64/sbgemm_kernel_8x4_neoversen2_impl.c +++ b/kernel/arm64/sbgemm_kernel_8x4_neoversen2_impl.c @@ -30,636 +30,442 @@ #include "common.h" +#define INIT_C(M, N) mc##M##N = svdup_f32(0); + +#define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N); + +#define INIT_C_8x4 \ + do { \ + INIT_C(0, 0); \ + INIT_C(0, 1); \ + INIT_C(1, 0); \ + INIT_C(1, 1); \ + INIT_C(2, 0); \ + INIT_C(2, 1); \ + INIT_C(3, 0); \ + INIT_C(3, 1); \ + } while (0); + #ifdef ALPHA_ONE -#define LOAD_C(M, N) \ - mc##M##N = svld1_gather_index(pg32, ptr_c0##N + 2 * M , off_vc); +#define UPDATE_C(PG, PTR, DST, SRC) \ + do { \ + DST = svld1_f32((PG), (PTR)); \ + DST = svadd_z((PG), SRC, DST); \ + svst1_f32((PG), (PTR), DST); \ + } while (0); +#else +#define UPDATE_C(PG, PTR, DST, SRC) \ + do { \ + DST = svld1_f32((PG), (PTR)); \ + DST = svmad_z((PG), svalpha, SRC, DST); \ + svst1_f32((PG), (PTR), DST); \ + } while (0); +#endif -#define LOAD_C_LOW(M, N) \ - mc##M##N = svld1_gather_index(pg32_low, ptr_c0##N + 2 * M, off_vc); +#ifdef ALPHA_ONE +int sbgemm_kernel_neoversen2_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +#else +int 
sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG pad_k = (k + 3) & ~3; + + svbfloat16_t ma0, ma1, ma2, ma3, mb0, mb1; + svfloat32_t mc00, mc01, mc10, mc11, mc20, mc21, mc30, mc31, + vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7, + oc0, oc1, oc2, oc3, oc4, oc5, oc6, oc7; + svfloat32_t svalpha = svdup_f32(alpha); + + svbool_t pg16 = svptrue_b16(); + svbool_t pg16_low = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); + svbool_t pg32 = svptrue_b32(); + svbool_t pg32_low = svdupq_b32(1, 1, 0, 0); + svbool_t pg32_first = svdupq_b32(1, 0, 0, 0); + + bfloat16_t *ptr_a = (bfloat16_t *)A; + bfloat16_t *ptr_b = (bfloat16_t *)B; + FLOAT *ptr_c = C; + + bfloat16_t *ptr_a0, *ptr_a1, *ptr_a2, *ptr_a3; + bfloat16_t *ptr_b0, *ptr_b1; + FLOAT *ptr_c0, *ptr_c1, *ptr_c2, *ptr_c3; + + for (BLASLONG j = 0; j < n / 4; j++) { + ptr_c0 = ptr_c; + ptr_c1 = ptr_c0 + ldc; + ptr_c2 = ptr_c1 + ldc; + ptr_c3 = ptr_c2 + ldc; + ptr_c += 4 * ldc; + ptr_a = (bfloat16_t *)A; + + for (BLASLONG i = 0; i < m / 8; i++) { + ptr_a0 = ptr_a; + ptr_a += 8 * pad_k; + + ptr_b0 = ptr_b; + + INIT_C_8x4; + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + ma2 = svld1_bf16(pg16, ptr_a0 + 16); + ma3 = svld1_bf16(pg16, ptr_a0 + 24); + + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + + MATMUL(0, 0); MATMUL(0, 1); + MATMUL(1, 0); MATMUL(1, 1); + MATMUL(2, 0); MATMUL(2, 1); + MATMUL(3, 0); MATMUL(3, 1); + + ptr_a0 += 32; + ptr_b0 += 16; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp1(mc20, mc30); + vc2 = svuzp2(mc00, mc10); + vc3 = svuzp2(mc20, mc30); + vc4 = svuzp1(mc01, mc11); + vc5 = svuzp1(mc21, mc31); + vc6 = svuzp2(mc01, mc11); + vc7 = svuzp2(mc21, mc31); + + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c0+4, oc1, vc1); + UPDATE_C(pg32, ptr_c1, oc2, vc2); + UPDATE_C(pg32, ptr_c1+4, oc3, vc3); + UPDATE_C(pg32, ptr_c2, oc4, vc4) + 
UPDATE_C(pg32, ptr_c2+4, oc5, vc5); + UPDATE_C(pg32, ptr_c3, oc6, vc6) + UPDATE_C(pg32, ptr_c3+4, oc7, vc7); + + ptr_c0 += 8; + ptr_c1 += 8; + ptr_c2 += 8; + ptr_c3 += 8; + } -#define LOAD_C_EVEN(M, N) \ - mc##M##N = svld1_gather_index(pg32_even, ptr_c0##N + 2 * M, off_vc); + if (m & 4) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); INIT_C(0, 1); + INIT_C(1, 0); INIT_C(1, 1); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + + MATMUL(0, 0); MATMUL(0, 1); + MATMUL(1, 0); MATMUL(1, 1); + + ptr_a0 += 16; + ptr_b0 += 16; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp2(mc00, mc10); + vc2 = svuzp1(mc01, mc11); + vc3 = svuzp2(mc01, mc11); + + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c1, oc1, vc1); + UPDATE_C(pg32, ptr_c2, oc2, vc2); + UPDATE_C(pg32, ptr_c3, oc3, vc3); + + ptr_c0 += 4; + ptr_c1 += 4; + ptr_c2 += 4; + ptr_c3 += 4; + } -#define LOAD_C_FIRST(M, N) \ - mc##M##N = svld1_gather_index(pg32_first, ptr_c0##N + 2 * M, off_vc); + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); INIT_C(0, 1); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + + MATMUL(0, 0); MATMUL(0, 1); + + ptr_a0 += 8; + ptr_b0 += 16; + } + + vc0 = svuzp1(mc00, mc00); + vc1 = svuzp2(mc00, mc00); + vc2 = svuzp1(mc01, mc01); + vc3 = svuzp2(mc01, mc01); + + UPDATE_C(pg32_low, ptr_c0, oc0, vc0); + UPDATE_C(pg32_low, ptr_c1, oc1, vc1); + UPDATE_C(pg32_low, ptr_c2, oc2, vc2); + UPDATE_C(pg32_low, ptr_c3, oc3, vc3); + + ptr_c0 += 2; + ptr_c1 += 2; + ptr_c2 += 2; + ptr_c3 += 2; + } -#define STORE_C(M, N) \ - svst1_scatter_index(pg32, ptr_c0##N + 2 * M, off_vc, mc##M##N); + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; -#define STORE_C_LOW(M, N) \ - svst1_scatter_index(pg32_low, 
ptr_c0##N + 2 * M, off_vc, mc##M##N); + INIT_C(0, 0); INIT_C(0, 1); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16_low, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); -#define STORE_C_EVEN(M, N) \ - svst1_scatter_index(pg32_even, ptr_c0##N + 2 * M, off_vc, mc##M##N); + MATMUL(0, 0); MATMUL(0, 1); -#define STORE_C_FIRST(M, N) \ - svst1_scatter_index(pg32_first, ptr_c0##N + 2 * M, off_vc, mc##M##N); + ptr_a0 += 4; + ptr_b0 += 16; + } -#else -#define LOAD_C(M, N) \ - mc##M##N = svdup_f32(0); \ - oc##M##N = svld1_gather_index(pg32, ptr_c0##N + 2 * M , off_vc); + vc1 = svuzp2(mc00, mc00); + vc3 = svuzp2(mc01, mc01); -#define LOAD_C_LOW(M, N) \ - mc##M##N = svdup_f32(0); \ - oc##M##N = svld1_gather_index(pg32_low, ptr_c0##N + 2 * M , off_vc); + UPDATE_C(pg32_first, ptr_c0, oc0, mc00); + UPDATE_C(pg32_first, ptr_c1, oc1, vc1); + UPDATE_C(pg32_first, ptr_c2, oc2, mc01); + UPDATE_C(pg32_first, ptr_c3, oc3, vc3); -#define LOAD_C_EVEN(M, N) \ - mc##M##N = svdup_f32(0); \ - oc##M##N = svld1_gather_index(pg32_even, ptr_c0##N + 2 * M , off_vc); + } -#define LOAD_C_FIRST(M, N) \ - mc##M##N = svdup_f32(0); \ - oc##M##N = svld1_gather_index(pg32_first, ptr_c0##N + 2 * M , off_vc); + ptr_b += 4 * pad_k; + } -#define STORE_C(M, N) \ - mc##M##N = svmad_z(pg32, svalpha, mc##M##N, oc##M##N); \ - svst1_scatter_index(pg32, ptr_c0##N + 2 * M, off_vc, mc##M##N); + if (n & 2) { + ptr_c0 = ptr_c; + ptr_c1 = ptr_c0 + ldc; + ptr_c += 2 * ldc; + ptr_a = (bfloat16_t *)A; -#define STORE_C_LOW(M, N) \ - mc##M##N = svmad_z(pg32_low, svalpha, mc##M##N, oc##M##N); \ - svst1_scatter_index(pg32_low, ptr_c0##N + 2 * M, off_vc, mc##M##N); + for (BLASLONG i = 0; i < m / 8; i++) { + ptr_a0 = ptr_a; + ptr_a += 8 * pad_k; -#define STORE_C_EVEN(M, N) \ - mc##M##N = svmad_z(pg32_even, svalpha, mc##M##N, oc##M##N); \ - svst1_scatter_index(pg32_even, ptr_c0##N + 2 * M, off_vc, mc##M##N); + ptr_b0 = ptr_b; -#define STORE_C_FIRST(M, N) \ - mc##M##N = 
svmad_z(pg32_first, svalpha, mc##M##N, oc##M##N); \ - svst1_scatter_index(pg32_first, ptr_c0##N + 2 * M, off_vc, mc##M##N); + INIT_C(0, 0); + INIT_C(1, 0); + INIT_C(2, 0); + INIT_C(3, 0); -#endif + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + ma2 = svld1_bf16(pg16, ptr_a0 + 16); + ma3 = svld1_bf16(pg16, ptr_a0 + 24); -#define LOAD_A(M) ma##M = svld1_bf16(pg16, ptr_a##M); + mb0 = svld1_bf16(pg16, ptr_b0); -#define LOAD_B(N) mb##N = svld1_bf16(pg16, ptr_b##N); + MATMUL(0, 0); + MATMUL(1, 0); + MATMUL(2, 0); + MATMUL(3, 0); -#define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N); + ptr_a0 += 32; + ptr_b0 += 8; + } -#define LOAD_KREST_1(NAME, M) \ - m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), zero, zero, zero, \ - *(ptr_##NAME##M + 1), zero, zero, zero); + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp1(mc20, mc30); + vc2 = svuzp2(mc00, mc10); + vc3 = svuzp2(mc20, mc30); -#define LOAD_KREST_1_LOW(NAME, M) \ - m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), zero, zero, zero, zero, zero, \ - zero, zero); + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c0 + 4, oc1, vc1); + UPDATE_C(pg32, ptr_c1, oc2, vc2); + UPDATE_C(pg32, ptr_c1 + 4, oc3, vc3); -#define LOAD_KREST_2(NAME, M) \ - m##NAME##M = \ - svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), zero, zero, \ - *(ptr_##NAME##M + 2), *(ptr_##NAME##M + 3), zero, zero); + ptr_c0 += 8; + ptr_c1 += 8; + } -#define LOAD_KREST_2_LOW(NAME, M) \ - m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), zero, \ - zero, zero, zero, zero, zero); + if (m & 4) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + ptr_b0 = ptr_b; -#define LOAD_KREST_3(NAME, M) \ - m##NAME##M = \ - svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), \ - *(ptr_##NAME##M + 2), zero, *(ptr_##NAME##M + 3), \ - *(ptr_##NAME##M + 4), *(ptr_##NAME##M + 5), zero); + INIT_C(0, 0); + INIT_C(1, 0); -#define LOAD_KREST_3_LOW(NAME, M) \ - m##NAME##M = \ - svdupq_bf16(*(ptr_##NAME##M), 
*(ptr_##NAME##M + 1), \ - *(ptr_##NAME##M + 2), zero, zero, zero, zero, zero); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + mb0 = svld1_bf16(pg16, ptr_b0); + MATMUL(0, 0); + MATMUL(1, 0); + ptr_a0 += 16; + ptr_b0 += 8; + } + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp2(mc00, mc10); + + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c1, oc1, vc1); + + ptr_c0 += 4; + ptr_c1 += 4; + } + + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + + MATMUL(0, 0); + + ptr_a0 += 8; + ptr_b0 += 8; + } + + vc0 = svuzp1(mc00, mc00); + vc1 = svuzp2(mc00, mc00); + UPDATE_C(pg32_low, ptr_c0, oc0, vc0); + UPDATE_C(pg32_low, ptr_c1, oc1, vc1); + + ptr_c0 += 2; + ptr_c1 += 2; -#ifdef ALPHA_ONE -int sbgemm_kernel_neoversen2_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) -#else -int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) -#endif -{ - bfloat16_t *ptr_a = (bfloat16_t *)A; - bfloat16_t *ptr_b = (bfloat16_t *)B; - FLOAT *ptr_c = C; - - bfloat16_t *ptr_a0, *ptr_a1, *ptr_a2, *ptr_a3; - bfloat16_t *ptr_b0, *ptr_b1; - FLOAT *ptr_c00, *ptr_c01; - - svbfloat16_t ma0, ma1, ma2, ma3, mb0, mb1; - svfloat32_t mc00, mc01, mc10, mc11, mc20, mc21, mc30, mc31; -#ifndef ALPHA_ONE - svfloat32_t oc00, oc01, oc10, oc11, oc20, oc21, oc30, oc31; -#endif - svbool_t pg16 = svptrue_b16(); - svbool_t pg16_low = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); - svbool_t pg32 = svptrue_b32(); - svbool_t pg32_low = svdupq_b32(1, 1, 0, 0); - svbool_t pg32_even = svdupq_b32(1, 0, 1, 0); - svbool_t pg32_first = svdupq_b32(1, 0, 0, 0); - svfloat32_t svalpha = svdup_f32(alpha); - bfloat16 tmp = 0; - bfloat16_t zero = *((bfloat16_t *)&tmp); - BLASLONG krest = k & 
3; - - // 00 01 10 11 - svuint32_t off_vc = svdupq_u32(0, (uint32_t)ldc, 1, (uint32_t)ldc + 1); - - for (BLASLONG j = 0; j < n / 4; j++) { - ptr_c00 = ptr_c; - ptr_c01 = ptr_c + 2 * ldc; - ptr_c += 4 * ldc; - - ptr_a = (bfloat16_t *)A; - - for (BLASLONG i = 0; i < m / 8; i++) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a2 = ptr_a1 + 2 * k; - ptr_a3 = ptr_a2 + 2 * k; - ptr_a += 8 * k; - - ptr_b0 = ptr_b; - ptr_b1 = ptr_b0 + 2 * k; - - LOAD_C(0, 0); LOAD_C(0, 1); - LOAD_C(1, 0); LOAD_C(1, 1); - LOAD_C(2, 0); LOAD_C(2, 1); - LOAD_C(3, 0); LOAD_C(3, 1); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3); - LOAD_B(0); LOAD_B(1); - - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - MATMUL(2, 0); MATMUL(2, 1); - MATMUL(3, 0); MATMUL(3, 1); - - ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8; - ptr_b0 += 8; ptr_b1 += 8; - } - - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3); - LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3); - LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3); - LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1); - } - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - MATMUL(2, 0); MATMUL(2, 1); - MATMUL(3, 0); MATMUL(3, 1); - } - - STORE_C(0, 0); STORE_C(0, 1); - STORE_C(1, 0); STORE_C(1, 1); - STORE_C(2, 0); STORE_C(2, 1); - STORE_C(3, 0); STORE_C(3, 1); - - ptr_c00 += 8; ptr_c01 += 8; - } - - if (m & 4) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a += 4 * k; - - ptr_b0 = ptr_b; - ptr_b1 = ptr_b0 + 2 * k; - - LOAD_C(0, 0); LOAD_C(0, 1); - LOAD_C(1, 0); LOAD_C(1, 1); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); - LOAD_B(0); LOAD_B(1); - - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - - 
ptr_a0 += 8; ptr_a1 += 8; - ptr_b0 += 8; ptr_b1 += 8; - } - - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1); - } - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - } - - STORE_C(0, 0); STORE_C(0, 1); - STORE_C(1, 0); STORE_C(1, 1); - - ptr_c00 += 4; ptr_c01 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * k; - - ptr_b0 = ptr_b; - ptr_b1 = ptr_b0 + 2 * k; - - LOAD_C(0, 0); LOAD_C(0, 1); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); - LOAD_B(0); LOAD_B(1); - - MATMUL(0, 0); MATMUL(0, 1); - - ptr_a0 += 8; - ptr_b0 += 8; ptr_b1 += 8; - } - - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); - LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); - LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); - LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1); - } - MATMUL(0, 0); MATMUL(0, 1); - } - STORE_C(0, 0); STORE_C(0, 1); - ptr_c00 += 2; ptr_c01 += 2; - } - - if (m & 1) { - ptr_a0 = ptr_a; - - ptr_b0 = ptr_b; - ptr_b1 = ptr_b0 + 2 * k; - - LOAD_C_LOW(0, 0); LOAD_C_LOW(0, 1); - - for (BLASLONG p = 0; p < k / 4; p++) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - LOAD_B(0); LOAD_B(1); - - MATMUL(0, 0); MATMUL(0, 1); - - ptr_a0 += 4; - ptr_b0 += 8; - ptr_b1 += 8; - } - - if (krest) { - if (krest == 1) { - LOAD_KREST_1_LOW(a, 0); - LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1); - } else if (krest == 2) { - LOAD_KREST_2_LOW(a, 0); - LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1); - } else if (krest == 3) { - LOAD_KREST_3_LOW(a, 0); - LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1); - } - MATMUL(0, 0); MATMUL(0, 1); - } - STORE_C_LOW(0, 0); STORE_C_LOW(0, 1); - } - - ptr_b += 4 * k; } - if (n & 2) { - ptr_c00 = ptr_c; - 
ptr_c += 2 * ldc; - - ptr_a = (bfloat16_t *)A; - - for (BLASLONG i = 0; i < m / 8; i++) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a2 = ptr_a1 + 2 * k; - ptr_a3 = ptr_a2 + 2 * k; - ptr_a += 8 * k; - - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - LOAD_C(1, 0); - LOAD_C(2, 0); - LOAD_C(3, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3); - LOAD_B(0); - - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - - ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8; - ptr_b0 += 8; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3); - LOAD_KREST_1(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3); - LOAD_KREST_2(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3); - LOAD_KREST_3(b, 0); - } - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - } - - STORE_C(0, 0); - STORE_C(1, 0); - STORE_C(2, 0); - STORE_C(3, 0); - - ptr_c00 += 8; - } - - if (m & 4) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a += 4 * k; - - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - LOAD_C(1, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); - LOAD_B(0); - - MATMUL(0, 0); - MATMUL(1, 0); - - ptr_a0 += 8; ptr_a1 += 8; - ptr_b0 += 8; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3(b, 0); - } - MATMUL(0, 0); - MATMUL(1, 0); - } - STORE_C(0, 0) - STORE_C(1, 0) - - ptr_c00 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * k; - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); - LOAD_B(0); - MATMUL(0, 0); - ptr_a0 += 8; - ptr_b0 += 8; - } - if (krest) { 
- if (krest == 1) { - LOAD_KREST_1(a, 0); - LOAD_KREST_1(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); - LOAD_KREST_2(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); - LOAD_KREST_3(b, 0); - } - MATMUL(0, 0); - } - STORE_C(0, 0); - ptr_c00 += 2; - } - - if (m & 1) { - ptr_a0 = ptr_a; - - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - LOAD_B(0); - MATMUL(0, 0); - ptr_a0 += 4; - ptr_b0 += 8; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1_LOW(a, 0); - LOAD_KREST_1(b, 0); - } else if (krest == 2) { - LOAD_KREST_2_LOW(a, 0); - LOAD_KREST_2(b, 0); - } else if (krest == 3) { - LOAD_KREST_3_LOW(a, 0); - LOAD_KREST_3(b, 0); - } - MATMUL(0, 0); - } - STORE_C_LOW(0, 0); - } - - ptr_b += 2 * k; + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + INIT_C(0, 0); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16_low, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + MATMUL(0, 0); + ptr_a0 += 4; + ptr_b0 += 8; + } + vc1 = svuzp2(mc00, mc00); + + UPDATE_C(pg32_first, ptr_c0, oc0, mc00); + UPDATE_C(pg32_first, ptr_c1, oc1, vc1); + } + + ptr_b += 2 * pad_k; + } + + if (n & 1) { + ptr_c0 = ptr_c; + ptr_a = (bfloat16_t *)A; + + for (BLASLONG i = 0; i < m / 8; i++) { + ptr_a0 = ptr_a; + ptr_a += 8 * pad_k; + + ptr_b0 = ptr_b; + + INIT_C(0, 0); + INIT_C(1, 0); + INIT_C(2, 0); + INIT_C(3, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + ma2 = svld1_bf16(pg16, ptr_a0 + 16); + ma3 = svld1_bf16(pg16, ptr_a0 + 24); + + mb0 = svld1_bf16(pg16_low, ptr_b0); + + MATMUL(0, 0); + MATMUL(1, 0); + MATMUL(2, 0); + MATMUL(3, 0); + + ptr_a0 += 32; + ptr_b0 += 4; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp1(mc20, mc30); + + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c0 + 4, oc1, vc1); + + ptr_c0 += 8; + } + + if (m & 4) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + ptr_b0 = ptr_b; + INIT_C(0, 0); + INIT_C(1, 0); 
+ for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + mb0 = svld1_bf16(pg16_low, ptr_b0); + MATMUL(0, 0); + MATMUL(1, 0); + ptr_a0 += 16; + ptr_b0 += 4; + } + vc0 = svuzp1(mc00, mc10); + UPDATE_C(pg32, ptr_c0, oc0, vc0); + ptr_c0 += 4; + } + + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + mb0 = svld1_bf16(pg16_low, ptr_b0); + + MATMUL(0, 0); + + ptr_a0 += 8; + ptr_b0 += 4; + } + vc0 = svuzp1(mc00, mc00); + UPDATE_C(pg32_low, ptr_c0, oc0, vc0); + ptr_c0 += 2; } - if (n & 1) { - ptr_c00 = ptr_c; - ptr_a = (bfloat16_t *) A; - - for (BLASLONG i = 0; i < m / 8; i++) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a2 = ptr_a1 + 2 * k; - ptr_a3 = ptr_a2 + 2 * k; - ptr_a += 8 * k; - - ptr_b0 = ptr_b; - - LOAD_C_EVEN(0, 0); - LOAD_C_EVEN(1, 0); - LOAD_C_EVEN(2, 0); - LOAD_C_EVEN(3, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3); - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - - ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8; - ptr_b0 += 4; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3); - LOAD_KREST_1_LOW(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3); - LOAD_KREST_2_LOW(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3); - LOAD_KREST_3_LOW(b, 0); - } - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - } - STORE_C_EVEN(0, 0) - STORE_C_EVEN(1, 0); - STORE_C_EVEN(2, 0); - STORE_C_EVEN(3, 0); - - ptr_c00 += 8; - } - - if (m & 4) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a += 4 * k; - - ptr_b0 = ptr_b; - - LOAD_C_EVEN(0, 0); - LOAD_C_EVEN(1, 0); - - 
for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - MATMUL(1, 0); - - ptr_a0 += 8; ptr_a1 += 8; - ptr_b0 += 4; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1_LOW(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2_LOW(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3_LOW(b, 0); - } - MATMUL(0, 0); - MATMUL(1, 0); - } - STORE_C_EVEN(0, 0) - STORE_C_EVEN(1, 0) - - ptr_c00 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * k; - - ptr_b0 = ptr_b; - - LOAD_C_EVEN(0, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - - ptr_a0 += 8; - ptr_b0 += 4; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); - LOAD_KREST_1_LOW(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); - LOAD_KREST_2_LOW(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); - LOAD_KREST_3_LOW(b, 0); - } - MATMUL(0, 0); - } - STORE_C_EVEN(0, 0); - ptr_c00 += 2; - } - if (m & 1) { - ptr_a0 = ptr_a; - ptr_b0 = ptr_b; - LOAD_C_FIRST(0, 0); - for (BLASLONG p = 0; p < k / 4; p++) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - - ptr_a0 += 4; - ptr_b0 += 4; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1_LOW(a, 0); - LOAD_KREST_1_LOW(b, 0); - } else if (krest == 2) { - LOAD_KREST_2_LOW(a, 0); - LOAD_KREST_2_LOW(b, 0); - } else if (krest == 3) { - LOAD_KREST_3_LOW(a, 0); - LOAD_KREST_3_LOW(b, 0); - } - MATMUL(0, 0); - } - STORE_C_FIRST(0, 0); - } + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + INIT_C(0, 0); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16_low, ptr_a0); + mb0 = svld1_bf16(pg16_low, ptr_b0); + MATMUL(0, 0); + ptr_a0 += 4; + ptr_b0 += 4; + } + UPDATE_C(pg32_first, ptr_c0, oc0, mc00); } + } - return 0; -} \ No newline at end of file + 
return 0; +} diff --git a/kernel/arm64/sbgemm_ncopy_4_neoversen2.c b/kernel/arm64/sbgemm_ncopy_4_neoversen2.c new file mode 100644 index 0000000000..22978a388c --- /dev/null +++ b/kernel/arm64/sbgemm_ncopy_4_neoversen2.c @@ -0,0 +1,126 @@ +/*************************************************************************** + * Copyright (c) 2022, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * *****************************************************************************/ + +#include + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + IFLOAT *a_offset; + IFLOAT *a_offsetx[4]; + IFLOAT *b_offset; + a_offset = a; + b_offset = b; + + svbool_t pg16 = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); + svbfloat16_t v0, v1, v2, v3; + + for (BLASLONG j = 0; j < n / 4; j++) { + a_offsetx[0] = a_offset; + a_offsetx[1] = a_offsetx[0] + lda; + a_offsetx[2] = a_offsetx[1] + lda; + a_offsetx[3] = a_offsetx[2] + lda; + a_offset += 4 * lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); + v2 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[2]); + v3 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[3]); + + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 8, v2); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 12, v3); + + b_offset += 16; + a_offsetx[0] += 4; + a_offsetx[1] += 4; + a_offsetx[2] += 4; + a_offsetx[3] += 4; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG col = 0; col < 4; col++) { + b_offset[4 * col] = a_offsetx[col][0]; + b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1]; + b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; + b_offset[4 * col + 3] = rest <= 3 ? 
0 : a_offsetx[col][3]; + } + b_offset += 16; + } + } + + if (n & 2) { + a_offsetx[0] = a_offset; + a_offsetx[1] = a_offsetx[0] + lda; + a_offset += 2 * lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); + + b_offset += 8; + a_offsetx[0] += 4; + a_offsetx[1] += 4; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG col = 0; col < 2; col++) { + b_offset[4 * col] = a_offsetx[col][0]; + b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1]; + b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; + b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3]; + } + b_offset += 8; + } + } + + if (n & 1) { + a_offsetx[0] = a_offset; + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + b_offset += 4; + a_offsetx[0] += 4; + } + if (m & 3) { + BLASLONG rest = m & 3; + b_offset[0] = a_offsetx[0][0]; + b_offset[1] = rest == 1 ? 0 : a_offsetx[0][1]; + b_offset[2] = rest <= 2 ? 0 : a_offsetx[0][2]; + b_offset[3] = rest <= 3 ? 0 : a_offsetx[0][3]; + } + } + + return 0; +} diff --git a/kernel/arm64/sbgemm_ncopy_8_neoversen2.c b/kernel/arm64/sbgemm_ncopy_8_neoversen2.c new file mode 100644 index 0000000000..1b13c8de9f --- /dev/null +++ b/kernel/arm64/sbgemm_ncopy_8_neoversen2.c @@ -0,0 +1,179 @@ +/*************************************************************************** + * Copyright (c) 2022, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * *****************************************************************************/ + +#include + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + IFLOAT *a_offset; + IFLOAT *a_offsetx[8]; + IFLOAT *b_offset; + a_offset = a; + b_offset = b; + + svbool_t pg16 = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); + svbfloat16_t v0, v1, v2, v3, v4, v5, v6, v7; + + for (BLASLONG j = 0; j < n / 8; j++) { + a_offsetx[0] = a_offset; + a_offsetx[1] = a_offsetx[0] + lda; + a_offsetx[2] = a_offsetx[1] + lda; + a_offsetx[3] = a_offsetx[2] + lda; + a_offsetx[4] = a_offsetx[3] + lda; + a_offsetx[5] = a_offsetx[4] + lda; + a_offsetx[6] = a_offsetx[5] + lda; + a_offsetx[7] = a_offsetx[6] + lda; + a_offset += 8 * lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); + v2 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[2]); + v3 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[3]); + v4 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[4]); + v5 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[5]); + v6 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[6]); + v7 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[7]); + + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 8, v2); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 12, v3); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 16, v4); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 20, v5); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 24, v6); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 28, v7); + + b_offset += 32; + a_offsetx[0] += 4; + a_offsetx[1] += 4; + a_offsetx[2] += 4; + a_offsetx[3] += 4; + a_offsetx[4] += 4; + a_offsetx[5] += 4; + a_offsetx[6] += 4; + a_offsetx[7] += 4; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG col = 0; col < 8; col++) { + b_offset[4 * col] = a_offsetx[col][0]; + b_offset[4 * col + 1] = rest == 1 ? 
0 : a_offsetx[col][1]; + b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; + b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3]; + } + b_offset += 32; + } + } + + if (n & 4) { + a_offsetx[0] = a_offset; + a_offsetx[1] = a_offsetx[0] + lda; + a_offsetx[2] = a_offsetx[1] + lda; + a_offsetx[3] = a_offsetx[2] + lda; + a_offset += 4 * lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); + v2 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[2]); + v3 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[3]); + + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 8, v2); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 12, v3); + + b_offset += 16; + a_offsetx[0] += 4; + a_offsetx[1] += 4; + a_offsetx[2] += 4; + a_offsetx[3] += 4; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG col = 0; col < 4; col++) { + b_offset[4 * col] = a_offsetx[col][0]; + b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1]; + b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; + b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3]; + } + b_offset += 16; + } + } + + if (n & 2) { + a_offsetx[0] = a_offset; + a_offsetx[1] = a_offsetx[0] + lda; + a_offset += 2 * lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); + + b_offset += 8; + a_offsetx[0] += 4; + a_offsetx[1] += 4; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG col = 0; col < 2; col++) { + b_offset[4 * col] = a_offsetx[col][0]; + b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1]; + b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; + b_offset[4 * col + 3] = rest <= 3 ? 
0 : a_offsetx[col][3]; + } + b_offset += 8; + } + } + + if (n & 1) { + a_offsetx[0] = a_offset; + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + b_offset += 4; + a_offsetx[0] += 4; + } + if (m & 3) { + BLASLONG rest = m & 3; + b_offset[0] = a_offsetx[0][0]; + b_offset[1] = rest == 1 ? 0 : a_offsetx[0][1]; + b_offset[2] = rest <= 2 ? 0 : a_offsetx[0][2]; + b_offset[3] = rest <= 3 ? 0 : a_offsetx[0][3]; + } + } + + return 0; +} diff --git a/kernel/arm64/sbgemm_ncopy_neoversen2.c b/kernel/arm64/sbgemm_ncopy_neoversen2.c deleted file mode 100644 index 594067ebb9..0000000000 --- a/kernel/arm64/sbgemm_ncopy_neoversen2.c +++ /dev/null @@ -1,101 +0,0 @@ -/*************************************************************************** - * Copyright (c) 2022, The OpenBLAS Project - * All rights reserved. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * 3. Neither the name of the OpenBLAS project nor the names of - * its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * *****************************************************************************/ - -#include "common.h" - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { - IFLOAT *a_offset, *a_offset1, *a_offset2; - IFLOAT *b_offset; - - a_offset = a; - b_offset = b; - - for (BLASLONG j = 0; j < n / 2; j++) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset += 2 * lda; - for (BLASLONG i = 0; i < m / 4; i++) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset1 + 1); - *(b_offset + 2) = *(a_offset1 + 2); - *(b_offset + 3) = *(a_offset1 + 3); - *(b_offset + 4) = *(a_offset2 + 0); - *(b_offset + 5) = *(a_offset2 + 1); - *(b_offset + 6) = *(a_offset2 + 2); - *(b_offset + 7) = *(a_offset2 + 3); - - a_offset1 += 4; - a_offset2 += 4; - b_offset += 8; - } - BLASLONG rest = m & 3; - if (rest == 3) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset1 + 1); - *(b_offset + 2) = *(a_offset1 + 2); - *(b_offset + 3) = *(a_offset2 + 0); - *(b_offset + 4) = *(a_offset2 + 1); - *(b_offset + 5) = *(a_offset2 + 2); - b_offset += 6; - } else if (rest == 2) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset1 + 1); - *(b_offset + 2) = *(a_offset2 + 0); - *(b_offset + 3) = *(a_offset2 + 1); - b_offset += 4; - } else if (rest == 1) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset2 + 0); - b_offset += 2; - } - } - if (n & 1) { - for (BLASLONG i = 0; i < 
m / 4; i++) { - *(b_offset + 0) = *(a_offset + 0); - *(b_offset + 1) = *(a_offset + 1); - *(b_offset + 2) = *(a_offset + 2); - *(b_offset + 3) = *(a_offset + 3); - - b_offset += 4; - a_offset += 4; - } - BLASLONG rest = m & 3; - if (rest == 3) { - *(b_offset + 0) = *(a_offset + 0); - *(b_offset + 1) = *(a_offset + 1); - *(b_offset + 2) = *(a_offset + 2); - } else if (rest == 2) { - *(b_offset + 0) = *(a_offset + 0); - *(b_offset + 1) = *(a_offset + 1); - } else if (rest == 1) { - *(b_offset + 0) = *(a_offset + 0); - } - } - - return 0; -} diff --git a/kernel/arm64/sbgemm_tcopy_4_neoversen2.c b/kernel/arm64/sbgemm_tcopy_4_neoversen2.c new file mode 100644 index 0000000000..a652b0b2ac --- /dev/null +++ b/kernel/arm64/sbgemm_tcopy_4_neoversen2.c @@ -0,0 +1,147 @@ +/*************************************************************************** + * Copyright (c) 2022, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ +#include + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + IFLOAT *a_offset, *a_offset0, *a_offset1, *a_offset2, *a_offset3; + IFLOAT *b_offset; + a_offset = a; + b_offset = b; + + uint16x4_t v0_h, v1_h, v2_h, v3_h, v4_h, v5_h, v6_h, v7_h; + + for (BLASLONG j = 0; j < n / 4; j++) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset += 4; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0_h = vld1_u16(a_offset0); + v1_h = vld1_u16(a_offset1); + v2_h = vld1_u16(a_offset2); + v3_h = vld1_u16(a_offset3); + + v4_h = vtrn1_u16(v0_h, v1_h); + v5_h = vtrn2_u16(v0_h, v1_h); + v6_h = vtrn1_u16(v2_h, v3_h); + v7_h = vtrn2_u16(v2_h, v3_h); + + v0_h = (uint16x4_t)vtrn1_u32((uint32x2_t)v4_h, (uint32x2_t)v6_h); + v1_h = (uint16x4_t)vtrn1_u32((uint32x2_t)v5_h, (uint32x2_t)v7_h); + v2_h = (uint16x4_t)vtrn2_u32((uint32x2_t)v4_h, (uint32x2_t)v6_h); + v3_h = (uint16x4_t)vtrn2_u32((uint32x2_t)v5_h, (uint32x2_t)v7_h); + + vst1_u16(b_offset, v0_h); + vst1_u16(b_offset + 4, v1_h); + vst1_u16(b_offset + 8, v2_h); + vst1_u16(b_offset + 12, v3_h); + + b_offset += 16; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG line = 0; line < 4; line++) { + 
b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; + b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; + b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; + } + b_offset += 16; + } + } + + if (n & 2) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset += 2; + + for (BLASLONG i = 0; i < m / 4; i++) { + for (BLASLONG line = 0; line < 2; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = a_offset1[line]; + b_offset[line * 4 + 2] = a_offset2[line]; + b_offset[line * 4 + 3] = a_offset3[line]; + } + b_offset += 8; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG line = 0; line < 2; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; + b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; + b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; + } + b_offset += 8; + } + } + + if (n & 1) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + b_offset[0] = *a_offset0; + b_offset[1] = *a_offset1; + b_offset[2] = *a_offset2; + b_offset[3] = *a_offset3; + b_offset += 4; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + b_offset[0] = *a_offset0; + b_offset[1] = rest == 1 ? 0 : *a_offset1; + b_offset[2] = rest <= 2 ? 0 : *a_offset2; + b_offset[3] = rest <= 3 ? 
0 : *a_offset3; + } + } + return 0; +} diff --git a/kernel/arm64/sbgemm_tcopy_8_neoversen2.c b/kernel/arm64/sbgemm_tcopy_8_neoversen2.c new file mode 100644 index 0000000000..459dfa16ab --- /dev/null +++ b/kernel/arm64/sbgemm_tcopy_8_neoversen2.c @@ -0,0 +1,199 @@ +/*************************************************************************** + * Copyright (c) 2022, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * *****************************************************************************/ +#include + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + IFLOAT *a_offset, *a_offset0, *a_offset1, *a_offset2, *a_offset3; + IFLOAT *b_offset; + a_offset = a; + b_offset = b; + + uint16x8_t v0, v1, v2, v3, v4, v5, v6, v7; + uint16x4_t v0_h, v1_h, v2_h, v3_h, v4_h, v5_h, v6_h, v7_h; + + for (BLASLONG j = 0; j < n / 8; j++) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset += 8; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = vld1q_u16(a_offset0); + v1 = vld1q_u16(a_offset1); + v2 = vld1q_u16(a_offset2); + v3 = vld1q_u16(a_offset3); + + v4 = vtrn1q_u16(v0, v1); + v5 = vtrn2q_u16(v0, v1); + v6 = vtrn1q_u16(v2, v3); + v7 = vtrn2q_u16(v2, v3); + + v0 = (uint16x8_t)vtrn1q_u32((uint32x4_t)v4, (uint32x4_t)v6); + v1 = (uint16x8_t)vtrn1q_u32((uint32x4_t)v5, (uint32x4_t)v7); + v2 = (uint16x8_t)vtrn2q_u32((uint32x4_t)v4, (uint32x4_t)v6); + v3 = (uint16x8_t)vtrn2q_u32((uint32x4_t)v5, (uint32x4_t)v7); + + vst1_u16(b_offset, vget_low_u16(v0)); + vst1_u16(b_offset + 4, vget_low_u16(v1)); + vst1_u16(b_offset + 8, vget_low_u16(v2)); + vst1_u16(b_offset + 12, vget_low_u16(v3)); + vst1_u16(b_offset + 16, vget_high_u16(v0)); + vst1_u16(b_offset + 20, vget_high_u16(v1)); + vst1_u16(b_offset + 24, vget_high_u16(v2)); + vst1_u16(b_offset + 28, vget_high_u16(v3)); + + b_offset += 32; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG line = 0; line < 8; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; + b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; + b_offset[line * 4 + 3] = rest <= 3 ? 
0 : a_offset3[line]; + } + b_offset += 32; + } + } + + if (n & 4) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset += 4; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0_h = vld1_u16(a_offset0); + v1_h = vld1_u16(a_offset1); + v2_h = vld1_u16(a_offset2); + v3_h = vld1_u16(a_offset3); + + v4_h = vtrn1_u16(v0_h, v1_h); + v5_h = vtrn2_u16(v0_h, v1_h); + v6_h = vtrn1_u16(v2_h, v3_h); + v7_h = vtrn2_u16(v2_h, v3_h); + + v0_h = (uint16x4_t)vtrn1_u32((uint32x2_t)v4_h, (uint32x2_t)v6_h); + v1_h = (uint16x4_t)vtrn1_u32((uint32x2_t)v5_h, (uint32x2_t)v7_h); + v2_h = (uint16x4_t)vtrn2_u32((uint32x2_t)v4_h, (uint32x2_t)v6_h); + v3_h = (uint16x4_t)vtrn2_u32((uint32x2_t)v5_h, (uint32x2_t)v7_h); + + vst1_u16(b_offset, v0_h); + vst1_u16(b_offset + 4, v1_h); + vst1_u16(b_offset + 8, v2_h); + vst1_u16(b_offset + 12, v3_h); + + b_offset += 16; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG line = 0; line < 4; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; + b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; + b_offset[line * 4 + 3] = rest <= 3 ? 
0 : a_offset3[line]; + } + b_offset += 16; + } + } + + if (n & 2) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset += 2; + + for (BLASLONG i = 0; i < m / 4; i++) { + for (BLASLONG line = 0; line < 2; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = a_offset1[line]; + b_offset[line * 4 + 2] = a_offset2[line]; + b_offset[line * 4 + 3] = a_offset3[line]; + } + b_offset += 8; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG line = 0; line < 2; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; + b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; + b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; + } + b_offset += 8; + } + } + + if (n & 1) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + b_offset[0] = *a_offset0; + b_offset[1] = *a_offset1; + b_offset[2] = *a_offset2; + b_offset[3] = *a_offset3; + b_offset += 4; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + b_offset[0] = *a_offset0; + b_offset[1] = rest == 1 ? 0 : *a_offset1; + b_offset[2] = rest <= 2 ? 0 : *a_offset2; + b_offset[3] = rest <= 3 ? 0 : *a_offset3; + } + } + return 0; +} diff --git a/kernel/arm64/sbgemm_tcopy_neoversen2.c b/kernel/arm64/sbgemm_tcopy_neoversen2.c deleted file mode 100644 index 2f33133793..0000000000 --- a/kernel/arm64/sbgemm_tcopy_neoversen2.c +++ /dev/null @@ -1,109 +0,0 @@ -/*************************************************************************** - * Copyright (c) 2022, The OpenBLAS Project - * All rights reserved. 
- * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * 3. Neither the name of the OpenBLAS project nor the names of - * its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- * *****************************************************************************/ - -#include "common.h" - - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { - IFLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; - IFLOAT *b_offset; - a_offset = a; - b_offset = b; - - for (BLASLONG j = 0; j < n / 2; j++) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - a_offset4 = a_offset3 + lda; - a_offset += 2; - - for (BLASLONG i = 0; i < m / 4; i++) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset2 + 0); - *(b_offset + 2) = *(a_offset3 + 0); - *(b_offset + 3) = *(a_offset4 + 0); - *(b_offset + 4) = *(a_offset1 + 1); - *(b_offset + 5) = *(a_offset2 + 1); - *(b_offset + 6) = *(a_offset3 + 1); - *(b_offset + 7) = *(a_offset4 + 1); - - b_offset += 8; - a_offset1 += 4 * lda; - a_offset2 += 4 * lda; - a_offset3 += 4 * lda; - a_offset4 += 4 * lda; - } - - if (m & 3) { - BLASLONG rest = m & 3; - if (rest == 3) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset2 + 0); - *(b_offset + 2) = *(a_offset3 + 0); - *(b_offset + 3) = *(a_offset1 + 1); - *(b_offset + 4) = *(a_offset2 + 1); - *(b_offset + 5) = *(a_offset3 + 1); - b_offset += 6; - } else if (rest == 2) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset2 + 0); - *(b_offset + 2) = *(a_offset1 + 1); - *(b_offset + 3) = *(a_offset2 + 1); - b_offset += 4; - } else if (rest == 1) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset1 + 1); - b_offset += 2; - } - } - } - if (n & 1) { - for (BLASLONG i = 0; i < m / 4; i++) { - *(b_offset + 0) = *(a_offset); - *(b_offset + 1) = *(a_offset + lda); - *(b_offset + 2) = *(a_offset + lda * 2); - *(b_offset + 3) = *(a_offset + lda * 3); - - b_offset += 4; - a_offset += 4 * lda; - } - BLASLONG rest = m & 3; - if (rest == 3) { - *(b_offset + 0) = *(a_offset); - *(b_offset + 1) = *(a_offset + lda); - *(b_offset + 2) = *(a_offset + lda * 2); - 
} else if (rest == 2) { - *(b_offset + 0) = *(a_offset); - *(b_offset + 1) = *(a_offset + lda); - } else if (rest == 1) { - *(b_offset + 0) = *(a_offset); - } - } - - return 0; -} diff --git a/kernel/arm64/sgemm_kernel_sve_v2x8.S b/kernel/arm64/sgemm_kernel_sve_v2x8.S index 1cdd8253e9..c969ed4db4 100644 --- a/kernel/arm64/sgemm_kernel_sve_v2x8.S +++ b/kernel/arm64/sgemm_kernel_sve_v2x8.S @@ -189,20 +189,16 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ ld1rw z11.s, p0/z, [pB, 12] fmla z24.s, p0/m, z0.s, z12.s fmla z25.s, p0/m, z1.s, z12.s - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] ld1rw z12.s, p0/z, [pB, 16] fmla z26.s, p0/m, z0.s, z13.s fmla z27.s, p0/m, z1.s, z13.s - prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] ld1rw z13.s, p0/z, [pB, 20] fmla z28.s, p0/m, z0.s, z14.s fmla z29.s, p0/m, z1.s, z14.s ld1rw z14.s, p0/z, [pB, 24] fmla z30.s, p0/m, z0.s, z15.s fmla z31.s, p0/m, z1.s, z15.s - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] ld1rw z15.s, p0/z, [pB, 28] - prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] add pB, pB, 32 .endm @@ -227,19 +223,15 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ ld1rw z11.s, p0/z, [pB, 12] fmla z24.s, p0/m, z0.s, z12.s fmla z25.s, p0/m, z1.s, z12.s - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] ld1rw z12.s, p0/z, [pB, 16] fmla z26.s, p0/m, z0.s, z13.s fmla z27.s, p0/m, z1.s, z13.s - prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] ld1rw z13.s, p0/z, [pB, 20] fmla z28.s, p0/m, z0.s, z14.s fmla z29.s, p0/m, z1.s, z14.s ld1rw z14.s, p0/z, [pB, 24] fmla z30.s, p0/m, z0.s, z15.s - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] fmla z31.s, p0/m, z1.s, z15.s - prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] ld1rw z15.s, p0/z, [pB, 28] add pB, pB, 32 @@ -265,7 +257,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. 
*/ ld1rw z11.s, p0/z, [pB, 12] fmla z24.s, p0/m, z2.s, z12.s fmla z25.s, p0/m, z3.s, z12.s - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] ld1rw z12.s, p0/z, [pB, 16] fmla z26.s, p0/m, z2.s, z13.s fmla z27.s, p0/m, z3.s, z13.s @@ -291,7 +282,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ fmla z23.s, p0/m, z3.s, z11.s fmla z24.s, p0/m, z2.s, z12.s fmla z25.s, p0/m, z3.s, z12.s - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla z26.s, p0/m, z2.s, z13.s fmla z27.s, p0/m, z3.s, z13.s fmla z28.s, p0/m, z2.s, z14.s @@ -322,25 +312,21 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ fmla z18.s, p0/m, z0.s, z9.s fmla z19.s, p0/m, z1.s, z9.s fmla z20.s, p0/m, z0.s, z10.s - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] fmla z21.s, p0/m, z1.s, z10.s fmla z22.s, p0/m, z0.s, z11.s fmla z23.s, p0/m, z1.s, z11.s fmla z24.s, p0/m, z0.s, z12.s - prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] fmla z25.s, p0/m, z1.s, z12.s fmla z26.s, p0/m, z0.s, z13.s fmla z27.s, p0/m, z1.s, z13.s fmla z28.s, p0/m, z0.s, z14.s fmla z29.s, p0/m, z1.s, z14.s - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla z30.s, p0/m, z0.s, z15.s fmla z31.s, p0/m, z1.s, z15.s .endm .macro SAVEv2x8 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC ld1w z8.s, p0/z, [pCRow0] @@ -349,7 +335,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ fmla z9.s, p0/m, z17.s, alphaZ st1w z8.s, p0, [pCRow0] st1w z9.s, p0, [pCRow0, #1, mul vl] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1w z10.s, p0/z, [pCRow1] @@ -358,7 +343,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ fmla z11.s, p0/m, z19.s, alphaZ st1w z10.s, p0, [pCRow1] st1w z11.s, p0, [pCRow1, #1, mul vl] - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1w z12.s, p0/z, [pCRow2] @@ -367,7 +351,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. 
*/ fmla z13.s, p0/m, z21.s, alphaZ st1w z12.s, p0, [pCRow2] st1w z13.s, p0, [pCRow2, #1, mul vl] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1w z14.s, p0/z, [pCRow1] @@ -376,7 +359,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ fmla z15.s, p0/m, z23.s, alphaZ st1w z14.s, p0, [pCRow1] st1w z15.s, p0, [pCRow1, #1, mul vl] - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1w z8.s, p0/z, [pCRow2] @@ -385,7 +367,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ fmla z9.s, p0/m, z25.s, alphaZ st1w z8.s, p0, [pCRow2] st1w z9.s, p0, [pCRow2, #1, mul vl] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1w z10.s, p0/z, [pCRow1] @@ -394,7 +375,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ fmla z11.s, p0/m, z27.s, alphaZ st1w z10.s, p0, [pCRow1] st1w z11.s, p0, [pCRow1, #1, mul vl] - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1w z12.s, p0/z, [pCRow2] @@ -403,7 +383,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ fmla z13.s, p0/m, z29.s, alphaZ st1w z12.s, p0, [pCRow2] st1w z13.s, p0, [pCRow2, #1, mul vl] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld1w z14.s, p0/z, [pCRow1] ld1w z15.s, p0/z, [pCRow1, #1, mul vl] @@ -443,10 +422,8 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ fmla z16.s, p0/m, z0.s, z8.s fmla z17.s, p0/m, z1.s, z8.s fmla z18.s, p0/m, z0.s, z9.s - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] fmla z19.s, p0/m, z1.s, z9.s fmla z20.s, p0/m, z0.s, z10.s - prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] fmla z21.s, p0/m, z1.s, z10.s fmla z22.s, p0/m, z0.s, z11.s fmla z23.s, p0/m, z1.s, z11.s @@ -454,7 +431,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. 
*/ .macro SAVEv2x4 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC ld1w z8.s, p0/z, [pCRow0] @@ -463,7 +439,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ fmla z9.s, p0/m, z17.s, alphaZ st1w z8.s, p0, [pCRow0] st1w z9.s, p0, [pCRow0, #1, mul vl] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1w z10.s, p0/z, [pCRow1] @@ -472,7 +447,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ fmla z11.s, p0/m, z19.s, alphaZ st1w z10.s, p0, [pCRow1] st1w z11.s, p0, [pCRow1, #1, mul vl] - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1w z12.s, p0/z, [pCRow2] @@ -481,7 +455,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ fmla z13.s, p0/m, z21.s, alphaZ st1w z12.s, p0, [pCRow2] st1w z13.s, p0, [pCRow2, #1, mul vl] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld1w z14.s, p0/z, [pCRow1] ld1w z15.s, p0/z, [pCRow1, #1, mul vl] @@ -514,15 +487,12 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ fmla z16.s, p0/m, z0.s, z8.s fmla z17.s, p0/m, z1.s, z8.s - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] fmla z18.s, p0/m, z0.s, z9.s fmla z19.s, p0/m, z1.s, z9.s - prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] .endm .macro SAVEv2x2 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC ld1w z8.s, p0/z, [pCRow0] @@ -531,7 +501,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ fmla z9.s, p0/m, z17.s, alphaZ st1w z8.s, p0, [pCRow0] st1w z9.s, p0, [pCRow0, #1, mul vl] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld1w z10.s, p0/z, [pCRow1] ld1w z11.s, p0/z, [pCRow1, #1, mul vl] @@ -539,7 +508,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ fmla z11.s, p0/m, z19.s, alphaZ st1w z10.s, p0, [pCRow1] st1w z11.s, p0, [pCRow1, #1, mul vl] - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] @@ -563,12 +531,10 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. 
*/ fmla z16.s, p0/m, z0.s, z8.s fmla z17.s, p0/m, z1.s, z8.s - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] .endm .macro SAVEv2x1 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC ld1w z8.s, p0/z, [pCRow0] @@ -618,14 +584,12 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ fmla z19.s, p1/m, z0.s, z11.s ld1rw z11.s, p0/z, [pB, 12] fmla z20.s, p1/m, z0.s, z12.s - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] ld1rw z12.s, p0/z, [pB, 16] fmla z21.s, p1/m, z0.s, z13.s ld1rw z13.s, p0/z, [pB, 20] fmla z22.s, p1/m, z0.s, z14.s ld1rw z14.s, p0/z, [pB, 24] fmla z23.s, p1/m, z0.s, z15.s - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] ld1rw z15.s, p0/z, [pB, 28] add pB, pB, 32 @@ -644,14 +608,12 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ fmla z19.s, p1/m, z0.s, z11.s ld1rw z11.s, p0/z, [pB, 12] fmla z20.s, p1/m, z0.s, z12.s - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] ld1rw z12.s, p0/z, [pB, 16] fmla z21.s, p1/m, z0.s, z13.s ld1rw z13.s, p0/z, [pB, 20] fmla z22.s, p1/m, z0.s, z14.s ld1rw z14.s, p0/z, [pB, 24] fmla z23.s, p1/m, z0.s, z15.s - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] ld1rw z15.s, p0/z, [pB, 28] add pB, pB, 32 @@ -671,7 +633,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ ld1rw z11.s, p0/z, [pB, 12] fmla z20.s, p1/m, z1.s, z12.s ld1rw z12.s, p0/z, [pB, 16] - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla z21.s, p1/m, z1.s, z13.s ld1rw z13.s, p0/z, [pB, 20] fmla z22.s, p1/m, z1.s, z14.s @@ -688,7 +649,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ fmla z18.s, p1/m, z1.s, z10.s fmla z19.s, p1/m, z1.s, z11.s fmla z20.s, p1/m, z1.s, z12.s - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla z21.s, p1/m, z1.s, z13.s fmla z22.s, p1/m, z1.s, z14.s fmla z23.s, p1/m, z1.s, z15.s @@ -712,11 +672,9 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. 
*/ fmla z16.s, p1/m, z0.s, z8.s fmla z17.s, p1/m, z0.s, z9.s fmla z18.s, p1/m, z0.s, z10.s - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] fmla z19.s, p1/m, z0.s, z11.s fmla z20.s, p1/m, z0.s, z12.s fmla z21.s, p1/m, z0.s, z13.s - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla z22.s, p1/m, z0.s, z14.s fmla z23.s, p1/m, z0.s, z15.s @@ -725,49 +683,41 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ .macro SAVEv1x8 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC ld1w z24.s, p1/z, [pCRow0] fmla z24.s, p1/m, z16.s, alphaZ st1w z24.s, p1, [pCRow0] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1w z25.s, p1/z, [pCRow1] fmla z25.s, p1/m, z17.s, alphaZ st1w z25.s, p1, [pCRow1] - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1w z26.s, p1/z, [pCRow2] fmla z26.s, p1/m, z18.s, alphaZ st1w z26.s, p1, [pCRow2] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1w z27.s, p1/z, [pCRow1] fmla z27.s, p1/m, z19.s, alphaZ st1w z27.s, p1, [pCRow1] - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1w z28.s, p1/z, [pCRow2] fmla z28.s, p1/m, z20.s, alphaZ st1w z28.s, p1, [pCRow2] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1w z29.s, p1/z, [pCRow1] fmla z29.s, p1/m, z21.s, alphaZ st1w z29.s, p1, [pCRow1] - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1w z30.s, p1/z, [pCRow2] fmla z30.s, p1/m, z22.s, alphaZ st1w z30.s, p1, [pCRow2] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld1w z31.s, p1/z, [pCRow1] fmla z31.s, p1/m, z23.s, alphaZ @@ -799,7 +749,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ fmla z16.s, p1/m, z0.s, z8.s fmla z17.s, p1/m, z0.s, z9.s - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] fmla z18.s, p1/m, z0.s, z10.s fmla z19.s, p1/m, z0.s, z11.s @@ -807,25 +756,21 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. 
*/ .macro SAVEv1x4 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC ld1w z24.s, p1/z, [pCRow0] fmla z24.s, p1/m, z16.s, alphaZ st1w z24.s, p1, [pCRow0] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1w z25.s, p1/z, [pCRow1] fmla z25.s, p1/m, z17.s, alphaZ st1w z25.s, p1, [pCRow1] - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1w z26.s, p1/z, [pCRow2] fmla z26.s, p1/m, z18.s, alphaZ st1w z26.s, p1, [pCRow2] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld1w z27.s, p1/z, [pCRow1] fmla z27.s, p1/m, z19.s, alphaZ @@ -852,20 +797,17 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ add pB, pB, 8 fmla z16.s, p1/m, z0.s, z8.s - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] fmla z17.s, p1/m, z0.s, z9.s .endm .macro SAVEv1x2 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC ld1w z24.s, p1/z, [pCRow0] fmla z24.s, p1/m, z16.s, alphaZ st1w z24.s, p1, [pCRow0] - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld1w z25.s, p1/z, [pCRow1] fmla z25.s, p1/m, z17.s, alphaZ @@ -890,13 +832,11 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ add pB, pB, 4 fmla z16.s, p1/m, z0.s, z8.s - prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] .endm .macro SAVEv1x1 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] ld1w z24.s, p1/z, [pCRow0] fmla z24.s, p1/m, z16.s, alphaZ @@ -928,8 +868,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - prfm PLDL1KEEP, [origPB] - prfm PLDL1KEEP, [origPA] fmov alpha, s0 dup alphaZ, alpha @@ -968,7 +906,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. 
*/ /* Until we have at least 2*SVE_LEN iters left in M, we do them with V2*8 kernel */ mul temp, vec_len, origK // generate address of pA2 add pA2, pA1, temp, lsl #2 // pA1 = start of A array - prfm PLDL1KEEP, [pA2] .align 5 .Lsgemm_kernel_L8_Mv2_20: @@ -1057,11 +994,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ bne .Lsgemm_kernel_L8_Mv2_46 .Lsgemm_kernel_L8_Mv2_100: - prfm PLDL1KEEP, [pA1] - prfm PLDL1KEEP, [pA1, #64] - prfm PLDL1KEEP, [pA2] - prfm PLDL1KEEP, [pA2, #64] - prfm PLDL1KEEP, [origPB] SAVEv2x8 mov pA1, pA2 // pA1 = pA2 @@ -1171,9 +1103,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ bne .Lsgemm_kernel_L8_Mv1_46 .Lsgemm_kernel_L8_Mv1_100: - prfm PLDL1KEEP, [pA1] - prfm PLDL1KEEP, [pA1, #64] - prfm PLDL1KEEP, [origPB] SAVEv1x8 @@ -1233,16 +1162,12 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ .align 5 .Lsgemm_kernel_L4_Mv2_22: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv2x4_SUB KERNELv2x4_SUB - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv2x4_SUB KERNELv2x4_SUB - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv2x4_SUB KERNELv2x4_SUB - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv2x4_SUB KERNELv2x4_SUB @@ -1257,18 +1182,12 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ .align 5 .Lsgemm_kernel_L4_Mv2_46: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv2x4_SUB subs counterL, counterL, #1 bne .Lsgemm_kernel_L4_Mv2_46 .Lsgemm_kernel_L4_Mv2_100: - prfm PLDL1KEEP, [pA1] - prfm PLDL1KEEP, [pA1, #64] - prfm PLDL1KEEP, [pA2] - prfm PLDL1KEEP, [pA2, #64] - prfm PLDL1KEEP, [origPB] SAVEv2x4 mov pA1, pA2 // pA1 = pA2 @@ -1304,16 +1223,12 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. 
*/ .align 5 .Lsgemm_kernel_L4_Mv1_22: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x4_SUB KERNELv1x4_SUB - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x4_SUB KERNELv1x4_SUB - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x4_SUB KERNELv1x4_SUB - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x4_SUB KERNELv1x4_SUB @@ -1328,16 +1243,12 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ .align 5 .Lsgemm_kernel_L4_Mv1_46: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x4_SUB subs counterL, counterL, #1 bne .Lsgemm_kernel_L4_Mv1_46 .Lsgemm_kernel_L4_Mv1_100: - prfm PLDL1KEEP, [pA1] - prfm PLDL1KEEP, [pA1, #64] - prfm PLDL1KEEP, [origPB] SAVEv1x4 @@ -1393,12 +1304,10 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ .align 5 .Lsgemm_kernel_L2_Mv2_22: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv2x2_SUB KERNELv2x2_SUB KERNELv2x2_SUB KERNELv2x2_SUB - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv2x2_SUB KERNELv2x2_SUB KERNELv2x2_SUB @@ -1415,18 +1324,12 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ .align 5 .Lsgemm_kernel_L2_Mv2_46: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv2x2_SUB subs counterL, counterL, #1 bne .Lsgemm_kernel_L2_Mv2_46 .Lsgemm_kernel_L2_Mv2_100: - prfm PLDL1KEEP, [pA1] - prfm PLDL1KEEP, [pA1, #64] - prfm PLDL1KEEP, [pA2] - prfm PLDL1KEEP, [pA2, #64] - prfm PLDL1KEEP, [origPB] SAVEv2x2 mov pA1, pA2 // pA1 = pA2 @@ -1463,12 +1366,10 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ .align 5 .Lsgemm_kernel_L2_Mv1_22: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x2_SUB KERNELv1x2_SUB KERNELv1x2_SUB KERNELv1x2_SUB - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x2_SUB KERNELv1x2_SUB KERNELv1x2_SUB @@ -1485,16 +1386,12 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. 
*/ .align 5 .Lsgemm_kernel_L2_Mv1_46: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x2_SUB subs counterL, counterL, #1 bne .Lsgemm_kernel_L2_Mv1_46 .Lsgemm_kernel_L2_Mv1_100: - prfm PLDL1KEEP, [pA1] - prfm PLDL1KEEP, [pA1, #64] - prfm PLDL1KEEP, [origPB] SAVEv1x2 @@ -1550,7 +1447,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ .align 5 .Lsgemm_kernel_L1_Mv2_22: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv2x1_SUB KERNELv2x1_SUB KERNELv2x1_SUB @@ -1571,16 +1467,12 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ .align 5 .Lsgemm_kernel_L1_Mv2_46: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv2x1_SUB subs counterL, counterL, #1 bgt .Lsgemm_kernel_L1_Mv2_46 .Lsgemm_kernel_L1_Mv2_100: - prfm PLDL1KEEP, [pA1] - prfm PLDL1KEEP, [pA1, #64] - prfm PLDL1KEEP, [origPB] SAVEv2x1 mov pA1, pA2 // pA1 = pA2 @@ -1617,7 +1509,6 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ .align 5 .Lsgemm_kernel_L1_Mv1_22: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x1_SUB KERNELv1x1_SUB KERNELv1x1_SUB @@ -1638,16 +1529,12 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ .align 5 .Lsgemm_kernel_L1_Mv1_46: - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x1_SUB subs counterL, counterL, #1 bgt .Lsgemm_kernel_L1_Mv1_46 .Lsgemm_kernel_L1_Mv1_100: - prfm PLDL1KEEP, [pA1] - prfm PLDL1KEEP, [pA1, #64] - prfm PLDL1KEEP, [origPB] SAVEv1x1 diff --git a/kernel/arm64/sgemm_ncopy_4.S b/kernel/arm64/sgemm_ncopy_4.S index 30450cc7d1..c819ee6fb1 100644 --- a/kernel/arm64/sgemm_ncopy_4.S +++ b/kernel/arm64/sgemm_ncopy_4.S @@ -1,333 +1,333 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. 
Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define M x0 -#define N x1 -#define A00 x2 -#define LDA x3 -#define B00 x4 - -#define A01 x5 -#define A02 x6 -#define A03 x7 -#define A04 x8 - -#define I x9 -#define J x10 - -#define TEMP1 x11 -#define TEMP2 x12 - -#define A_PREFETCH 2560 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ - -.macro SAVE_REGS - add sp, sp, #-(11 * 16) - stp d8, d9, [sp, #(0 * 16)] - stp d10, d11, [sp, #(1 * 16)] - stp d12, d13, [sp, #(2 * 16)] - stp d14, d15, [sp, #(3 * 16)] - stp d16, d17, [sp, #(4 * 16)] - stp x18, x19, [sp, #(5 * 16)] - stp x20, x21, [sp, #(6 * 16)] - stp x22, x23, [sp, #(7 * 16)] - stp x24, x25, [sp, #(8 * 16)] - stp x26, x27, [sp, #(9 * 16)] - str x28, [sp, #(10 * 16)] -.endm - -.macro RESTORE_REGS - ldp d8, d9, [sp, #(0 * 16)] - ldp d10, d11, [sp, #(1 * 16)] - ldp d12, d13, [sp, #(2 * 16)] - ldp d14, d15, [sp, #(3 * 16)] - ldp d16, d17, [sp, #(4 * 16)] - ldp x18, x19, [sp, #(5 * 16)] - ldp x20, x21, [sp, #(6 * 16)] - ldp x22, x23, [sp, #(7 * 16)] - ldp x24, x25, [sp, #(8 * 16)] - ldp x26, x27, [sp, #(9 * 16)] - ldr x28, [sp, #(10 * 16)] - add sp, sp, #(11*16) -.endm - -.macro COPY4x4 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr q0, [A01], #16 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v10.s[0], v0.s[2] - ins v11.s[0], v0.s[3] - - ldr q1, [A02], #16 - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ins v10.s[1], v1.s[2] - ins v11.s[1], v1.s[3] - - ldr q2, [A03], #16 - ins v8.s[2], v2.s[0] - ins v9.s[2], v2.s[1] - ins v10.s[2], v2.s[2] - ins v11.s[2], v2.s[3] - - ldr q3, [A04], #16 - ins v8.s[3], v3.s[0] - ins v9.s[3], v3.s[1] - ins v10.s[3], v3.s[2] - ins v11.s[3], v3.s[3] - 
- st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] - add B00, B00, #64 - -.endm - -.macro COPY1x4 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr s0, [A01], #4 - ldr s1, [A02], #4 - ldr s2, [A03], #4 - ldr s3, [A04], #4 - - stp s0, s1, [B00] - add B00, B00, #8 - stp s2, s3, [B00] - add B00, B00, #8 -.endm - -.macro COPY4x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr q0, [A01], #16 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v10.s[0], v0.s[2] - ins v11.s[0], v0.s[3] - - ldr q1, [A02], #16 - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ins v10.s[1], v1.s[2] - ins v11.s[1], v1.s[3] - - st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] - add B00, B00, #32 -.endm - - -.macro COPY1x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr s0, [A01], #4 - ldr s1, [A02], #4 - - stp s0, s1, [B00] - add B00, B00, #8 -.endm - -.macro COPY4x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr q0, [A01], #16 - str q0, [B00], #16 -.endm - - -.macro COPY1x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr s0, [A01], #4 - str s0, [B00], #4 -.endm - -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - - PROLOGUE - - .align 5 - - SAVE_REGS - - lsl LDA, LDA, #2 // LDA = LDA * SIZE - -.Ldgemm_ncopy_L4_BEGIN: - - asr J, N, #2 // J = N / 4 - cmp J, #0 - ble .Ldgemm_ncopy_L2_BEGIN - - .align 5 -.Ldgemm_ncopy_L4_M4_BEGIN: - - mov A01, A00 - add A02, A01, LDA - add A03, A02, LDA - add A04, A03, LDA - add A00, A04, LDA - - asr I, M, #2 // I = M / 4 - cmp I, #0 - ble .Ldgemm_ncopy_L4_M4_40 - - .align 5 -.Ldgemm_ncopy_L4_M4_20: - - COPY4x4 - - subs I , I , #1 - bne .Ldgemm_ncopy_L4_M4_20 - -.Ldgemm_ncopy_L4_M4_40: - - and I, M , #3 - cmp I, #0 - ble .Ldgemm_ncopy_L4_M4_END - - .align 5 
-.Ldgemm_ncopy_L4_M4_60: - - COPY1x4 - - subs I , I , #1 - bne .Ldgemm_ncopy_L4_M4_60 - -.Ldgemm_ncopy_L4_M4_END: - - subs J , J, #1 // j-- - bne .Ldgemm_ncopy_L4_M4_BEGIN - -/*********************************************************************************************/ - -.Ldgemm_ncopy_L2_BEGIN: - - tst N, #3 - ble .Ldgemm_ncopy_L999 - - tst N, #2 - ble .Ldgemm_ncopy_L1_BEGIN - -.Ldgemm_ncopy_L2_M4_BEGIN: - mov A01, A00 - add A02, A01, LDA - add A00, A02, LDA - - asr I, M, #2 // I = M / 4 - cmp I, #0 - ble .Ldgemm_ncopy_L2_M4_40 - - .align 5 -.Ldgemm_ncopy_L2_M4_20: - - COPY4x2 - - subs I , I , #1 - bne .Ldgemm_ncopy_L2_M4_20 - -.Ldgemm_ncopy_L2_M4_40: - - and I, M , #3 - cmp I, #0 - ble .Ldgemm_ncopy_L2_M4_END - - .align 5 -.Ldgemm_ncopy_L2_M4_60: - - COPY1x2 - - subs I , I , #1 - bne .Ldgemm_ncopy_L2_M4_60 - -.Ldgemm_ncopy_L2_M4_END: - - -/*********************************************************************************************/ - -.Ldgemm_ncopy_L1_BEGIN: - - tst N, #1 - ble .Ldgemm_ncopy_L999 - -.Ldgemm_ncopy_L1_M4_BEGIN: - - mov A01, A00 - - asr I, M, #2 // I = M / 4 - cmp I, #0 - ble .Ldgemm_ncopy_L1_M4_40 - - .align 5 -.Ldgemm_ncopy_L1_M4_20: - - COPY4x1 - - subs I , I , #1 - bne .Ldgemm_ncopy_L1_M4_20 - - -.Ldgemm_ncopy_L1_M4_40: - - and I, M , #3 - cmp I, #0 - ble .Ldgemm_ncopy_L1_M4_END - - .align 5 -.Ldgemm_ncopy_L1_M4_60: - - COPY1x1 - - subs I , I , #1 - bne .Ldgemm_ncopy_L1_M4_60 - - -.Ldgemm_ncopy_L1_M4_END: - -.Ldgemm_ncopy_L999: - - mov x0, #0 - RESTORE_REGS - ret - - EPILOGUE - +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A00 x2 +#define LDA x3 +#define B00 x4 + +#define A01 x5 +#define A02 x6 +#define A03 x7 +#define A04 x8 + +#define I x9 +#define J x10 + +#define TEMP1 x11 +#define TEMP2 x12 + +#define A_PREFETCH 2560 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +.macro COPY4x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr q0, [A01], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + + ldr q1, [A02], #16 + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + ldr q2, [A03], #16 + ins v8.s[2], v2.s[0] + ins v9.s[2], v2.s[1] + ins v10.s[2], v2.s[2] + ins v11.s[2], v2.s[3] + + ldr q3, [A04], #16 + ins v8.s[3], v3.s[0] + ins v9.s[3], v3.s[1] + ins v10.s[3], v3.s[2] + ins v11.s[3], v3.s[3] + 
+ st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] + add B00, B00, #64 + +.endm + +.macro COPY1x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr s0, [A01], #4 + ldr s1, [A02], #4 + ldr s2, [A03], #4 + ldr s3, [A04], #4 + + stp s0, s1, [B00] + add B00, B00, #8 + stp s2, s3, [B00] + add B00, B00, #8 +.endm + +.macro COPY4x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr q0, [A01], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + + ldr q1, [A02], #16 + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] + add B00, B00, #32 +.endm + + +.macro COPY1x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr s0, [A01], #4 + ldr s1, [A02], #4 + + stp s0, s1, [B00] + add B00, B00, #8 +.endm + +.macro COPY4x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr q0, [A01], #16 + str q0, [B00], #16 +.endm + + +.macro COPY1x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr s0, [A01], #4 + str s0, [B00], #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + +.Ldgemm_ncopy_L4_BEGIN: + + asr J, N, #2 // J = N / 4 + cmp J, #0 + ble .Ldgemm_ncopy_L2_BEGIN + + .align 5 +.Ldgemm_ncopy_L4_M4_BEGIN: + + mov A01, A00 + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A00, A04, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Ldgemm_ncopy_L4_M4_40 + + .align 5 +.Ldgemm_ncopy_L4_M4_20: + + COPY4x4 + + subs I , I , #1 + bne .Ldgemm_ncopy_L4_M4_20 + +.Ldgemm_ncopy_L4_M4_40: + + and I, M , #3 + cmp I, #0 + ble .Ldgemm_ncopy_L4_M4_END + + .align 5 
+.Ldgemm_ncopy_L4_M4_60: + + COPY1x4 + + subs I , I , #1 + bne .Ldgemm_ncopy_L4_M4_60 + +.Ldgemm_ncopy_L4_M4_END: + + subs J , J, #1 // j-- + bne .Ldgemm_ncopy_L4_M4_BEGIN + +/*********************************************************************************************/ + +.Ldgemm_ncopy_L2_BEGIN: + + tst N, #3 + ble .Ldgemm_ncopy_L999 + + tst N, #2 + ble .Ldgemm_ncopy_L1_BEGIN + +.Ldgemm_ncopy_L2_M4_BEGIN: + mov A01, A00 + add A02, A01, LDA + add A00, A02, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Ldgemm_ncopy_L2_M4_40 + + .align 5 +.Ldgemm_ncopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne .Ldgemm_ncopy_L2_M4_20 + +.Ldgemm_ncopy_L2_M4_40: + + and I, M , #3 + cmp I, #0 + ble .Ldgemm_ncopy_L2_M4_END + + .align 5 +.Ldgemm_ncopy_L2_M4_60: + + COPY1x2 + + subs I , I , #1 + bne .Ldgemm_ncopy_L2_M4_60 + +.Ldgemm_ncopy_L2_M4_END: + + +/*********************************************************************************************/ + +.Ldgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble .Ldgemm_ncopy_L999 + +.Ldgemm_ncopy_L1_M4_BEGIN: + + mov A01, A00 + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Ldgemm_ncopy_L1_M4_40 + + .align 5 +.Ldgemm_ncopy_L1_M4_20: + + COPY4x1 + + subs I , I , #1 + bne .Ldgemm_ncopy_L1_M4_20 + + +.Ldgemm_ncopy_L1_M4_40: + + and I, M , #3 + cmp I, #0 + ble .Ldgemm_ncopy_L1_M4_END + + .align 5 +.Ldgemm_ncopy_L1_M4_60: + + COPY1x1 + + subs I , I , #1 + bne .Ldgemm_ncopy_L1_M4_60 + + +.Ldgemm_ncopy_L1_M4_END: + +.Ldgemm_ncopy_L999: + + mov x0, #0 + RESTORE_REGS + ret + + EPILOGUE + diff --git a/kernel/arm64/sgemm_tcopy_16.S b/kernel/arm64/sgemm_tcopy_16.S index 431f1ae2a5..3066421bb9 100644 --- a/kernel/arm64/sgemm_tcopy_16.S +++ b/kernel/arm64/sgemm_tcopy_16.S @@ -1,814 +1,814 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. 
-*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define M x0 -#define N x1 -#define A x2 -#define LDA x3 -#define B x4 - -#define M8 x5 - -#define A01 x6 -#define A02 x7 -#define A03 x8 -#define A04 x9 -#define A05 x10 -#define A06 x11 -#define A07 x12 -#define A08 x13 - -#define B01 x14 -#define B02 x15 -#define B03 x16 -#define B04 x17 -#define B00 x22 - - -#define I x21 -#define J x19 - -#define TEMP1 x20 - -#define A_PREFETCH 256 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ -.macro SAVE_REGS - add sp, sp, #-(11 * 16) - stp d8, d9, [sp, #(0 * 16)] - stp d10, d11, [sp, #(1 * 16)] - stp d12, d13, [sp, #(2 * 16)] - stp d14, d15, [sp, #(3 * 16)] - stp d16, d17, [sp, #(4 * 16)] - stp x18, x19, [sp, #(5 * 16)] - stp x20, x21, [sp, #(6 * 16)] - stp x22, x23, [sp, #(7 * 16)] - stp x24, x25, [sp, #(8 * 16)] - stp x26, x27, [sp, #(9 * 16)] - str x28, [sp, #(10 * 16)] -.endm - -.macro RESTORE_REGS - ldp d8, d9, [sp, #(0 * 16)] - ldp d10, d11, [sp, #(1 * 16)] - ldp d12, d13, [sp, #(2 * 16)] - ldp d14, d15, [sp, #(3 * 16)] - ldp d16, d17, [sp, #(4 * 16)] - ldp x18, x19, [sp, #(5 * 16)] - ldp x20, x21, [sp, #(6 * 16)] - ldp x22, x23, [sp, #(7 * 16)] - ldp x24, x25, [sp, #(8 * 16)] - ldp x26, x27, [sp, #(9 * 16)] - ldr x28, [sp, #(10 * 16)] - add sp, sp, #(11*16) -.endm - -/*************************************************************************************************************************/ - -.macro COPY16x8 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - prfm PLDL1KEEP, [A05, #A_PREFETCH] - prfm PLDL1KEEP, [A06, #A_PREFETCH] - prfm PLDL1KEEP, [A07, #A_PREFETCH] - prfm PLDL1KEEP, [A08, #A_PREFETCH] - //prfm PSTL1KEEP, [B00, M8] - - ld1 {v0.4s, 
v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add TEMP1, B00, #64 - - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] - add A02, A02, #64 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] - add A03, A03, #64 - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] - add A04, A04, #64 - - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [A05] - add A05, A05, #64 - - st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [A06] - add A06, A06, #64 - - st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [A07] - add A07, A07, #64 - - st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [A08] - add A08, A08, #64 - - st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - add B00, B00, M8 - -.endm - -.macro COPY8x8 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - prfm PLDL1KEEP, [A05, #A_PREFETCH] - prfm PLDL1KEEP, [A06, #A_PREFETCH] - prfm PLDL1KEEP, [A07, #A_PREFETCH] - prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldp q0, q1, [A01] - ldp q2, q3, [A02] - add A01, A01, #32 - add A02, A02, #32 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] - add B01, B01, #64 - - ldp q4, q5, [A03] - ldp q6, q7, [A04] - add A03, A03, #32 - add A04, A04, #32 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] - add B01, B01, #64 - - ldp q8, q9, [A05] - ldp q10, q11, [A06] - add A05, A05, #32 - add A06, A06, #32 - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B01] - add B01, B01, #64 - - ldp q12, q13, [A07] - ldp q14, q15, [A08] - add A07, A07, #32 - add A08, A08, #32 - - st1 {v12.4s, v13.4s, 
v14.4s, v15.4s}, [B01] - add B01, B01, #64 -.endm - -.macro COPY4x8 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - //prfm PLDL1KEEP, [A05, #A_PREFETCH] - //prfm PLDL1KEEP, [A06, #A_PREFETCH] - //prfm PLDL1KEEP, [A07, #A_PREFETCH] - //prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldr q0, [A01] - ldr q1, [A02] - ldr q2, [A03] - ldr q3, [A04] - add A01, A01, #16 - add A02, A02, #16 - add A03, A03, #16 - add A04, A04, #16 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] - add B02, B02, #64 - - ldr q4, [A05] - ldr q5, [A06] - ldr q6, [A07] - ldr q7, [A08] - - add A05, A05, #16 - add A06, A06, #16 - add A07, A07, #16 - add A08, A08, #16 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B02] - add B02, B02, #64 -.endm - -.macro COPY2x8 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - //prfm PLDL1KEEP, [A05, #A_PREFETCH] - //prfm PLDL1KEEP, [A06, #A_PREFETCH] - //prfm PLDL1KEEP, [A07, #A_PREFETCH] - //prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldr d0, [A01] - ldr d1, [A02] - ldr d2, [A03] - ldr d3, [A04] - - add A01, A01, #8 - add A02, A02, #8 - add A03, A03, #8 - add A04, A04, #8 - - stp d0, d1, [B03] - add B03, B03, #16 - stp d2, d3, [B03] - add B03, B03, #16 - - ldr d4, [A05] - ldr d5, [A06] - ldr d6, [A07] - ldr d7, [A08] - - add A05, A05, #8 - add A06, A06, #8 - add A07, A07, #8 - add A08, A08, #8 - - stp d4, d5, [B03] - add B03, B03, #16 - stp d6, d7, [B03] - add B03, B03, #16 - -.endm - -.macro COPY1x8 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - //prfm PLDL1KEEP, [A05, #A_PREFETCH] - //prfm PLDL1KEEP, [A06, #A_PREFETCH] - //prfm PLDL1KEEP, [A07, #A_PREFETCH] - //prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldr s0, [A01] - ldr s1, [A02] - ldr s2, [A03] - ldr s3, [A04] - - 
stp s0, s1, [B04] - add B04, B04, #8 - stp s2, s3, [B04] - add B04, B04, #8 - - ldr s4, [A05] - ldr s5, [A06] - ldr s6, [A07] - ldr s7, [A08] - - stp s4, s5, [B04] - add B04, B04, #8 - stp s6, s7, [B04] - add B04, B04, #8 - -.endm - -/*************************************************************************************************************************/ -.macro COPY16x4 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add TEMP1, B00, #64 - - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] - add A02, A02, #64 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] - add A03, A03, #64 - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] - add A04, A04, #64 - - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] - - add B00, B00, M8 -.endm - -.macro COPY8x4 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldp q0, q1, [A01] - ldp q2, q3, [A02] - add A01, A01, #32 - add A02, A02, #32 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] - add B01, B01, #64 - - ldp q4, q5, [A03] - ldp q6, q7, [A04] - add A03, A03, #32 - add A04, A04, #32 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] - add B01, B01, #64 -.endm - -.macro COPY4x4 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr q0, [A01] - ldr q1, [A02] - ldr q2, [A03] - ldr q3, [A04] - add A01, A01, #16 - add A02, A02, #16 - add A03, A03, #16 - add A04, A04, #16 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] - - add B02, B02, #64 -.endm - -.macro COPY2x4 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm 
PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr d0, [A01] - ldr d1, [A02] - ldr d2, [A03] - ldr d3, [A04] - - add A01, A01, #8 - add A02, A02, #8 - add A03, A03, #8 - add A04, A04, #8 - - stp d0, d1, [B03] - add B03, B03, #16 - stp d2, d3, [B03] - - add B03, B03, #16 -.endm - -.macro COPY1x4 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr s0, [A01] - ldr s1, [A02] - ldr s2, [A03] - ldr s3, [A04] - - add A01, A01, #4 - add A02, A02, #4 - add A03, A03, #4 - add A04, A04, #4 - - stp s0, s1, [B04] - add B04, B04, #8 - stp s2, s3, [B04] - add B04, B04, #8 - -.endm - -/*************************************************************************************************************************/ - -.macro COPY16x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] - add A02, A02, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add TEMP1, B00, #64 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] - add B00, B00, M8 -.endm - -.macro COPY8x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ld1 {v0.4s, v1.4s}, [A01] - ld1 {v2.4s, v3.4s}, [A02] - add A01, A01, #32 - add A02, A02, #32 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] - add B01, B01, #64 -.endm - -.macro COPY4x2 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr q0, [A01] - ldr q1, [A02] - add A01, A01, #16 - add A02, A02, #16 - - stp q0, q1, [B02] - add B02, B02, #32 -.endm - -.macro COPY2x2 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr d0, [A01] - ldr d1, [A02] - - add A01, A01, #8 - add A02, A02, #8 - - stp d0, d1, [B03] - add B03, B03, #16 -.endm - -.macro COPY1x2 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm 
PLDL1KEEP, [A02, #A_PREFETCH] - - ldr s0, [A01] - ldr s1, [A02] - - add A01, A01, #4 - add A02, A02, #4 - - stp s0, s1, [B04] - - add B04, B04, #8 -.endm - -/*************************************************************************************************************************/ - -.macro COPY16x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add B00, B00, M8 -.endm - -.macro COPY8x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldp q0, q1, [A01] - add A01, A01, #32 - stp q0, q1, [B01] - - add B01, B01, #32 -.endm - -.macro COPY4x1 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr q0, [A01] - add A01, A01, #16 - str q0, [B02] - - add B02, B02, #16 -.endm - -.macro COPY2x1 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr d0, [A01] - add A01, A01, #8 - str d0, [B03] - - add B03, B03, #8 -.endm - -.macro COPY1x1 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr s0, [A01] - add A01, A01, #4 - str s0, [B04] - - add B04, B04, #4 -.endm - -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - - PROLOGUE - - .align 5 - - SAVE_REGS - - lsl LDA, LDA, #2 // LDA = LDA * SIZE - - lsl TEMP1, M, #2 // TEMP1 = M * SIZE - - and B01 , N , #-16 - and B02 , N , #-8 - and B03 , N , #-4 - and B04 , N , #-2 - - mul B01, B01, TEMP1 - mul B02, B02, TEMP1 - mul B03, B03, TEMP1 - mul B04, B04, TEMP1 - - add B01 , B01, B - add B02 , B02, B - add B03 , B03, B - add B04 , B04, B - - lsl M8, M, #6 // M8 = M * 16 * SIZE - -.Lsgemm_tcopy_L8_BEGIN: - asr J, M, #3 // J = M / 8 - cmp J, #0 - ble .Lsgemm_tcopy_L4_BEGIN - - .align 5 -.Lsgemm_tcopy_L8_M16_BEGIN: - - mov A01, A - add A02, A01, LDA - add A03, A02, LDA - add A04, A03, LDA - add A05, A04, LDA - add A06, A05, LDA - add A07, A06, LDA - add A08, A07, LDA - add A, A08, LDA - - mov B00, B - add B, B00, #512 
// B = B + 8 * 16 * SIZE - - asr I, N, #4 // I = N / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L8_M16_40 - - .align 5 -.Lsgemm_tcopy_L8_M16_20: - - COPY16x8 - - subs I , I , #1 - bne .Lsgemm_tcopy_L8_M16_20 - -.Lsgemm_tcopy_L8_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L8_M16_60 - - COPY8x8 - -.Lsgemm_tcopy_L8_M16_60: - tst N , #4 - ble .Lsgemm_tcopy_L8_M16_80 - - COPY4x8 - -.Lsgemm_tcopy_L8_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L8_M16_100 - - COPY2x8 - -.Lsgemm_tcopy_L8_M16_100: - - tst N, #1 - ble .Lsgemm_tcopy_L8_M16_END - - COPY1x8 - -.Lsgemm_tcopy_L8_M16_END: - - subs J , J, #1 // j-- - bne .Lsgemm_tcopy_L8_M16_BEGIN - -/*********************************************************************************************/ - -.Lsgemm_tcopy_L4_BEGIN: - tst M, #7 - ble .Lsgemm_tcopy_L999 - - tst M, #4 - ble .Lsgemm_tcopy_L2_BEGIN - -.Lsgemm_tcopy_L4_M16_BEGIN: - - mov A01, A - add A02, A01, LDA - add A03, A02, LDA - add A04, A03, LDA - add A, A04, LDA - - mov B00, B - add B, B00, #256 // B = B + 4 * 16 * SIZE - - asr I, N, #4 // I = N / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L4_M16_40 - - .align 5 -.Lsgemm_tcopy_L4_M16_20: - - COPY16x4 - - subs I , I , #1 - bne .Lsgemm_tcopy_L4_M16_20 - -.Lsgemm_tcopy_L4_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L4_M16_60 - - COPY8x4 - -.Lsgemm_tcopy_L4_M16_60: - tst N , #4 - ble .Lsgemm_tcopy_L4_M16_80 - - COPY4x4 - -.Lsgemm_tcopy_L4_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L4_M16_100 - - COPY2x4 - - -.Lsgemm_tcopy_L4_M16_100: - - tst N, #1 - ble .Lsgemm_tcopy_L4_M16_END - - COPY1x4 - - -.Lsgemm_tcopy_L4_M16_END: - -/*********************************************************************************************/ - -.Lsgemm_tcopy_L2_BEGIN: - - tst M, #3 - ble .Lsgemm_tcopy_L999 - - tst M, #2 - ble .Lsgemm_tcopy_L1_BEGIN - -.Lsgemm_tcopy_L2_M16_BEGIN: - mov A01, A - add A02, A01, LDA - add A, A02, LDA - - mov B00, B - add B, B00, #128 // B = B + 2 * 16 * SIZE - - asr I, N, #4 // I = N / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L2_M16_40 - - .align 5 
-.Lsgemm_tcopy_L2_M16_20: - - COPY16x2 - - subs I , I , #1 - bne .Lsgemm_tcopy_L2_M16_20 - -.Lsgemm_tcopy_L2_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L2_M16_60 - - COPY8x2 - -.Lsgemm_tcopy_L2_M16_60: - tst N , #4 - ble .Lsgemm_tcopy_L2_M16_80 - - COPY4x2 - -.Lsgemm_tcopy_L2_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L2_M16_100 - - COPY2x2 - -.Lsgemm_tcopy_L2_M16_100: - - tst N , #1 - ble .Lsgemm_tcopy_L2_M16_END - - COPY1x2 - -.Lsgemm_tcopy_L2_M16_END: - -/*********************************************************************************************/ - -.Lsgemm_tcopy_L1_BEGIN: - - tst M, #1 - ble .Lsgemm_tcopy_L999 - - -.Lsgemm_tcopy_L1_M16_BEGIN: - - mov A01, A // A01 = A - mov B00, B - - asr I, N, #4 // I = M / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L1_M16_40 - - .align 5 -.Lsgemm_tcopy_L1_M16_20: - - COPY16x1 - - subs I , I , #1 - bne .Lsgemm_tcopy_L1_M16_20 - -.Lsgemm_tcopy_L1_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L1_M16_60 - - COPY8x1 - -.Lsgemm_tcopy_L1_M16_60: - tst N , #4 - ble .Lsgemm_tcopy_L1_M16_80 - - COPY4x1 - -.Lsgemm_tcopy_L1_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L1_M16_100 - - COPY2x1 - -.Lsgemm_tcopy_L1_M16_100: - - tst N , #1 - ble .Lsgemm_tcopy_L1_M16_END - - COPY1x1 - - -.Lsgemm_tcopy_L1_M16_END: - -.Lsgemm_tcopy_L999: - mov x0, #0 // set return value - RESTORE_REGS - ret - - EPILOGUE - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A x2 +#define LDA x3 +#define B x4 + +#define M8 x5 + +#define A01 x6 +#define A02 x7 +#define A03 x8 +#define A04 x9 +#define A05 x10 +#define A06 x11 +#define A07 x12 +#define A08 x13 + +#define B01 x14 +#define B02 x15 +#define B03 x16 +#define B04 x17 +#define B00 x22 + + +#define I x21 +#define J x19 + +#define TEMP1 x20 + +#define A_PREFETCH 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +/*************************************************************************************************************************/ + +.macro COPY16x8 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + prfm PLDL1KEEP, [A05, #A_PREFETCH] + prfm PLDL1KEEP, [A06, #A_PREFETCH] + prfm PLDL1KEEP, [A07, #A_PREFETCH] + prfm PLDL1KEEP, [A08, #A_PREFETCH] + //prfm PSTL1KEEP, [B00, M8] + + ld1 {v0.4s, 
v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] + add A02, A02, #64 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] + add A03, A03, #64 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] + add A04, A04, #64 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [A05] + add A05, A05, #64 + + st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [A06] + add A06, A06, #64 + + st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [A07] + add A07, A07, #64 + + st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [A08] + add A08, A08, #64 + + st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + add B00, B00, M8 + +.endm + +.macro COPY8x8 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + prfm PLDL1KEEP, [A05, #A_PREFETCH] + prfm PLDL1KEEP, [A06, #A_PREFETCH] + prfm PLDL1KEEP, [A07, #A_PREFETCH] + prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] + add B01, B01, #64 + + ldp q8, q9, [A05] + ldp q10, q11, [A06] + add A05, A05, #32 + add A06, A06, #32 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B01] + add B01, B01, #64 + + ldp q12, q13, [A07] + ldp q14, q15, [A08] + add A07, A07, #32 + add A08, A08, #32 + + st1 {v12.4s, v13.4s, 
v14.4s, v15.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY4x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] + add B02, B02, #64 + + ldr q4, [A05] + ldr q5, [A06] + ldr q6, [A07] + ldr q7, [A08] + + add A05, A05, #16 + add A06, A06, #16 + add A07, A07, #16 + add A08, A08, #16 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B02] + add B02, B02, #64 +.endm + +.macro COPY2x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B03] + add B03, B03, #16 + stp d2, d3, [B03] + add B03, B03, #16 + + ldr d4, [A05] + ldr d5, [A06] + ldr d6, [A07] + ldr d7, [A08] + + add A05, A05, #8 + add A06, A06, #8 + add A07, A07, #8 + add A08, A08, #8 + + stp d4, d5, [B03] + add B03, B03, #16 + stp d6, d7, [B03] + add B03, B03, #16 + +.endm + +.macro COPY1x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + 
stp s0, s1, [B04] + add B04, B04, #8 + stp s2, s3, [B04] + add B04, B04, #8 + + ldr s4, [A05] + ldr s5, [A06] + ldr s6, [A07] + ldr s7, [A08] + + stp s4, s5, [B04] + add B04, B04, #8 + stp s6, s7, [B04] + add B04, B04, #8 + +.endm + +/*************************************************************************************************************************/ +.macro COPY16x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] + add A02, A02, #64 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] + add A03, A03, #64 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] + add A04, A04, #64 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] + + add B00, B00, M8 +.endm + +.macro COPY8x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY4x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] + + add B02, B02, #64 +.endm + +.macro COPY2x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm 
PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B03] + add B03, B03, #16 + stp d2, d3, [B03] + + add B03, B03, #16 +.endm + +.macro COPY1x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + add A01, A01, #4 + add A02, A02, #4 + add A03, A03, #4 + add A04, A04, #4 + + stp s0, s1, [B04] + add B04, B04, #8 + stp s2, s3, [B04] + add B04, B04, #8 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY16x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] + add A02, A02, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add B00, B00, M8 +.endm + +.macro COPY8x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ld1 {v0.4s, v1.4s}, [A01] + ld1 {v2.4s, v3.4s}, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY4x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + add A01, A01, #16 + add A02, A02, #16 + + stp q0, q1, [B02] + add B02, B02, #32 +.endm + +.macro COPY2x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + + add A01, A01, #8 + add A02, A02, #8 + + stp d0, d1, [B03] + add B03, B03, #16 +.endm + +.macro COPY1x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm 
PLDL1KEEP, [A02, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + + add A01, A01, #4 + add A02, A02, #4 + + stp s0, s1, [B04] + + add B04, B04, #8 +.endm + +/*************************************************************************************************************************/ + +.macro COPY16x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add B00, B00, M8 +.endm + +.macro COPY8x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldp q0, q1, [A01] + add A01, A01, #32 + stp q0, q1, [B01] + + add B01, B01, #32 +.endm + +.macro COPY4x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr q0, [A01] + add A01, A01, #16 + str q0, [B02] + + add B02, B02, #16 +.endm + +.macro COPY2x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr d0, [A01] + add A01, A01, #8 + str d0, [B03] + + add B03, B03, #8 +.endm + +.macro COPY1x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr s0, [A01] + add A01, A01, #4 + str s0, [B04] + + add B04, B04, #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + + lsl TEMP1, M, #2 // TEMP1 = M * SIZE + + and B01 , N , #-16 + and B02 , N , #-8 + and B03 , N , #-4 + and B04 , N , #-2 + + mul B01, B01, TEMP1 + mul B02, B02, TEMP1 + mul B03, B03, TEMP1 + mul B04, B04, TEMP1 + + add B01 , B01, B + add B02 , B02, B + add B03 , B03, B + add B04 , B04, B + + lsl M8, M, #6 // M8 = M * 16 * SIZE + +.Lsgemm_tcopy_L8_BEGIN: + asr J, M, #3 // J = M / 8 + cmp J, #0 + ble .Lsgemm_tcopy_L4_BEGIN + + .align 5 +.Lsgemm_tcopy_L8_M16_BEGIN: + + mov A01, A + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A05, A04, LDA + add A06, A05, LDA + add A07, A06, LDA + add A08, A07, LDA + add A, A08, LDA + + mov B00, B + add B, B00, #512 
// B = B + 8 * 16 * SIZE + + asr I, N, #4 // I = N / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L8_M16_40 + + .align 5 +.Lsgemm_tcopy_L8_M16_20: + + COPY16x8 + + subs I , I , #1 + bne .Lsgemm_tcopy_L8_M16_20 + +.Lsgemm_tcopy_L8_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L8_M16_60 + + COPY8x8 + +.Lsgemm_tcopy_L8_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L8_M16_80 + + COPY4x8 + +.Lsgemm_tcopy_L8_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L8_M16_100 + + COPY2x8 + +.Lsgemm_tcopy_L8_M16_100: + + tst N, #1 + ble .Lsgemm_tcopy_L8_M16_END + + COPY1x8 + +.Lsgemm_tcopy_L8_M16_END: + + subs J , J, #1 // j-- + bne .Lsgemm_tcopy_L8_M16_BEGIN + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L4_BEGIN: + tst M, #7 + ble .Lsgemm_tcopy_L999 + + tst M, #4 + ble .Lsgemm_tcopy_L2_BEGIN + +.Lsgemm_tcopy_L4_M16_BEGIN: + + mov A01, A + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A, A04, LDA + + mov B00, B + add B, B00, #256 // B = B + 4 * 16 * SIZE + + asr I, N, #4 // I = N / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L4_M16_40 + + .align 5 +.Lsgemm_tcopy_L4_M16_20: + + COPY16x4 + + subs I , I , #1 + bne .Lsgemm_tcopy_L4_M16_20 + +.Lsgemm_tcopy_L4_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L4_M16_60 + + COPY8x4 + +.Lsgemm_tcopy_L4_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L4_M16_80 + + COPY4x4 + +.Lsgemm_tcopy_L4_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L4_M16_100 + + COPY2x4 + + +.Lsgemm_tcopy_L4_M16_100: + + tst N, #1 + ble .Lsgemm_tcopy_L4_M16_END + + COPY1x4 + + +.Lsgemm_tcopy_L4_M16_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L2_BEGIN: + + tst M, #3 + ble .Lsgemm_tcopy_L999 + + tst M, #2 + ble .Lsgemm_tcopy_L1_BEGIN + +.Lsgemm_tcopy_L2_M16_BEGIN: + mov A01, A + add A02, A01, LDA + add A, A02, LDA + + mov B00, B + add B, B00, #128 // B = B + 2 * 16 * SIZE + + asr I, N, #4 // I = N / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L2_M16_40 + + .align 5 
+.Lsgemm_tcopy_L2_M16_20: + + COPY16x2 + + subs I , I , #1 + bne .Lsgemm_tcopy_L2_M16_20 + +.Lsgemm_tcopy_L2_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L2_M16_60 + + COPY8x2 + +.Lsgemm_tcopy_L2_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L2_M16_80 + + COPY4x2 + +.Lsgemm_tcopy_L2_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L2_M16_100 + + COPY2x2 + +.Lsgemm_tcopy_L2_M16_100: + + tst N , #1 + ble .Lsgemm_tcopy_L2_M16_END + + COPY1x2 + +.Lsgemm_tcopy_L2_M16_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble .Lsgemm_tcopy_L999 + + +.Lsgemm_tcopy_L1_M16_BEGIN: + + mov A01, A // A01 = A + mov B00, B + + asr I, N, #4 // I = M / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L1_M16_40 + + .align 5 +.Lsgemm_tcopy_L1_M16_20: + + COPY16x1 + + subs I , I , #1 + bne .Lsgemm_tcopy_L1_M16_20 + +.Lsgemm_tcopy_L1_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L1_M16_60 + + COPY8x1 + +.Lsgemm_tcopy_L1_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L1_M16_80 + + COPY4x1 + +.Lsgemm_tcopy_L1_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L1_M16_100 + + COPY2x1 + +.Lsgemm_tcopy_L1_M16_100: + + tst N , #1 + ble .Lsgemm_tcopy_L1_M16_END + + COPY1x1 + + +.Lsgemm_tcopy_L1_M16_END: + +.Lsgemm_tcopy_L999: + mov x0, #0 // set return value + RESTORE_REGS + ret + + EPILOGUE + + diff --git a/kernel/generic/dot.c b/kernel/generic/dot.c index 84568ee0b8..ba7c64a9aa 100644 --- a/kernel/generic/dot.c +++ b/kernel/generic/dot.c @@ -43,7 +43,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT dot = 0.0 ; #endif - if ( n < 0 ) return(dot); + if ( n < 1 ) return(dot); if ( (inc_x == 1) && (inc_y == 1) ) { diff --git a/kernel/generic/lsame.c b/kernel/generic/lsame.c index cae8b4ae3d..83fff17982 100644 --- a/kernel/generic/lsame.c +++ b/kernel/generic/lsame.c @@ -38,7 +38,7 @@ #include -int NAME(char *A, char *B){ +int NAME(const char *A, const char *B){ char a = *A; char b = *B; diff --git 
a/kernel/mips/sdot_msa.c b/kernel/mips/sdot_msa.c index e02e10c610..8c250d401f 100644 --- a/kernel/mips/sdot_msa.c +++ b/kernel/mips/sdot_msa.c @@ -39,10 +39,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT x0, x1, x2, x3, y0, y1, y2, y3; v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; +#if defined(DSDOT) + v2f64 dvx0, dvx1, dvx2, dvx3, dvx4, dvx5, dvx6, dvx7; + v2f64 dvy0, dvy1, dvy2, dvy3, dvy4, dvy5, dvy6, dvy7; + v2f64 dot0 = {0, 0}; + v2f64 dot1 = {0, 0}; + v2f64 dot2 = {0, 0}; + v2f64 dot3 = {0, 0}; +#else v4f32 dot0 = {0, 0, 0, 0}; v4f32 dot1 = {0, 0, 0, 0}; v4f32 dot2 = {0, 0, 0, 0}; v4f32 dot3 = {0, 0, 0, 0}; +#endif if (n < 1) return (dot); @@ -83,6 +92,61 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) x_pref += 32; y_pref += 32; +#if defined(DSDOT) + /* Extend single precision to double precision */ + dvy0 = __msa_fexupr_d(vy0); + dvy1 = __msa_fexupr_d(vy1); + dvy2 = __msa_fexupr_d(vy2); + dvy3 = __msa_fexupr_d(vy3); + dvy4 = __msa_fexupr_d(vy4); + dvy5 = __msa_fexupr_d(vy5); + dvy6 = __msa_fexupr_d(vy6); + dvy7 = __msa_fexupr_d(vy7); + + vy0 = (v4f32)__msa_fexupl_d(vy0); + vy1 = (v4f32)__msa_fexupl_d(vy1); + vy2 = (v4f32)__msa_fexupl_d(vy2); + vy3 = (v4f32)__msa_fexupl_d(vy3); + vy4 = (v4f32)__msa_fexupl_d(vy4); + vy5 = (v4f32)__msa_fexupl_d(vy5); + vy6 = (v4f32)__msa_fexupl_d(vy6); + vy7 = (v4f32)__msa_fexupl_d(vy7); + + dvx0 = __msa_fexupr_d(vx0); + dvx1 = __msa_fexupr_d(vx1); + dvx2 = __msa_fexupr_d(vx2); + dvx3 = __msa_fexupr_d(vx3); + dvx4 = __msa_fexupr_d(vx4); + dvx5 = __msa_fexupr_d(vx5); + dvx6 = __msa_fexupr_d(vx6); + dvx7 = __msa_fexupr_d(vx7); + + vx0 = (v4f32)__msa_fexupl_d(vx0); + vx1 = (v4f32)__msa_fexupl_d(vx1); + vx2 = (v4f32)__msa_fexupl_d(vx2); + vx3 = (v4f32)__msa_fexupl_d(vx3); + vx4 = (v4f32)__msa_fexupl_d(vx4); + vx5 = (v4f32)__msa_fexupl_d(vx5); + vx6 = (v4f32)__msa_fexupl_d(vx6); + vx7 = (v4f32)__msa_fexupl_d(vx7); 
+ + dot0 += (dvy0 * dvx0); + dot1 += (dvy1 * dvx1); + dot2 += (dvy2 * dvx2); + dot3 += (dvy3 * dvx3); + dot0 += (dvy4 * dvx4); + dot1 += (dvy5 * dvx5); + dot2 += (dvy6 * dvx6); + dot3 += (dvy7 * dvx7); + dot0 += ((v2f64)vy0 * (v2f64)vx0); + dot1 += ((v2f64)vy1 * (v2f64)vx1); + dot2 += ((v2f64)vy2 * (v2f64)vx2); + dot3 += ((v2f64)vy3 * (v2f64)vx3); + dot0 += ((v2f64)vy4 * (v2f64)vx4); + dot1 += ((v2f64)vy5 * (v2f64)vx5); + dot2 += ((v2f64)vy6 * (v2f64)vx6); + dot3 += ((v2f64)vy7 * (v2f64)vx7); +#else dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); dot2 += (vy2 * vx2); @@ -91,6 +155,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) dot1 += (vy5 * vx5); dot2 += (vy6 * vx6); dot3 += (vy7 * vx7); +#endif } if (n & 31) @@ -100,10 +165,41 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); +#if defined(DSDOT) + dvy0 = __msa_fexupr_d(vy0); + dvy1 = __msa_fexupr_d(vy1); + dvy2 = __msa_fexupr_d(vy2); + dvy3 = __msa_fexupr_d(vy3); + + vy0 = (v4f32)__msa_fexupl_d(vy0); + vy1 = (v4f32)__msa_fexupl_d(vy1); + vy2 = (v4f32)__msa_fexupl_d(vy2); + vy3 = (v4f32)__msa_fexupl_d(vy3); + + dvx0 = __msa_fexupr_d(vx0); + dvx1 = __msa_fexupr_d(vx1); + dvx2 = __msa_fexupr_d(vx2); + dvx3 = __msa_fexupr_d(vx3); + + vx0 = (v4f32)__msa_fexupl_d(vx0); + vx1 = (v4f32)__msa_fexupl_d(vx1); + vx2 = (v4f32)__msa_fexupl_d(vx2); + vx3 = (v4f32)__msa_fexupl_d(vx3); + + dot0 += (dvy0 * dvx0); + dot1 += (dvy1 * dvx1); + dot2 += (dvy2 * dvx2); + dot3 += (dvy3 * dvx3); + dot0 += ((v2f64)vy0 * (v2f64)vx0); + dot1 += ((v2f64)vy1 * (v2f64)vx1); + dot2 += ((v2f64)vy2 * (v2f64)vx2); + dot3 += ((v2f64)vy3 * (v2f64)vx3); +#else dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); dot2 += (vy2 * vx2); dot3 += (vy3 * vx3); +#endif } if (n & 8) @@ -111,8 +207,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_SP2_INC(x, 4, vx0, vx1); LD_SP2_INC(y, 4, vy0, vy1); +#if 
defined(DSDOT) + dvy0 = __msa_fexupr_d(vy0); + dvy1 = __msa_fexupr_d(vy1); + + vy0 = (v4f32)__msa_fexupl_d(vy0); + vy1 = (v4f32)__msa_fexupl_d(vy1); + + dvx0 = __msa_fexupr_d(vx0); + dvx1 = __msa_fexupr_d(vx1); + + vx0 = (v4f32)__msa_fexupl_d(vx0); + vx1 = (v4f32)__msa_fexupl_d(vx1); + + dot0 += (dvy0 * dvx0); + dot1 += (dvy1 * dvx1); + dot0 += ((v2f64)vy0 * (v2f64)vx0); + dot1 += ((v2f64)vy1 * (v2f64)vx1); +#else dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); +#endif } if (n & 4) @@ -120,7 +235,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) vx0 = LD_SP(x); x += 4; vy0 = LD_SP(y); y += 4; +#if defined(DSDOT) + dvy0 = __msa_fexupr_d(vy0); + vy0 = (v4f32)__msa_fexupl_d(vy0); + dvx0 = __msa_fexupr_d(vx0); + vx0 = (v4f32)__msa_fexupl_d(vx0); + dot0 += (dvy0 * dvx0); + dot0 += ((v2f64)vy0 * (v2f64)vx0); +#else dot0 += (vy0 * vx0); +#endif } if (n & 2) @@ -128,8 +252,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_GP2_INC(x, 1, x0, x1); LD_GP2_INC(y, 1, y0, y1); +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); + dot += ((double)y1 * (double)x1); +#else dot += (y0 * x0); dot += (y1 * x1); +#endif } if (n & 1) @@ -137,7 +266,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) x0 = *x; y0 = *y; +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); +#else dot += (y0 * x0); +#endif } } @@ -145,8 +278,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) dot += dot0[0]; dot += dot0[1]; +#if !defined(DSDOT) dot += dot0[2]; dot += dot0[3]; +#endif } else { @@ -155,10 +290,17 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_GP4_INC(x, inc_x, x0, x1, x2, x3); LD_GP4_INC(y, inc_y, y0, y1, y2, y3); +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); + dot += ((double)y1 * (double)x1); + dot += ((double)y2 * (double)x2); + dot += ((double)y3 * (double)x3); +#else dot += (y0 * x0); dot += (y1 * x1); dot += (y2 * x2); 
dot += (y3 * x3); +#endif } if (n & 2) @@ -166,8 +308,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_GP2_INC(x, inc_x, x0, x1); LD_GP2_INC(y, inc_y, y0, y1); +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); + dot += ((double)y1 * (double)x1); +#else dot += (y0 * x0); dot += (y1 * x1); +#endif } if (n & 1) @@ -175,7 +322,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) x0 = *x; y0 = *y; +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); +#else dot += (y0 * x0); +#endif } } diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index 97ef3692c9..54939a9efe 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -42,50 +42,58 @@ endif ifndef SGEMMKERNEL SGEMMKERNEL = gemm_kernel.S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = ../generic/gemm_ncopy_2.c SGEMMITCOPY = ../generic/gemm_tcopy_2.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = ../generic/gemm_tcopy_8.c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) endif ifndef DGEMMKERNEL DGEMMKERNEL = gemm_kernel.S +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) DGEMMINCOPY = ../generic/gemm_ncopy_2.c DGEMMITCOPY = ../generic/gemm_tcopy_2.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif DGEMMONCOPY = ../generic/gemm_ncopy_8.c DGEMMOTCOPY = ../generic/gemm_tcopy_8.c -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) endif ifndef CGEMMKERNEL CGEMMKERNEL = zgemm_kernel.S +ifneq 
($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_1.c CGEMMITCOPY = ../generic/zgemm_tcopy_1.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) endif ifndef ZGEMMKERNEL ZGEMMKERNEL = zgemm_kernel.S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -ZGEMMINCOPYOBJ = zgemm_incopy.o -ZGEMMITCOPYOBJ = zgemm_itcopy.o -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) endif ifndef SGEMM_BETA diff --git a/kernel/mips64/KERNEL.MIPS64_GENERIC b/kernel/mips64/KERNEL.MIPS64_GENERIC new file mode 100644 index 0000000000..33bcbeedd5 --- /dev/null +++ b/kernel/mips64/KERNEL.MIPS64_GENERIC @@ -0,0 +1,160 @@ +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c 
+DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Pure C for other kernels +SAMAXKERNEL = ../mips/amax.c +DAMAXKERNEL = ../mips/amax.c +CAMAXKERNEL = ../mips/zamax.c +ZAMAXKERNEL = ../mips/zamax.c + +SAMINKERNEL = ../mips/amin.c +DAMINKERNEL = ../mips/amin.c +CAMINKERNEL = ../mips/zamin.c +ZAMINKERNEL = ../mips/zamin.c + +SMAXKERNEL = ../mips/max.c +DMAXKERNEL = ../mips/max.c + +SMINKERNEL = ../mips/min.c +DMINKERNEL = ../mips/min.c + +ISAMAXKERNEL = ../mips/iamax.c +IDAMAXKERNEL = ../mips/iamax.c +ICAMAXKERNEL = ../mips/izamax.c +IZAMAXKERNEL = ../mips/izamax.c + +ISAMINKERNEL = ../mips/iamin.c +IDAMINKERNEL = ../mips/iamin.c +ICAMINKERNEL = 
../mips/izamin.c +IZAMINKERNEL = ../mips/izamin.c + +ISMAXKERNEL = ../mips/imax.c +IDMAXKERNEL = ../mips/imax.c + +ISMINKERNEL = ../mips/imin.c +IDMINKERNEL = ../mips/imin.c + +SASUMKERNEL = ../mips/asum.c +DASUMKERNEL = ../mips/asum.c +CASUMKERNEL = ../mips/zasum.c +ZASUMKERNEL = ../mips/zasum.c + +SSUMKERNEL = ../mips/sum.c +DSUMKERNEL = ../mips/sum.c +CSUMKERNEL = ../mips/zsum.c +ZSUMKERNEL = ../mips/zsum.c + +SAXPYKERNEL = ../mips/axpy.c +DAXPYKERNEL = ../mips/axpy.c +CAXPYKERNEL = ../mips/zaxpy.c +ZAXPYKERNEL = ../mips/zaxpy.c + +SCOPYKERNEL = ../mips/copy.c +DCOPYKERNEL = ../mips/copy.c +CCOPYKERNEL = ../mips/zcopy.c +ZCOPYKERNEL = ../mips/zcopy.c + +SDOTKERNEL = ../mips/dot.c +DDOTKERNEL = ../mips/dot.c +CDOTKERNEL = ../mips/zdot.c +ZDOTKERNEL = ../mips/zdot.c + +SNRM2KERNEL = ../mips/nrm2.c +DNRM2KERNEL = ../mips/nrm2.c +CNRM2KERNEL = ../mips/znrm2.c +ZNRM2KERNEL = ../mips/znrm2.c + +SROTKERNEL = ../mips/rot.c +DROTKERNEL = ../mips/rot.c +CROTKERNEL = ../mips/zrot.c +ZROTKERNEL = ../mips/zrot.c + +SSCALKERNEL = ../mips/scal.c +DSCALKERNEL = ../mips/scal.c +CSCALKERNEL = ../mips/zscal.c +ZSCALKERNEL = ../mips/zscal.c + +SSWAPKERNEL = ../mips/swap.c +DSWAPKERNEL = ../mips/swap.c +CSWAPKERNEL = ../mips/zswap.c +ZSWAPKERNEL = ../mips/zswap.c + +SGEMVNKERNEL = ../mips/gemv_n.c +DGEMVNKERNEL = ../mips/gemv_n.c +CGEMVNKERNEL = ../mips/zgemv_n.c +ZGEMVNKERNEL = ../mips/zgemv_n.c + +SGEMVTKERNEL = ../mips/gemv_t.c +DGEMVTKERNEL = ../mips/gemv_t.c +CGEMVTKERNEL = ../mips/zgemv_t.c +ZGEMVTKERNEL = ../mips/zgemv_t.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +QSYMV_U_KERNEL = ../generic/symv_k.c +QSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c +XSYMV_U_KERNEL = ../generic/zsymv_k.c +XSYMV_L_KERNEL = 
../generic/zsymv_k.c + +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c diff --git a/kernel/mips64/dnrm2.S b/kernel/mips64/dnrm2.S index 0ccc781e18..cd40414a25 100644 --- a/kernel/mips64/dnrm2.S +++ b/kernel/mips64/dnrm2.S @@ -90,7 +90,7 @@ //Init INF lui TEMP, 0x7FF0 dsll TEMP, TEMP, 32 - MTC1 TEMP, INF + MTC TEMP, INF LD a1, 0 * SIZE(X) daddiu N, N, -1 diff --git a/kernel/power/cgemm_kernel_power9.S b/kernel/power/cgemm_kernel_power9.S index 4b5c2fa313..dfe17f3ef0 100644 --- a/kernel/power/cgemm_kernel_power9.S +++ b/kernel/power/cgemm_kernel_power9.S @@ -1,293 +1,293 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@gmail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - - -#define LOAD ld -#define STACKSIZE (512 ) -#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ -#define M r3 -#define N r4 -#define K r5 - - -#define A r8 -#define B r9 -#define C r10 -#define LDC r6 -#define OFFSET r7 - - -#define alpha_r vs19 -#define alpha_i vs20 -#define save_permute_1 vs21 -#define permute_mask vs22 -#define o0 0 - - -#define T1 r11 -#define T2 r12 -#define T3 r14 -#define T4 r15 -#define T5 r16 -#define T6 r17 -#define L r18 -#define T7 r19 -#define T8 r20 -#define TEMP_REG r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define T9 r27 -#define T10 r28 -#define PRE r29 - -#define T12 r30 -#define T13 r31 - -#include "cgemm_macros_power9.S" - -.equ perm_const1, 0x0405060700010203 -.equ perm_const2, 0x0c0d0e0f08090a0b -.equ save_permute_12, 0x0c0d0e0f1c1d1e1f -.equ save_permute_11, 0x0405060714151617 - - - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - - addi SP, SP, -STACKSIZE - mflr r0 - - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 
32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - std r0, FLINK_SAVE(SP) - - - - ld LDC, FRAMESLOT(0) + STACKSIZE(SP) - - - -#ifdef TRMMKERNEL - ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) -#endif - slwi LDC, LDC, ZBASE_SHIFT - - - - /*alpha is stored in f1. 
convert to single and splat*/ - xscvdpspn alpha_r,vs1 - xscvdpspn alpha_i,vs2 - xxspltw alpha_r,alpha_r,0 - xxspltw alpha_i,alpha_i,0 -/*load reverse permute mask for big endian - uint128 = 0xc0d0e0f08090a0b0405060700010203 -*/ - - lis T2, perm_const2@highest - lis T1, perm_const1@highest - lis T3, save_permute_12@highest - lis T4, save_permute_11@highest - - - ori T2, T2, perm_const2@higher - ori T1, T1, perm_const1@higher - ori T3, T3, save_permute_12@higher - ori T4, T4, save_permute_11@higher - - - rldicr T2, T2, 32, 31 - rldicr T1, T1, 32, 31 - rldicr T3, T3, 32, 31 - rldicr T4, T4, 32, 31 - - oris T2, T2, perm_const2@h - oris T1, T1, perm_const1@h - oris T3, T3, save_permute_12@h - oris T4, T4, save_permute_11@h - - - ori T2, T2, perm_const2@l - ori T1, T1, perm_const1@l - ori T3, T3, save_permute_12@l - ori T4, T4, save_permute_11@l - - - li r0,0 - li PRE,512 - -#if defined(CC) || defined(CR) || defined(RC) || defined(RR) -/*negate for this case as we will use addition -1*(a+b) */ - xvnegsp alpha_r,alpha_r - xvnegsp alpha_i,alpha_i -#endif - - mtvsrdd permute_mask,T2,T1 - mtvsrdd save_permute_1,T3,T4 - - /*mask is reverse permute so we have to make it inner permute */ - xxpermdi permute_mask, permute_mask, permute_mask,2 - -#include "cgemm_logic_power9.S" - -.L999: - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - ld r0, FLINK_SAVE(SP) - - lxv vs52, 
288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - mtlr r0 - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - - EPILOGUE -#endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@gmail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + +#define alpha_r vs19 +#define alpha_i vs20 +#define save_permute_1 vs21 +#define permute_mask vs22 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define PRE r29 + +#define T12 r30 +#define T13 r31 + +#include "cgemm_macros_power9.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_12, 0x0c0d0e0f1c1d1e1f +.equ save_permute_11, 0x0405060714151617 + + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + + addi SP, SP, -STACKSIZE + mflr r0 + + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std 
r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) + + + + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + + + +#ifdef TRMMKERNEL + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + slwi LDC, LDC, ZBASE_SHIFT + + + + /*alpha is stored in f1. convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xscvdpspn alpha_i,vs2 + xxspltw alpha_r,alpha_r,0 + xxspltw alpha_i,alpha_i,0 +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + + + ori T2, T2, perm_const2@higher + ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + + + rldicr T2, T2, 32, 31 + rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + + oris T2, T2, perm_const2@h + oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + + + ori T2, T2, perm_const2@l + ori T1, T1, perm_const1@l + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + + + li r0,0 + li PRE,512 + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegsp alpha_r,alpha_r + xvnegsp alpha_i,alpha_i +#endif + + mtvsrdd permute_mask,T2,T1 + mtvsrdd save_permute_1,T3,T4 + + /*mask is reverse permute so we have to make it inner permute */ + xxpermdi permute_mask, permute_mask, permute_mask,2 + +#include "cgemm_logic_power9.S" + +.L999: + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd 
f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + + EPILOGUE +#endif diff --git a/kernel/power/cgemm_logic_power9.S b/kernel/power/cgemm_logic_power9.S index b4f937e90b..a191219fa1 100644 --- a/kernel/power/cgemm_logic_power9.S +++ b/kernel/power/cgemm_logic_power9.S @@ -1,2816 +1,2816 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@gmail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ -#define MY_ALIGN .align 3 -b CGEMM_L4 -/* MINI SUBROUTINES */ -/* 4x8 MAIN 128x+2 LOOP */ - - -CGEMM_L4x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x8_2 - MY_ALIGN -CGEMM_L4x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 -CGEMM_L4x8_K128: -/*----------------------------------------*/ - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_L2 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL4x8_L2 128,64,8,0 - KERNEL4x8_L2 128,64,9,0 - KERNEL4x8_L2 128,64,10,0 - KERNEL4x8_L2 128,64,11,0 - dcbt BO, T4 - KERNEL4x8_L2 128,64,12,0 - KERNEL4x8_L2 128,64,13,0 - KERNEL4x8_L2 128,64,14,0 - KERNEL4x8_L2 
128,64,15,0 - KERNEL4x8_L2 128,64,16,0 - KERNEL4x8_L2 128,64,17,0 - KERNEL4x8_L2 128,64,18,0 - KERNEL4x8_L2 128,64,19,0 - KERNEL4x8_L2 128,64,20,0 - KERNEL4x8_L2 128,64,21,0 - KERNEL4x8_L2 128,64,22,0 - KERNEL4x8_L2 128,64,23,0 - KERNEL4x8_L2 128,64,24,0 - KERNEL4x8_L2 128,64,25,0 - KERNEL4x8_L2 128,64,26,0 - KERNEL4x8_L2 128,64,27,0 - KERNEL4x8_L2 128,64,28,0 - KERNEL4x8_L2 128,64,29,0 - KERNEL4x8_L2 128,64,30,0 - KERNEL4x8_L2 128,64,31,0 - KERNEL4x8_L2 128,64,32,0 - KERNEL4x8_L2 128,64,33,0 - KERNEL4x8_L2 128,64,34,0 - KERNEL4x8_L2 128,64,35,0 - KERNEL4x8_L2 128,64,36,0 - KERNEL4x8_L2 128,64,37,0 - KERNEL4x8_L2 128,64,38,0 - KERNEL4x8_L2 128,64,39,0 - KERNEL4x8_L2 128,64,40,0 - KERNEL4x8_L2 128,64,41,0 - KERNEL4x8_L2 128,64,42,0 - KERNEL4x8_L2 128,64,43,0 - KERNEL4x8_L2 128,64,44,0 - KERNEL4x8_L2 128,64,45,0 - KERNEL4x8_L2 128,64,46,0 - KERNEL4x8_L2 128,64,47,0 - KERNEL4x8_L2 128,64,48,0 - KERNEL4x8_L2 128,64,49,0 - KERNEL4x8_L2 128,64,50,0 - KERNEL4x8_L2 128,64,51,0 - KERNEL4x8_L2 128,64,52,0 - KERNEL4x8_L2 128,64,53,0 - KERNEL4x8_L2 128,64,54,0 - KERNEL4x8_L2 128,64,55,0 - KERNEL4x8_L2 128,64,56,0 - KERNEL4x8_L2 128,64,57,0 - KERNEL4x8_L2 128,64,58,0 - KERNEL4x8_L2 128,64,59,0 - KERNEL4x8_L2 128,64,60,0 - KERNEL4x8_L2 128,64,61,0 - KERNEL4x8_L2 128,64,62,0 - KERNEL4x8_L2 128,64,63,1 - bdnz CGEMM_L4x8_LOOP - MY_ALIGN -CGEMM_L4x8_LOOP_END: -/*----------------------------------------*/ - END4x8_2 - blr - MY_ALIGN - - -CGEMM_4x8_L64_SUB: -/*----------------------------------------*/ - LOAD4x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_L2 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL4x8_L2 128,64,8,0 - KERNEL4x8_L2 128,64,9,0 - KERNEL4x8_L2 128,64,10,0 - KERNEL4x8_L2 128,64,11,0 - dcbt BO, T4 - KERNEL4x8_L2 128,64,12,0 - 
KERNEL4x8_L2 128,64,13,0 - KERNEL4x8_L2 128,64,14,0 - KERNEL4x8_L2 128,64,15,0 - KERNEL4x8_L2 128,64,16,0 - KERNEL4x8_L2 128,64,17,0 - KERNEL4x8_L2 128,64,18,0 - KERNEL4x8_L2 128,64,19,0 - KERNEL4x8_L2 128,64,20,0 - KERNEL4x8_L2 128,64,21,0 - KERNEL4x8_L2 128,64,22,0 - KERNEL4x8_L2 128,64,23,0 - KERNEL4x8_L2 128,64,24,0 - KERNEL4x8_L2 128,64,25,0 - KERNEL4x8_L2 128,64,26,0 - KERNEL4x8_L2 128,64,27,0 - KERNEL4x8_L2 128,64,28,0 - KERNEL4x8_L2 128,64,29,0 - KERNEL4x8_L2 128,64,30,0 - KERNEL4x8_E2 128,64,31,1 - blr - MY_ALIGN - - -CGEMM_4x8_L32_SUB: -/*----------------------------------------*/ - LOAD4x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_L2 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL4x8_L2 128,64,8,0 - KERNEL4x8_L2 128,64,9,0 - KERNEL4x8_L2 128,64,10,0 - KERNEL4x8_L2 128,64,11,0 - dcbt BO, T4 - KERNEL4x8_L2 128,64,12,0 - KERNEL4x8_L2 128,64,13,0 - KERNEL4x8_L2 128,64,14,0 - KERNEL4x8_E2 128,64,15,1 - blr - MY_ALIGN - - -CGEMM_4x8_L16_SUB: -/*----------------------------------------*/ - LOAD4x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_E2 128,64,7,1 - blr - MY_ALIGN - - -CGEMM_4x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x4_2 - MY_ALIGN -CGEMM_L4x4_LOOP: -/*----------------------------------------*/ - KERNEL4x4_L2 64,64,0,0 -CGEMM_L4x4_K32: -/*----------------------------------------*/ - KERNEL4x4_L2 64,64,1,0 - KERNEL4x4_L2 64,64,2,0 - KERNEL4x4_L2 64,64,3,0 - KERNEL4x4_L2 64,64,4,0 - KERNEL4x4_L2 64,64,5,0 - KERNEL4x4_L2 64,64,6,0 - KERNEL4x4_L2 
64,64,7,0 - KERNEL4x4_L2 64,64,8,0 - KERNEL4x4_L2 64,64,9,0 - KERNEL4x4_L2 64,64,10,0 - KERNEL4x4_L2 64,64,11,0 - KERNEL4x4_L2 64,64,12,0 - KERNEL4x4_L2 64,64,13,0 - KERNEL4x4_L2 64,64,14,0 - KERNEL4x4_L2 64,64,15,1 - bdnz CGEMM_L4x4_LOOP - MY_ALIGN -CGEMM_L4x4_LOOP_END: -/*----------------------------------------*/ - END4x4_2 - blr - MY_ALIGN - - -CGEMM_4x4_L16_SUB: -/*----------------------------------------*/ - LOAD4x4_2 - KERNEL4x4_L2 64,64,0,0 - KERNEL4x4_L2 64,64,1,0 - KERNEL4x4_L2 64,64,2,0 - KERNEL4x4_L2 64,64,3,0 - KERNEL4x4_L2 64,64,4,0 - KERNEL4x4_L2 64,64,5,0 - KERNEL4x4_L2 64,64,6,0 - KERNEL4x4_E2 64,64,7,1 - blr - MY_ALIGN - - -CGEMM_4x4_L8_SUB: -/*----------------------------------------*/ - LOAD4x4_2 - KERNEL4x4_L2 64,64,0,0 - KERNEL4x4_L2 64,64,1,0 - KERNEL4x4_L2 64,64,2,0 - KERNEL4x4_E2 64,64,3,1 - blr - - -CGEMM_4x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x2_2 - MY_ALIGN -CGEMM_L4x2_LOOP: -/*----------------------------------------*/ - KERNEL4x2_L2 32,64,0,0 -CGEMM_L4x2_K32: -/*----------------------------------------*/ - KERNEL4x2_L2 32,64,1,0 - KERNEL4x2_L2 32,64,2,0 - KERNEL4x2_L2 32,64,3,0 - KERNEL4x2_L2 32,64,4,0 - KERNEL4x2_L2 32,64,5,0 - KERNEL4x2_L2 32,64,6,0 - KERNEL4x2_L2 32,64,7,0 - KERNEL4x2_L2 32,64,8,0 - KERNEL4x2_L2 32,64,9,0 - KERNEL4x2_L2 32,64,10,0 - KERNEL4x2_L2 32,64,11,0 - KERNEL4x2_L2 32,64,12,0 - KERNEL4x2_L2 32,64,13,0 - KERNEL4x2_L2 32,64,14,0 - KERNEL4x2_L2 32,64,15,1 - bdnz CGEMM_L4x2_LOOP - MY_ALIGN - - -CGEMM_L4x2_LOOP_END: -/*----------------------------------------*/ - END4x2_2 - blr - MY_ALIGN -CGEMM_4x2_L16_SUB: -/*----------------------------------------*/ - LOAD4x2_2 - KERNEL4x2_L2 32,64,0,0 - KERNEL4x2_L2 32,64,1,0 - KERNEL4x2_L2 32,64,2,0 - KERNEL4x2_L2 32,64,3,0 - KERNEL4x2_L2 32,64,4,0 - KERNEL4x2_L2 32,64,5,0 - KERNEL4x2_L2 32,64,6,0 - KERNEL4x2_E2 32,64,7,1 - blr - MY_ALIGN -CGEMM_4x2_L8_SUB: -/*----------------------------------------*/ - LOAD4x2_2 - KERNEL4x2_L2 
32,64,0,0 - KERNEL4x2_L2 32,64,1,0 - KERNEL4x2_L2 32,64,2,0 - KERNEL4x2_E2 32,64,3,1 - blr - - -CGEMM_4x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x1_2 - MY_ALIGN -CGEMM_L4x1_LOOP: -/*----------------------------------------*/ - KERNEL4x1_L2 16,64,0,0 -CGEMM_L4x1_K32: -/*----------------------------------------*/ - KERNEL4x1_L2 16,64,1,0 - KERNEL4x1_L2 16,64,2,0 - KERNEL4x1_L2 16,64,3,0 - KERNEL4x1_L2 16,64,4,0 - KERNEL4x1_L2 16,64,5,0 - KERNEL4x1_L2 16,64,6,0 - KERNEL4x1_L2 16,64,7,0 - KERNEL4x1_L2 16,64,8,0 - KERNEL4x1_L2 16,64,9,0 - KERNEL4x1_L2 16,64,10,0 - KERNEL4x1_L2 16,64,11,0 - KERNEL4x1_L2 16,64,12,0 - KERNEL4x1_L2 16,64,13,0 - KERNEL4x1_L2 16,64,14,0 - KERNEL4x1_L2 16,64,15,1 - bdnz CGEMM_L4x1_LOOP - MY_ALIGN -CGEMM_L4x1_LOOP_END: -/*----------------------------------------*/ - END4x1_2 - blr - - MY_ALIGN -CGEMM_4x1_L16_SUB: -/*----------------------------------------*/ - LOAD4x1_2 - KERNEL4x1_L2 16,64,0,0 - KERNEL4x1_L2 16,64,1,0 - KERNEL4x1_L2 16,64,2,0 - KERNEL4x1_L2 16,64,3,0 - KERNEL4x1_L2 16,64,4,0 - KERNEL4x1_L2 16,64,5,0 - KERNEL4x1_L2 16,64,6,0 - KERNEL4x1_E2 16,64,7,1 - blr - MY_ALIGN - - -CGEMM_4x1_L8_SUB: -/*----------------------------------------*/ - LOAD4x1_2 - KERNEL4x1_L2 16,64,0,0 - KERNEL4x1_L2 16,64,1,0 - KERNEL4x1_L2 16,64,2,0 - KERNEL4x1_E2 16,64,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -CGEMM_L4: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - srawi. J, N, 2 - ble CGEMM_L4_END - - -CGEMM_L4_BEGIN: -/*----------------------------------------*/ - mr CO, C - slwi T1, LDC , 2 - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. 
I, M, 3 - ble CGEMM_L4x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -CGEMM_L4x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T1-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO4x8 - ble CGEMM_L4x8_SUB0 - bl CGEMM_L4x8_LMAIN_SUB - andi. L, T1, 127 - ble CGEMM_L4x8_SAVE - b CGEMM_L4x8_SUB2 - - -CGEMM_L4x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP4x8_128K - addi BO,BO,-32 - addi AO,AO,-64 - LOAD4x8O 64,32 - END4x8_WITHOUT_ADD - LOAD4x8_2O 128, 64 - mtctr T8 - bl CGEMM_L4x8_K128 - b CGEMM_L4x8_SAVE - CMP4x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne CGEMM_L4x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-128 - LOAD4x8_2O 128,64 - bl CGEMM_L4x8_K128 - b CGEMM_L4x8_SAVE - MY_ALIGN - - -CGEMM_L4x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble CGEMM_L4x8_SUB2_32 - bl CGEMM_4x8_L64_SUB - MY_ALIGN - - -CGEMM_L4x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble CGEMM_L4x8_SUB2_16 - bl CGEMM_4x8_L32_SUB - MY_ALIGN - - -CGEMM_L4x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x8_SUB2_8 - bl CGEMM_4x8_L16_SUB - MY_ALIGN - - -CGEMM_L4x8_SUB2_8: -/*----------------------------------------*/ - andi. 
T1,L, 8 - ble CGEMM_L4x8_SUB2_4 - LOAD4x8_2 - KERNEL4x8_L2 128,64, 0,0 - KERNEL4x8_L2 128,64, 1,0 - KERNEL4x8_L2 128,64, 2,0 - KERNEL4x8_E2 128,64, 3,1 - MY_ALIGN - - -CGEMM_L4x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x8_SUB2_2 - LOAD4x8_2 - KERNEL4x8_L2 128,64, 0,0 - KERNEL4x8_E2 128,64, 1,1 - MY_ALIGN - - -CGEMM_L4x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x8_SUB2_1 - LOAD4x8_2 - KERNEL4x8_E2 128,64, 0,1 - MY_ALIGN - - -CGEMM_L4x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x8_SAVE - KERNEL4x8 - - MY_ALIGN -CGEMM_L4x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - MY_ALIGN - SAVE4x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 -#endif - bgt CGEMM_L4x8_BEGIN - andi. T2, M, 7 - ble CGEMM_L4x1_END - andi. T1, M, 4 - ble CGEMM_L4x4_END - b CGEMM_L4x4_BEGIN - MY_ALIGN - - -CGEMM_L4x8_END: -/*----------------------------------------*/ - - -CGEMM_L4x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble CGEMM_L4x1_END - andi. T1, M, 4 - ble CGEMM_L4x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO4x4 - ble CGEMM_L4x4_SUB0 - bl CGEMM_4x4_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L4x4_SAVE - b CGEMM_L4x4_SUB2 - - -CGEMM_L4x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. 
L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP4x4_32K - addi BO,BO,-32 - addi AO,AO,-32 - LOAD4x4O 32,32 - END4x4_WITHOUT_ADD - LOAD4x4_2O 64, 64 - mtctr T8 - bl CGEMM_L4x4_K32 - b CGEMM_L4x4_SAVE - CMP4x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L4x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-64 - LOAD4x4_2O 64,64 - bl CGEMM_L4x4_K32 - b CGEMM_L4x4_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L4x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x4_SUB2_8 - bl CGEMM_4x4_L16_SUB - MY_ALIGN - - -CGEMM_L4x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x4_SUB2_4 - bl CGEMM_4x4_L8_SUB - MY_ALIGN - - -CGEMM_L4x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x4_SUB2_2 - LOAD4x4_2 - KERNEL4x4_L2 64,64, 0,0 - KERNEL4x4_E2 64,64, 1,1 - MY_ALIGN - - -CGEMM_L4x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x4_SUB2_1 - LOAD4x4_2 - KERNEL4x4_E2 64,64, 0,1 - MY_ALIGN - - -CGEMM_L4x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x4_SAVE - KERNEL4x4 - - -CGEMM_L4x4_SAVE: -/*----------------------------------------*/ - SAVE4x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 -#endif - - -CGEMM_L4x4_END: -/*----------------------------------------*/ - - -CGEMM_L4x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble CGEMM_L4x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO4x2 - ble CGEMM_L4x2_SUB0 - bl CGEMM_4x2_LMAIN_SUB - andi. 
L, T1, 31 - ble CGEMM_L4x2_SAVE - b CGEMM_L4x2_SUB2 - - -CGEMM_L4x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP4x2_32K - addi BO,BO,-32 - addi AO,AO,-16 - LOAD4x2O 16,32 - END4x2_WITHOUT_ADD - LOAD4x2_2O 32, 64 - mtctr T8 - bl CGEMM_L4x2_K32 - b CGEMM_L4x2_SAVE - CMP4x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L4x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-32 - LOAD4x2_2O 32,64 - bl CGEMM_L4x2_K32 - b CGEMM_L4x2_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L4x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x2_SUB2_8 - bl CGEMM_4x2_L16_SUB - MY_ALIGN - - -CGEMM_L4x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x2_SUB2_4 - bl CGEMM_4x2_L8_SUB - MY_ALIGN - - -CGEMM_L4x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x2_SUB2_2 - LOAD4x2_2 - KERNEL4x2_L2 32,64, 0,0 - KERNEL4x2_E2 32,64, 1,1 - MY_ALIGN - - -CGEMM_L4x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x2_SUB2_1 - LOAD4x2_2 - KERNEL4x2_E2 32,64, 0,1 - MY_ALIGN - - -CGEMM_L4x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x2_SAVE - KERNEL4x2 - - MY_ALIGN -CGEMM_L4x2_SAVE: -/*----------------------------------------*/ - SAVE4x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 -#endif - - -CGEMM_L4x2_END: -/*----------------------------------------*/ - - -CGEMM_L4x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble CGEMM_L4x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 - mr T1, T6 - addi T1,T1, -2 - srawi. 
T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO4x1 - ble CGEMM_L4x1_SUB0 - bl CGEMM_4x1_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L4x1_SAVE - b CGEMM_L4x1_SUB2 - - -CGEMM_L4x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP4x1_32K - addi BO,BO,-32 - addi AO,AO,-8 - LOAD4x1O 8,32 - END4x1_WITHOUT_ADD - LOAD4x1_2O 16, 64 - mtctr T8 - bl CGEMM_L4x1_K32 - b CGEMM_L4x1_SAVE - CMP4x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L4x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-16 - LOAD4x1_2O 16,64 - bl CGEMM_L4x1_K32 - b CGEMM_L4x1_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L4x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x1_SUB2_8 - bl CGEMM_4x1_L16_SUB - MY_ALIGN - - -CGEMM_L4x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x1_SUB2_4 - bl CGEMM_4x1_L8_SUB - MY_ALIGN - - -CGEMM_L4x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x1_SUB2_2 - LOAD4x1_2 - KERNEL4x1_L2 16,64, 0,0 - KERNEL4x1_E2 16,64, 1,1 - MY_ALIGN - - -CGEMM_L4x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x1_SUB2_1 - LOAD4x1_2 - KERNEL4x1_E2 16,64, 0,1 - MY_ALIGN - - -CGEMM_L4x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x1_SAVE - KERNEL4x1 - - MY_ALIGN -CGEMM_L4x1_SAVE: -/*----------------------------------------*/ - - SAVE4x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 -#endif - - -CGEMM_L4x1_END: -/*----------------------------------------*/ - slwi T1, K, 5 - addic. 
J, J, -1 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 4 -#endif - bgt CGEMM_L4_BEGIN - - -CGEMM_L4_END: - -b CGEMM_L2 -/* MINI SUBROUTINES */ -/* 2x8 MAIN 128x+2 LOOP */ - - -CGEMM_L2x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x8_2 - MY_ALIGN -CGEMM_L2x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 -CGEMM_L2x8_K128: -/*----------------------------------------*/ - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_L2 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 128,32,8,0 - KERNEL2x8_L2 128,32,9,0 - KERNEL2x8_L2 128,32,10,0 - KERNEL2x8_L2 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L2 128,32,12,0 - KERNEL2x8_L2 128,32,13,0 - KERNEL2x8_L2 128,32,14,0 - KERNEL2x8_L2 128,32,15,0 - KERNEL2x8_L2 128,32,16,0 - KERNEL2x8_L2 128,32,17,0 - KERNEL2x8_L2 128,32,18,0 - KERNEL2x8_L2 128,32,19,0 - KERNEL2x8_L2 128,32,20,0 - KERNEL2x8_L2 128,32,21,0 - KERNEL2x8_L2 128,32,22,0 - KERNEL2x8_L2 128,32,23,0 - KERNEL2x8_L2 128,32,24,0 - KERNEL2x8_L2 128,32,25,0 - KERNEL2x8_L2 128,32,26,0 - KERNEL2x8_L2 128,32,27,0 - KERNEL2x8_L2 128,32,28,0 - KERNEL2x8_L2 128,32,29,0 - KERNEL2x8_L2 128,32,30,0 - KERNEL2x8_L2 128,32,31,0 - KERNEL2x8_L2 128,32,32,0 - KERNEL2x8_L2 128,32,33,0 - KERNEL2x8_L2 128,32,34,0 - KERNEL2x8_L2 128,32,35,0 - KERNEL2x8_L2 128,32,36,0 - KERNEL2x8_L2 128,32,37,0 - KERNEL2x8_L2 128,32,38,0 - KERNEL2x8_L2 128,32,39,0 - KERNEL2x8_L2 128,32,40,0 - KERNEL2x8_L2 128,32,41,0 - KERNEL2x8_L2 128,32,42,0 - KERNEL2x8_L2 128,32,43,0 - KERNEL2x8_L2 128,32,44,0 - KERNEL2x8_L2 128,32,45,0 - KERNEL2x8_L2 128,32,46,0 - KERNEL2x8_L2 128,32,47,0 - KERNEL2x8_L2 128,32,48,0 - KERNEL2x8_L2 128,32,49,0 - KERNEL2x8_L2 128,32,50,0 - KERNEL2x8_L2 128,32,51,0 - KERNEL2x8_L2 128,32,52,0 - 
KERNEL2x8_L2 128,32,53,0 - KERNEL2x8_L2 128,32,54,0 - KERNEL2x8_L2 128,32,55,0 - KERNEL2x8_L2 128,32,56,0 - KERNEL2x8_L2 128,32,57,0 - KERNEL2x8_L2 128,32,58,0 - KERNEL2x8_L2 128,32,59,0 - KERNEL2x8_L2 128,32,60,0 - KERNEL2x8_L2 128,32,61,0 - KERNEL2x8_L2 128,32,62,0 - KERNEL2x8_L2 128,32,63,1 - bdnz CGEMM_L2x8_LOOP - MY_ALIGN -CGEMM_L2x8_LOOP_END: -/*----------------------------------------*/ - END2x8_2 - blr - MY_ALIGN - - -CGEMM_2x8_L64_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_L2 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 128,32,8,0 - KERNEL2x8_L2 128,32,9,0 - KERNEL2x8_L2 128,32,10,0 - KERNEL2x8_L2 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L2 128,32,12,0 - KERNEL2x8_L2 128,32,13,0 - KERNEL2x8_L2 128,32,14,0 - KERNEL2x8_L2 128,32,15,0 - KERNEL2x8_L2 128,32,16,0 - KERNEL2x8_L2 128,32,17,0 - KERNEL2x8_L2 128,32,18,0 - KERNEL2x8_L2 128,32,19,0 - KERNEL2x8_L2 128,32,20,0 - KERNEL2x8_L2 128,32,21,0 - KERNEL2x8_L2 128,32,22,0 - KERNEL2x8_L2 128,32,23,0 - KERNEL2x8_L2 128,32,24,0 - KERNEL2x8_L2 128,32,25,0 - KERNEL2x8_L2 128,32,26,0 - KERNEL2x8_L2 128,32,27,0 - KERNEL2x8_L2 128,32,28,0 - KERNEL2x8_L2 128,32,29,0 - KERNEL2x8_L2 128,32,30,0 - KERNEL2x8_E2 128,32,31,1 - blr - MY_ALIGN - - -CGEMM_2x8_L32_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_L2 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 128,32,8,0 - KERNEL2x8_L2 128,32,9,0 - KERNEL2x8_L2 128,32,10,0 - 
KERNEL2x8_L2 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L2 128,32,12,0 - KERNEL2x8_L2 128,32,13,0 - KERNEL2x8_L2 128,32,14,0 - KERNEL2x8_E2 128,32,15,1 - blr - MY_ALIGN - - -CGEMM_2x8_L16_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_E2 128,32,7,1 - blr - MY_ALIGN - - -CGEMM_2x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x4_2 - MY_ALIGN -CGEMM_L2x4_LOOP: -/*----------------------------------------*/ - KERNEL2x4_L2 64,32,0,0 -CGEMM_L2x4_K32: -/*----------------------------------------*/ - KERNEL2x4_L2 64,32,1,0 - KERNEL2x4_L2 64,32,2,0 - KERNEL2x4_L2 64,32,3,0 - KERNEL2x4_L2 64,32,4,0 - KERNEL2x4_L2 64,32,5,0 - KERNEL2x4_L2 64,32,6,0 - KERNEL2x4_L2 64,32,7,0 - KERNEL2x4_L2 64,32,8,0 - KERNEL2x4_L2 64,32,9,0 - KERNEL2x4_L2 64,32,10,0 - KERNEL2x4_L2 64,32,11,0 - KERNEL2x4_L2 64,32,12,0 - KERNEL2x4_L2 64,32,13,0 - KERNEL2x4_L2 64,32,14,0 - KERNEL2x4_L2 64,32,15,1 - bdnz CGEMM_L2x4_LOOP - MY_ALIGN -CGEMM_L2x4_LOOP_END: -/*----------------------------------------*/ - END2x4_2 - blr - MY_ALIGN - - -CGEMM_2x4_L16_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 64,32,0,0 - KERNEL2x4_L2 64,32,1,0 - KERNEL2x4_L2 64,32,2,0 - KERNEL2x4_L2 64,32,3,0 - KERNEL2x4_L2 64,32,4,0 - KERNEL2x4_L2 64,32,5,0 - KERNEL2x4_L2 64,32,6,0 - KERNEL2x4_E2 64,32,7,1 - blr - MY_ALIGN - - -CGEMM_2x4_L8_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 64,32,0,0 - KERNEL2x4_L2 64,32,1,0 - KERNEL2x4_L2 64,32,2,0 - KERNEL2x4_E2 64,32,3,1 - blr - - -CGEMM_2x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x2_2 - MY_ALIGN -CGEMM_L2x2_LOOP: -/*----------------------------------------*/ - KERNEL2x2_L2 
32,32,0,0 -CGEMM_L2x2_K32: -/*----------------------------------------*/ - KERNEL2x2_L2 32,32,1,0 - KERNEL2x2_L2 32,32,2,0 - KERNEL2x2_L2 32,32,3,0 - KERNEL2x2_L2 32,32,4,0 - KERNEL2x2_L2 32,32,5,0 - KERNEL2x2_L2 32,32,6,0 - KERNEL2x2_L2 32,32,7,0 - KERNEL2x2_L2 32,32,8,0 - KERNEL2x2_L2 32,32,9,0 - KERNEL2x2_L2 32,32,10,0 - KERNEL2x2_L2 32,32,11,0 - KERNEL2x2_L2 32,32,12,0 - KERNEL2x2_L2 32,32,13,0 - KERNEL2x2_L2 32,32,14,0 - KERNEL2x2_L2 32,32,15,1 - bdnz CGEMM_L2x2_LOOP - MY_ALIGN - - -CGEMM_L2x2_LOOP_END: -/*----------------------------------------*/ - END2x2_2 - blr - MY_ALIGN -CGEMM_2x2_L16_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 32,32,0,0 - KERNEL2x2_L2 32,32,1,0 - KERNEL2x2_L2 32,32,2,0 - KERNEL2x2_L2 32,32,3,0 - KERNEL2x2_L2 32,32,4,0 - KERNEL2x2_L2 32,32,5,0 - KERNEL2x2_L2 32,32,6,0 - KERNEL2x2_E2 32,32,7,1 - blr - MY_ALIGN -CGEMM_2x2_L8_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 32,32,0,0 - KERNEL2x2_L2 32,32,1,0 - KERNEL2x2_L2 32,32,2,0 - KERNEL2x2_E2 32,32,3,1 - blr - - -CGEMM_2x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x1_2 - MY_ALIGN -CGEMM_L2x1_LOOP: -/*----------------------------------------*/ - KERNEL2x1_L2 16,32,0,0 -CGEMM_L2x1_K32: -/*----------------------------------------*/ - KERNEL2x1_L2 16,32,1,0 - KERNEL2x1_L2 16,32,2,0 - KERNEL2x1_L2 16,32,3,0 - KERNEL2x1_L2 16,32,4,0 - KERNEL2x1_L2 16,32,5,0 - KERNEL2x1_L2 16,32,6,0 - KERNEL2x1_L2 16,32,7,0 - KERNEL2x1_L2 16,32,8,0 - KERNEL2x1_L2 16,32,9,0 - KERNEL2x1_L2 16,32,10,0 - KERNEL2x1_L2 16,32,11,0 - KERNEL2x1_L2 16,32,12,0 - KERNEL2x1_L2 16,32,13,0 - KERNEL2x1_L2 16,32,14,0 - KERNEL2x1_L2 16,32,15,1 - bdnz CGEMM_L2x1_LOOP - MY_ALIGN -CGEMM_L2x1_LOOP_END: -/*----------------------------------------*/ - END2x1_2 - blr - - MY_ALIGN -CGEMM_2x1_L16_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 16,32,0,0 - KERNEL2x1_L2 16,32,1,0 - KERNEL2x1_L2 16,32,2,0 - 
KERNEL2x1_L2 16,32,3,0 - KERNEL2x1_L2 16,32,4,0 - KERNEL2x1_L2 16,32,5,0 - KERNEL2x1_L2 16,32,6,0 - KERNEL2x1_E2 16,32,7,1 - blr - MY_ALIGN - - -CGEMM_2x1_L8_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 16,32,0,0 - KERNEL2x1_L2 16,32,1,0 - KERNEL2x1_L2 16,32,2,0 - KERNEL2x1_E2 16,32,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -CGEMM_L2: -/*----------------------------------------*/ - - andi. J, N, 2 - ble CGEMM_L2_END - - -CGEMM_L2_BEGIN: -/*----------------------------------------*/ - mr CO, C - slwi T1, LDC , 1 - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble CGEMM_L2x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -CGEMM_L2x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T1-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO2x8 - ble CGEMM_L2x8_SUB0 - bl CGEMM_L2x8_LMAIN_SUB - andi. L, T1, 127 - ble CGEMM_L2x8_SAVE - b CGEMM_L2x8_SUB2 - - -CGEMM_L2x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. 
L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP2x8_128K - addi BO,BO,-16 - addi AO,AO,-64 - LOAD2x8O 64,16 - END2x8_WITHOUT_ADD - LOAD2x8_2O 128, 32 - mtctr T8 - bl CGEMM_L2x8_K128 - b CGEMM_L2x8_SAVE - CMP2x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne CGEMM_L2x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-128 - LOAD2x8_2O 128,32 - bl CGEMM_L2x8_K128 - b CGEMM_L2x8_SAVE - MY_ALIGN - - -CGEMM_L2x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble CGEMM_L2x8_SUB2_32 - bl CGEMM_2x8_L64_SUB - MY_ALIGN - - -CGEMM_L2x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble CGEMM_L2x8_SUB2_16 - bl CGEMM_2x8_L32_SUB - MY_ALIGN - - -CGEMM_L2x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x8_SUB2_8 - bl CGEMM_2x8_L16_SUB - MY_ALIGN - - -CGEMM_L2x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x8_SUB2_4 - LOAD2x8_2 - KERNEL2x8_L2 128,32, 0,0 - KERNEL2x8_L2 128,32, 1,0 - KERNEL2x8_L2 128,32, 2,0 - KERNEL2x8_E2 128,32, 3,1 - MY_ALIGN - - -CGEMM_L2x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x8_SUB2_2 - LOAD2x8_2 - KERNEL2x8_L2 128,32, 0,0 - KERNEL2x8_E2 128,32, 1,1 - MY_ALIGN - - -CGEMM_L2x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x8_SUB2_1 - LOAD2x8_2 - KERNEL2x8_E2 128,32, 0,1 - MY_ALIGN - - -CGEMM_L2x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L2x8_SAVE - KERNEL2x8 - - MY_ALIGN -CGEMM_L2x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - MY_ALIGN - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 -#endif - bgt CGEMM_L2x8_BEGIN - andi. T2, M, 7 - ble CGEMM_L2x1_END - andi. 
T1, M, 4 - ble CGEMM_L2x4_END - b CGEMM_L2x4_BEGIN - MY_ALIGN - - -CGEMM_L2x8_END: -/*----------------------------------------*/ - - -CGEMM_L2x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble CGEMM_L2x1_END - andi. T1, M, 4 - ble CGEMM_L2x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x4 - ble CGEMM_L2x4_SUB0 - bl CGEMM_2x4_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L2x4_SAVE - b CGEMM_L2x4_SUB2 - - -CGEMM_L2x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x4_32K - addi BO,BO,-16 - addi AO,AO,-32 - LOAD2x4O 32,16 - END2x4_WITHOUT_ADD - LOAD2x4_2O 64, 32 - mtctr T8 - bl CGEMM_L2x4_K32 - b CGEMM_L2x4_SAVE - CMP2x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L2x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-64 - LOAD2x4_2O 64,32 - bl CGEMM_L2x4_K32 - b CGEMM_L2x4_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L2x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x4_SUB2_8 - bl CGEMM_2x4_L16_SUB - MY_ALIGN - - -CGEMM_L2x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x4_SUB2_4 - bl CGEMM_2x4_L8_SUB - MY_ALIGN - - -CGEMM_L2x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x4_SUB2_2 - LOAD2x4_2 - KERNEL2x4_L2 64,32, 0,0 - KERNEL2x4_E2 64,32, 1,1 - MY_ALIGN - - -CGEMM_L2x4_SUB2_2: -/*----------------------------------------*/ - andi. 
T1,L, 2 - ble CGEMM_L2x4_SUB2_1 - LOAD2x4_2 - KERNEL2x4_E2 64,32, 0,1 - MY_ALIGN - - -CGEMM_L2x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L2x4_SAVE - KERNEL2x4 - - -CGEMM_L2x4_SAVE: -/*----------------------------------------*/ - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 -#endif - - -CGEMM_L2x4_END: -/*----------------------------------------*/ - - -CGEMM_L2x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble CGEMM_L2x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x2 - ble CGEMM_L2x2_SUB0 - bl CGEMM_2x2_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L2x2_SAVE - b CGEMM_L2x2_SUB2 - - -CGEMM_L2x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x2_32K - addi BO,BO,-16 - addi AO,AO,-16 - LOAD2x2O 16,16 - END2x2_WITHOUT_ADD - LOAD2x2_2O 32, 32 - mtctr T8 - bl CGEMM_L2x2_K32 - b CGEMM_L2x2_SAVE - CMP2x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L2x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-32 - LOAD2x2_2O 32,32 - bl CGEMM_L2x2_K32 - b CGEMM_L2x2_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L2x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x2_SUB2_8 - bl CGEMM_2x2_L16_SUB - MY_ALIGN - - -CGEMM_L2x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x2_SUB2_4 - bl CGEMM_2x2_L8_SUB - MY_ALIGN - - -CGEMM_L2x2_SUB2_4: -/*----------------------------------------*/ - andi. 
T1,L, 4 - ble CGEMM_L2x2_SUB2_2 - LOAD2x2_2 - KERNEL2x2_L2 32,32, 0,0 - KERNEL2x2_E2 32,32, 1,1 - MY_ALIGN - - -CGEMM_L2x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x2_SUB2_1 - LOAD2x2_2 - KERNEL2x2_E2 32,32, 0,1 - MY_ALIGN - - -CGEMM_L2x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L2x2_SAVE - KERNEL2x2 - - MY_ALIGN -CGEMM_L2x2_SAVE: -/*----------------------------------------*/ - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 -#endif - - -CGEMM_L2x2_END: -/*----------------------------------------*/ - - -CGEMM_L2x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble CGEMM_L2x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x1 - ble CGEMM_L2x1_SUB0 - bl CGEMM_2x1_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L2x1_SAVE - b CGEMM_L2x1_SUB2 - - -CGEMM_L2x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x1_32K - addi BO,BO,-16 - addi AO,AO,-8 - LOAD2x1O 8,16 - END2x1_WITHOUT_ADD - LOAD2x1_2O 16, 32 - mtctr T8 - bl CGEMM_L2x1_K32 - b CGEMM_L2x1_SAVE - CMP2x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L2x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-16 - LOAD2x1_2O 16,32 - bl CGEMM_L2x1_K32 - b CGEMM_L2x1_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L2x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x1_SUB2_8 - bl CGEMM_2x1_L16_SUB - MY_ALIGN - - -CGEMM_L2x1_SUB2_8: -/*----------------------------------------*/ - andi. 
T1,L, 8 - ble CGEMM_L2x1_SUB2_4 - bl CGEMM_2x1_L8_SUB - MY_ALIGN - - -CGEMM_L2x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x1_SUB2_2 - LOAD2x1_2 - KERNEL2x1_L2 16,32, 0,0 - KERNEL2x1_E2 16,32, 1,1 - MY_ALIGN - - -CGEMM_L2x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x1_SUB2_1 - LOAD2x1_2 - KERNEL2x1_E2 16,32, 0,1 - MY_ALIGN - - -CGEMM_L2x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L2x1_SAVE - KERNEL2x1 - - MY_ALIGN -CGEMM_L2x1_SAVE: -/*----------------------------------------*/ - - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 -#endif - - -CGEMM_L2x1_END: -/*----------------------------------------*/ - slwi T1, K, 4 - - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif - -CGEMM_L2_END: - - -b CGEMM_L1 -/* MINI SUBROUTINES */ -/* 1x8 MAIN 128x+2 LOOP */ - - -CGEMM_L1x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x8_2 - MY_ALIGN -CGEMM_L1x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 -CGEMM_L1x8_K128: -/*----------------------------------------*/ - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_L2 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 128,16,8,0 - KERNEL1x8_L2 128,16,9,0 - KERNEL1x8_L2 128,16,10,0 - KERNEL1x8_L2 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L2 128,16,12,0 - KERNEL1x8_L2 128,16,13,0 - KERNEL1x8_L2 128,16,14,0 - KERNEL1x8_L2 128,16,15,0 - KERNEL1x8_L2 128,16,16,0 - KERNEL1x8_L2 128,16,17,0 - KERNEL1x8_L2 128,16,18,0 - KERNEL1x8_L2 128,16,19,0 - KERNEL1x8_L2 128,16,20,0 - KERNEL1x8_L2 128,16,21,0 - KERNEL1x8_L2 128,16,22,0 - KERNEL1x8_L2 128,16,23,0 - KERNEL1x8_L2 128,16,24,0 - 
KERNEL1x8_L2 128,16,25,0 - KERNEL1x8_L2 128,16,26,0 - KERNEL1x8_L2 128,16,27,0 - KERNEL1x8_L2 128,16,28,0 - KERNEL1x8_L2 128,16,29,0 - KERNEL1x8_L2 128,16,30,0 - KERNEL1x8_L2 128,16,31,0 - KERNEL1x8_L2 128,16,32,0 - KERNEL1x8_L2 128,16,33,0 - KERNEL1x8_L2 128,16,34,0 - KERNEL1x8_L2 128,16,35,0 - KERNEL1x8_L2 128,16,36,0 - KERNEL1x8_L2 128,16,37,0 - KERNEL1x8_L2 128,16,38,0 - KERNEL1x8_L2 128,16,39,0 - KERNEL1x8_L2 128,16,40,0 - KERNEL1x8_L2 128,16,41,0 - KERNEL1x8_L2 128,16,42,0 - KERNEL1x8_L2 128,16,43,0 - KERNEL1x8_L2 128,16,44,0 - KERNEL1x8_L2 128,16,45,0 - KERNEL1x8_L2 128,16,46,0 - KERNEL1x8_L2 128,16,47,0 - KERNEL1x8_L2 128,16,48,0 - KERNEL1x8_L2 128,16,49,0 - KERNEL1x8_L2 128,16,50,0 - KERNEL1x8_L2 128,16,51,0 - KERNEL1x8_L2 128,16,52,0 - KERNEL1x8_L2 128,16,53,0 - KERNEL1x8_L2 128,16,54,0 - KERNEL1x8_L2 128,16,55,0 - KERNEL1x8_L2 128,16,56,0 - KERNEL1x8_L2 128,16,57,0 - KERNEL1x8_L2 128,16,58,0 - KERNEL1x8_L2 128,16,59,0 - KERNEL1x8_L2 128,16,60,0 - KERNEL1x8_L2 128,16,61,0 - KERNEL1x8_L2 128,16,62,0 - KERNEL1x8_L2 128,16,63,1 - bdnz CGEMM_L1x8_LOOP - MY_ALIGN -CGEMM_L1x8_LOOP_END: -/*----------------------------------------*/ - END1x8_2 - blr - MY_ALIGN - - -CGEMM_1x8_L64_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_L2 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 128,16,8,0 - KERNEL1x8_L2 128,16,9,0 - KERNEL1x8_L2 128,16,10,0 - KERNEL1x8_L2 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L2 128,16,12,0 - KERNEL1x8_L2 128,16,13,0 - KERNEL1x8_L2 128,16,14,0 - KERNEL1x8_L2 128,16,15,0 - KERNEL1x8_L2 128,16,16,0 - KERNEL1x8_L2 128,16,17,0 - KERNEL1x8_L2 128,16,18,0 - KERNEL1x8_L2 128,16,19,0 - KERNEL1x8_L2 128,16,20,0 - KERNEL1x8_L2 128,16,21,0 - KERNEL1x8_L2 
128,16,22,0 - KERNEL1x8_L2 128,16,23,0 - KERNEL1x8_L2 128,16,24,0 - KERNEL1x8_L2 128,16,25,0 - KERNEL1x8_L2 128,16,26,0 - KERNEL1x8_L2 128,16,27,0 - KERNEL1x8_L2 128,16,28,0 - KERNEL1x8_L2 128,16,29,0 - KERNEL1x8_L2 128,16,30,0 - KERNEL1x8_E2 128,16,31,1 - blr - MY_ALIGN - - -CGEMM_1x8_L32_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_L2 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 128,16,8,0 - KERNEL1x8_L2 128,16,9,0 - KERNEL1x8_L2 128,16,10,0 - KERNEL1x8_L2 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L2 128,16,12,0 - KERNEL1x8_L2 128,16,13,0 - KERNEL1x8_L2 128,16,14,0 - KERNEL1x8_E2 128,16,15,1 - blr - MY_ALIGN - - -CGEMM_1x8_L16_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_E2 128,16,7,1 - blr - MY_ALIGN - - -CGEMM_1x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x4_2 - MY_ALIGN -CGEMM_L1x4_LOOP: -/*----------------------------------------*/ - KERNEL1x4_L2 64,16,0,0 -CGEMM_L1x4_K32: -/*----------------------------------------*/ - KERNEL1x4_L2 64,16,1,0 - KERNEL1x4_L2 64,16,2,0 - KERNEL1x4_L2 64,16,3,0 - KERNEL1x4_L2 64,16,4,0 - KERNEL1x4_L2 64,16,5,0 - KERNEL1x4_L2 64,16,6,0 - KERNEL1x4_L2 64,16,7,0 - KERNEL1x4_L2 64,16,8,0 - KERNEL1x4_L2 64,16,9,0 - KERNEL1x4_L2 64,16,10,0 - KERNEL1x4_L2 64,16,11,0 - KERNEL1x4_L2 64,16,12,0 - KERNEL1x4_L2 64,16,13,0 - KERNEL1x4_L2 64,16,14,0 - KERNEL1x4_L2 64,16,15,1 - bdnz CGEMM_L1x4_LOOP - MY_ALIGN 
-CGEMM_L1x4_LOOP_END: -/*----------------------------------------*/ - END1x4_2 - blr - MY_ALIGN - - -CGEMM_1x4_L16_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 64,16,0,0 - KERNEL1x4_L2 64,16,1,0 - KERNEL1x4_L2 64,16,2,0 - KERNEL1x4_L2 64,16,3,0 - KERNEL1x4_L2 64,16,4,0 - KERNEL1x4_L2 64,16,5,0 - KERNEL1x4_L2 64,16,6,0 - KERNEL1x4_E2 64,16,7,1 - blr - MY_ALIGN - - -CGEMM_1x4_L8_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 64,16,0,0 - KERNEL1x4_L2 64,16,1,0 - KERNEL1x4_L2 64,16,2,0 - KERNEL1x4_E2 64,16,3,1 - blr - - -CGEMM_1x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x2_2 - MY_ALIGN -CGEMM_L1x2_LOOP: -/*----------------------------------------*/ - KERNEL1x2_L2 32,16,0,0 -CGEMM_L1x2_K32: -/*----------------------------------------*/ - KERNEL1x2_L2 32,16,1,0 - KERNEL1x2_L2 32,16,2,0 - KERNEL1x2_L2 32,16,3,0 - KERNEL1x2_L2 32,16,4,0 - KERNEL1x2_L2 32,16,5,0 - KERNEL1x2_L2 32,16,6,0 - KERNEL1x2_L2 32,16,7,0 - KERNEL1x2_L2 32,16,8,0 - KERNEL1x2_L2 32,16,9,0 - KERNEL1x2_L2 32,16,10,0 - KERNEL1x2_L2 32,16,11,0 - KERNEL1x2_L2 32,16,12,0 - KERNEL1x2_L2 32,16,13,0 - KERNEL1x2_L2 32,16,14,0 - KERNEL1x2_L2 32,16,15,1 - bdnz CGEMM_L1x2_LOOP - MY_ALIGN - - -CGEMM_L1x2_LOOP_END: -/*----------------------------------------*/ - END1x2_2 - blr - MY_ALIGN -CGEMM_1x2_L16_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 32,16,0,0 - KERNEL1x2_L2 32,16,1,0 - KERNEL1x2_L2 32,16,2,0 - KERNEL1x2_L2 32,16,3,0 - KERNEL1x2_L2 32,16,4,0 - KERNEL1x2_L2 32,16,5,0 - KERNEL1x2_L2 32,16,6,0 - KERNEL1x2_E2 32,16,7,1 - blr - MY_ALIGN -CGEMM_1x2_L8_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 32,16,0,0 - KERNEL1x2_L2 32,16,1,0 - KERNEL1x2_L2 32,16,2,0 - KERNEL1x2_E2 32,16,3,1 - blr - - -CGEMM_1x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x1_2 - MY_ALIGN -CGEMM_L1x1_LOOP: 
-/*----------------------------------------*/ - KERNEL1x1_L2 16,16,0,0 -CGEMM_L1x1_K32: -/*----------------------------------------*/ - KERNEL1x1_L2 16,16,1,0 - KERNEL1x1_L2 16,16,2,0 - KERNEL1x1_L2 16,16,3,0 - KERNEL1x1_L2 16,16,4,0 - KERNEL1x1_L2 16,16,5,0 - KERNEL1x1_L2 16,16,6,0 - KERNEL1x1_L2 16,16,7,0 - KERNEL1x1_L2 16,16,8,0 - KERNEL1x1_L2 16,16,9,0 - KERNEL1x1_L2 16,16,10,0 - KERNEL1x1_L2 16,16,11,0 - KERNEL1x1_L2 16,16,12,0 - KERNEL1x1_L2 16,16,13,0 - KERNEL1x1_L2 16,16,14,0 - KERNEL1x1_L2 16,16,15,1 - bdnz CGEMM_L1x1_LOOP - MY_ALIGN -CGEMM_L1x1_LOOP_END: -/*----------------------------------------*/ - END1x1_2 - blr - - MY_ALIGN -CGEMM_1x1_L16_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 16,16,0,0 - KERNEL1x1_L2 16,16,1,0 - KERNEL1x1_L2 16,16,2,0 - KERNEL1x1_L2 16,16,3,0 - KERNEL1x1_L2 16,16,4,0 - KERNEL1x1_L2 16,16,5,0 - KERNEL1x1_L2 16,16,6,0 - KERNEL1x1_E2 16,16,7,1 - blr - MY_ALIGN - - -CGEMM_1x1_L8_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 16,16,0,0 - KERNEL1x1_L2 16,16,1,0 - KERNEL1x1_L2 16,16,2,0 - KERNEL1x1_E2 16,16,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -CGEMM_L1: -/*----------------------------------------*/ - - andi. J, N, 1 - ble CGEMM_L1_END - -CGEMM_L1_BEGIN: -/*----------------------------------------*/ - mr CO, C - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble CGEMM_L1x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -CGEMM_L1x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. 
T8, T1, 7 /**(T1-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO1x8 - ble CGEMM_L1x8_SUB0 - bl CGEMM_L1x8_LMAIN_SUB - andi. L, T1, 127 - ble CGEMM_L1x8_SAVE - b CGEMM_L1x8_SUB2 - - -CGEMM_L1x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP1x8_128K - addi BO,BO,-8 - addi AO,AO,-64 - LOAD1x8O 64,8 - END1x8_WITHOUT_ADD - LOAD1x8_2O 128, 16 - mtctr T8 - bl CGEMM_L1x8_K128 - b CGEMM_L1x8_SAVE - CMP1x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne CGEMM_L1x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-128 - LOAD1x8_2O 128,16 - bl CGEMM_L1x8_K128 - b CGEMM_L1x8_SAVE - MY_ALIGN - - -CGEMM_L1x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble CGEMM_L1x8_SUB2_32 - bl CGEMM_1x8_L64_SUB - MY_ALIGN - - -CGEMM_L1x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble CGEMM_L1x8_SUB2_16 - bl CGEMM_1x8_L32_SUB - MY_ALIGN - - -CGEMM_L1x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x8_SUB2_8 - bl CGEMM_1x8_L16_SUB - MY_ALIGN - - -CGEMM_L1x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x8_SUB2_4 - LOAD1x8_2 - KERNEL1x8_L2 128,16, 0,0 - KERNEL1x8_L2 128,16, 1,0 - KERNEL1x8_L2 128,16, 2,0 - KERNEL1x8_E2 128,16, 3,1 - MY_ALIGN - - -CGEMM_L1x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x8_SUB2_2 - LOAD1x8_2 - KERNEL1x8_L2 128,16, 0,0 - KERNEL1x8_E2 128,16, 1,1 - MY_ALIGN - - -CGEMM_L1x8_SUB2_2: -/*----------------------------------------*/ - andi. 
T1,L, 2 - ble CGEMM_L1x8_SUB2_1 - LOAD1x8_2 - KERNEL1x8_E2 128,16, 0,1 - MY_ALIGN - - -CGEMM_L1x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x8_SAVE - KERNEL1x8 - - MY_ALIGN -CGEMM_L1x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - MY_ALIGN - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 -#endif - bgt CGEMM_L1x8_BEGIN - andi. T2, M, 7 - ble CGEMM_L1x1_END - andi. T1, M, 4 - ble CGEMM_L1x4_END - b CGEMM_L1x4_BEGIN - MY_ALIGN - - -CGEMM_L1x8_END: -/*----------------------------------------*/ - - -CGEMM_L1x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble CGEMM_L1x1_END - andi. T1, M, 4 - ble CGEMM_L1x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 31x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 31x */ -#endif - ZERO1x4 - ble CGEMM_L1x4_SUB0 - bl CGEMM_1x4_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L1x4_SAVE - b CGEMM_L1x4_SUB2 - - -CGEMM_L1x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x4_32K - addi BO,BO,-8 - addi AO,AO,-32 - LOAD1x4O 32,8 - END1x4_WITHOUT_ADD - LOAD1x4_2O 64, 16 - mtctr T8 - bl CGEMM_L1x4_K32 - b CGEMM_L1x4_SAVE - CMP1x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L1x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-64 - LOAD1x4_2O 64,16 - bl CGEMM_L1x4_K32 - b CGEMM_L1x4_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L1x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x4_SUB2_8 - bl CGEMM_1x4_L16_SUB - MY_ALIGN - - -CGEMM_L1x4_SUB2_8: -/*----------------------------------------*/ - andi. 
T1,L, 8 - ble CGEMM_L1x4_SUB2_4 - bl CGEMM_1x4_L8_SUB - MY_ALIGN - - -CGEMM_L1x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x4_SUB2_2 - LOAD1x4_2 - KERNEL1x4_L2 64,16, 0,0 - KERNEL1x4_E2 64,16, 1,1 - MY_ALIGN - - -CGEMM_L1x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x4_SUB2_1 - LOAD1x4_2 - KERNEL1x4_E2 64,16, 0,1 - MY_ALIGN - - -CGEMM_L1x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x4_SAVE - KERNEL1x4 - - -CGEMM_L1x4_SAVE: -/*----------------------------------------*/ - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 -#endif - - -CGEMM_L1x4_END: -/*----------------------------------------*/ - - -CGEMM_L1x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble CGEMM_L1x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 31x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 31x */ -#endif - ZERO1x2 - ble CGEMM_L1x2_SUB0 - bl CGEMM_1x2_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L1x2_SAVE - b CGEMM_L1x2_SUB2 - - -CGEMM_L1x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x2_32K - addi BO,BO,-8 - addi AO,AO,-16 - LOAD1x2O 16,8 - END1x2_WITHOUT_ADD - LOAD1x2_2O 32, 16 - mtctr T8 - bl CGEMM_L1x2_K32 - b CGEMM_L1x2_SAVE - CMP1x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L1x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-32 - LOAD1x2_2O 32,16 - bl CGEMM_L1x2_K32 - b CGEMM_L1x2_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L1x2_SUB2: -/*----------------------------------------*/ - andi. 
T1,L, 16 - ble CGEMM_L1x2_SUB2_8 - bl CGEMM_1x2_L16_SUB - MY_ALIGN - - -CGEMM_L1x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x2_SUB2_4 - bl CGEMM_1x2_L8_SUB - MY_ALIGN - - -CGEMM_L1x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x2_SUB2_2 - LOAD1x2_2 - KERNEL1x2_L2 32,16, 0,0 - KERNEL1x2_E2 32,16, 1,1 - MY_ALIGN - - -CGEMM_L1x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x2_SUB2_1 - LOAD1x2_2 - KERNEL1x2_E2 32,16, 0,1 - MY_ALIGN - - -CGEMM_L1x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x2_SAVE - KERNEL1x2 - - MY_ALIGN -CGEMM_L1x2_SAVE: -/*----------------------------------------*/ - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 -#endif - - -CGEMM_L1x2_END: -/*----------------------------------------*/ - - -CGEMM_L1x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble CGEMM_L1x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 31x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 31x */ -#endif - ZERO1x1 - ble CGEMM_L1x1_SUB0 - bl CGEMM_1x1_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L1x1_SAVE - b CGEMM_L1x1_SUB2 - - -CGEMM_L1x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. 
L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x1_32K - addi BO,BO,-8 - addi AO,AO,-8 - LOAD1x1O 8,8 - END1x1_WITHOUT_ADD - LOAD1x1_2O 16, 16 - mtctr T8 - bl CGEMM_L1x1_K32 - b CGEMM_L1x1_SAVE - CMP1x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L1x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-16 - LOAD1x1_2O 16,16 - bl CGEMM_L1x1_K32 - b CGEMM_L1x1_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L1x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x1_SUB2_8 - bl CGEMM_1x1_L16_SUB - MY_ALIGN - - -CGEMM_L1x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x1_SUB2_4 - bl CGEMM_1x1_L8_SUB - MY_ALIGN - - -CGEMM_L1x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x1_SUB2_2 - LOAD1x1_2 - KERNEL1x1_L2 16,16, 0,0 - KERNEL1x1_E2 16,16, 1,1 - MY_ALIGN - - -CGEMM_L1x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x1_SUB2_1 - LOAD1x1_2 - KERNEL1x1_E2 16,16, 0,1 - MY_ALIGN - - -CGEMM_L1x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x1_SAVE - KERNEL1x1 - - MY_ALIGN -CGEMM_L1x1_SAVE: -/*----------------------------------------*/ - - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 -#endif - - -CGEMM_L1x1_END: -/*----------------------------------------*/ - slwi T1, K, 3 - - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif - -CGEMM_L1_END: - - - - +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@gmail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ +#define MY_ALIGN .align 3 +b CGEMM_L4 +/* MINI SUBROUTINES */ +/* 4x8 MAIN 128x+2 LOOP */ + + +CGEMM_L4x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x8_2 + MY_ALIGN +CGEMM_L4x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 +CGEMM_L4x8_K128: +/*----------------------------------------*/ + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_L2 128,64,31,0 + KERNEL4x8_L2 128,64,32,0 + KERNEL4x8_L2 128,64,33,0 + KERNEL4x8_L2 128,64,34,0 + KERNEL4x8_L2 128,64,35,0 + KERNEL4x8_L2 128,64,36,0 + KERNEL4x8_L2 128,64,37,0 + KERNEL4x8_L2 128,64,38,0 + KERNEL4x8_L2 128,64,39,0 + KERNEL4x8_L2 128,64,40,0 + KERNEL4x8_L2 128,64,41,0 + KERNEL4x8_L2 128,64,42,0 + KERNEL4x8_L2 128,64,43,0 + 
KERNEL4x8_L2 128,64,44,0 + KERNEL4x8_L2 128,64,45,0 + KERNEL4x8_L2 128,64,46,0 + KERNEL4x8_L2 128,64,47,0 + KERNEL4x8_L2 128,64,48,0 + KERNEL4x8_L2 128,64,49,0 + KERNEL4x8_L2 128,64,50,0 + KERNEL4x8_L2 128,64,51,0 + KERNEL4x8_L2 128,64,52,0 + KERNEL4x8_L2 128,64,53,0 + KERNEL4x8_L2 128,64,54,0 + KERNEL4x8_L2 128,64,55,0 + KERNEL4x8_L2 128,64,56,0 + KERNEL4x8_L2 128,64,57,0 + KERNEL4x8_L2 128,64,58,0 + KERNEL4x8_L2 128,64,59,0 + KERNEL4x8_L2 128,64,60,0 + KERNEL4x8_L2 128,64,61,0 + KERNEL4x8_L2 128,64,62,0 + KERNEL4x8_L2 128,64,63,1 + bdnz CGEMM_L4x8_LOOP + MY_ALIGN +CGEMM_L4x8_LOOP_END: +/*----------------------------------------*/ + END4x8_2 + blr + MY_ALIGN + + +CGEMM_4x8_L64_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_E2 128,64,31,1 + blr + MY_ALIGN + + +CGEMM_4x8_L32_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt 
BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_E2 128,64,15,1 + blr + MY_ALIGN + + +CGEMM_4x8_L16_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_E2 128,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x4_2 + MY_ALIGN +CGEMM_L4x4_LOOP: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,0,0 +CGEMM_L4x4_K32: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_L2 64,64,7,0 + KERNEL4x4_L2 64,64,8,0 + KERNEL4x4_L2 64,64,9,0 + KERNEL4x4_L2 64,64,10,0 + KERNEL4x4_L2 64,64,11,0 + KERNEL4x4_L2 64,64,12,0 + KERNEL4x4_L2 64,64,13,0 + KERNEL4x4_L2 64,64,14,0 + KERNEL4x4_L2 64,64,15,1 + bdnz CGEMM_L4x4_LOOP + MY_ALIGN +CGEMM_L4x4_LOOP_END: +/*----------------------------------------*/ + END4x4_2 + blr + MY_ALIGN + + +CGEMM_4x4_L16_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_E2 64,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_L8_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + 
KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_E2 64,64,3,1 + blr + + +CGEMM_4x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x2_2 + MY_ALIGN +CGEMM_L4x2_LOOP: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,0,0 +CGEMM_L4x2_K32: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_L2 32,64,7,0 + KERNEL4x2_L2 32,64,8,0 + KERNEL4x2_L2 32,64,9,0 + KERNEL4x2_L2 32,64,10,0 + KERNEL4x2_L2 32,64,11,0 + KERNEL4x2_L2 32,64,12,0 + KERNEL4x2_L2 32,64,13,0 + KERNEL4x2_L2 32,64,14,0 + KERNEL4x2_L2 32,64,15,1 + bdnz CGEMM_L4x2_LOOP + MY_ALIGN + + +CGEMM_L4x2_LOOP_END: +/*----------------------------------------*/ + END4x2_2 + blr + MY_ALIGN +CGEMM_4x2_L16_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_E2 32,64,7,1 + blr + MY_ALIGN +CGEMM_4x2_L8_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_E2 32,64,3,1 + blr + + +CGEMM_4x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x1_2 + MY_ALIGN +CGEMM_L4x1_LOOP: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,0,0 +CGEMM_L4x1_K32: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_L2 16,64,7,0 + KERNEL4x1_L2 16,64,8,0 + KERNEL4x1_L2 16,64,9,0 + KERNEL4x1_L2 16,64,10,0 + KERNEL4x1_L2 16,64,11,0 + KERNEL4x1_L2 16,64,12,0 + KERNEL4x1_L2 16,64,13,0 + KERNEL4x1_L2 16,64,14,0 + KERNEL4x1_L2 16,64,15,1 + bdnz CGEMM_L4x1_LOOP + MY_ALIGN +CGEMM_L4x1_LOOP_END: 
+/*----------------------------------------*/ + END4x1_2 + blr + + MY_ALIGN +CGEMM_4x1_L16_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_E2 16,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x1_L8_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_E2 16,64,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L4: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 2 + ble CGEMM_L4_END + + +CGEMM_L4_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 2 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L4x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L4x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO4x8 + ble CGEMM_L4x8_SUB0 + bl CGEMM_L4x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L4x8_SAVE + b CGEMM_L4x8_SUB2 + + +CGEMM_L4x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. 
L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP4x8_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD4x8O 64,32 + END4x8_WITHOUT_ADD + LOAD4x8_2O 128, 64 + mtctr T8 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + CMP4x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L4x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD4x8_2O 128,64 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + MY_ALIGN + + +CGEMM_L4x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L4x8_SUB2_32 + bl CGEMM_4x8_L64_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L4x8_SUB2_16 + bl CGEMM_4x8_L32_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x8_SUB2_8 + bl CGEMM_4x8_L16_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x8_SUB2_4 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_L2 128,64, 1,0 + KERNEL4x8_L2 128,64, 2,0 + KERNEL4x8_E2 128,64, 3,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x8_SUB2_2 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_E2 128,64, 1,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x8_SUB2_1 + LOAD4x8_2 + KERNEL4x8_E2 128,64, 0,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x8_SAVE + KERNEL4x8 + + MY_ALIGN +CGEMM_L4x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 +#endif + bgt CGEMM_L4x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. 
T1, M, 4 + ble CGEMM_L4x4_END + b CGEMM_L4x4_BEGIN + MY_ALIGN + + +CGEMM_L4x8_END: +/*----------------------------------------*/ + + +CGEMM_L4x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x4 + ble CGEMM_L4x4_SUB0 + bl CGEMM_4x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x4_SAVE + b CGEMM_L4x4_SUB2 + + +CGEMM_L4x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x4_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD4x4O 32,32 + END4x4_WITHOUT_ADD + LOAD4x4_2O 64, 64 + mtctr T8 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + CMP4x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD4x4_2O 64,64 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x4_SUB2_8 + bl CGEMM_4x4_L16_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x4_SUB2_4 + bl CGEMM_4x4_L8_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x4_SUB2_2 + LOAD4x4_2 + KERNEL4x4_L2 64,64, 0,0 + KERNEL4x4_E2 64,64, 1,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_2: +/*----------------------------------------*/ + andi. 
T1,L, 2 + ble CGEMM_L4x4_SUB2_1 + LOAD4x4_2 + KERNEL4x4_E2 64,64, 0,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x4_SAVE + KERNEL4x4 + + +CGEMM_L4x4_SAVE: +/*----------------------------------------*/ + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 +#endif + + +CGEMM_L4x4_END: +/*----------------------------------------*/ + + +CGEMM_L4x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x2 + ble CGEMM_L4x2_SUB0 + bl CGEMM_4x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x2_SAVE + b CGEMM_L4x2_SUB2 + + +CGEMM_L4x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x2_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD4x2O 16,32 + END4x2_WITHOUT_ADD + LOAD4x2_2O 32, 64 + mtctr T8 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + CMP4x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD4x2_2O 32,64 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x2_SUB2_8 + bl CGEMM_4x2_L16_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x2_SUB2_4 + bl CGEMM_4x2_L8_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_4: +/*----------------------------------------*/ + andi. 
T1,L, 4 + ble CGEMM_L4x2_SUB2_2 + LOAD4x2_2 + KERNEL4x2_L2 32,64, 0,0 + KERNEL4x2_E2 32,64, 1,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x2_SUB2_1 + LOAD4x2_2 + KERNEL4x2_E2 32,64, 0,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +CGEMM_L4x2_SAVE: +/*----------------------------------------*/ + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 +#endif + + +CGEMM_L4x2_END: +/*----------------------------------------*/ + + +CGEMM_L4x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x1 + ble CGEMM_L4x1_SUB0 + bl CGEMM_4x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x1_SAVE + b CGEMM_L4x1_SUB2 + + +CGEMM_L4x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x1_32K + addi BO,BO,-32 + addi AO,AO,-8 + LOAD4x1O 8,32 + END4x1_WITHOUT_ADD + LOAD4x1_2O 16, 64 + mtctr T8 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + CMP4x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-16 + LOAD4x1_2O 16,64 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x1_SUB2_8 + bl CGEMM_4x1_L16_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble CGEMM_L4x1_SUB2_4 + bl CGEMM_4x1_L8_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x1_SUB2_2 + LOAD4x1_2 + KERNEL4x1_L2 16,64, 0,0 + KERNEL4x1_E2 16,64, 1,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x1_SUB2_1 + LOAD4x1_2 + KERNEL4x1_E2 16,64, 0,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +CGEMM_L4x1_SAVE: +/*----------------------------------------*/ + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 +#endif + + +CGEMM_L4x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + bgt CGEMM_L4_BEGIN + + +CGEMM_L4_END: + +b CGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +CGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN +CGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 +CGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 
+ KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_L2 128,32,31,0 + KERNEL2x8_L2 128,32,32,0 + KERNEL2x8_L2 128,32,33,0 + KERNEL2x8_L2 128,32,34,0 + KERNEL2x8_L2 128,32,35,0 + KERNEL2x8_L2 128,32,36,0 + KERNEL2x8_L2 128,32,37,0 + KERNEL2x8_L2 128,32,38,0 + KERNEL2x8_L2 128,32,39,0 + KERNEL2x8_L2 128,32,40,0 + KERNEL2x8_L2 128,32,41,0 + KERNEL2x8_L2 128,32,42,0 + KERNEL2x8_L2 128,32,43,0 + KERNEL2x8_L2 128,32,44,0 + KERNEL2x8_L2 128,32,45,0 + KERNEL2x8_L2 128,32,46,0 + KERNEL2x8_L2 128,32,47,0 + KERNEL2x8_L2 128,32,48,0 + KERNEL2x8_L2 128,32,49,0 + KERNEL2x8_L2 128,32,50,0 + KERNEL2x8_L2 128,32,51,0 + KERNEL2x8_L2 128,32,52,0 + KERNEL2x8_L2 128,32,53,0 + KERNEL2x8_L2 128,32,54,0 + KERNEL2x8_L2 128,32,55,0 + KERNEL2x8_L2 128,32,56,0 + KERNEL2x8_L2 128,32,57,0 + KERNEL2x8_L2 128,32,58,0 + KERNEL2x8_L2 128,32,59,0 + KERNEL2x8_L2 128,32,60,0 + KERNEL2x8_L2 128,32,61,0 + KERNEL2x8_L2 128,32,62,0 + KERNEL2x8_L2 128,32,63,1 + bdnz CGEMM_L2x8_LOOP + MY_ALIGN +CGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + END2x8_2 + blr + MY_ALIGN + + +CGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 
128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_E2 128,32,31,1 + blr + MY_ALIGN + + +CGEMM_2x8_L32_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_E2 128,32,15,1 + blr + MY_ALIGN + + +CGEMM_2x8_L16_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_E2 128,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +CGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,0,0 +CGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_L2 64,32,7,0 + KERNEL2x4_L2 64,32,8,0 + KERNEL2x4_L2 64,32,9,0 + KERNEL2x4_L2 64,32,10,0 + KERNEL2x4_L2 64,32,11,0 + KERNEL2x4_L2 64,32,12,0 + KERNEL2x4_L2 64,32,13,0 + KERNEL2x4_L2 64,32,14,0 + KERNEL2x4_L2 64,32,15,1 + bdnz 
CGEMM_L2x4_LOOP + MY_ALIGN +CGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + END2x4_2 + blr + MY_ALIGN + + +CGEMM_2x4_L16_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_E2 64,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_E2 64,32,3,1 + blr + + +CGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +CGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,0,0 +CGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_L2 32,32,7,0 + KERNEL2x2_L2 32,32,8,0 + KERNEL2x2_L2 32,32,9,0 + KERNEL2x2_L2 32,32,10,0 + KERNEL2x2_L2 32,32,11,0 + KERNEL2x2_L2 32,32,12,0 + KERNEL2x2_L2 32,32,13,0 + KERNEL2x2_L2 32,32,14,0 + KERNEL2x2_L2 32,32,15,1 + bdnz CGEMM_L2x2_LOOP + MY_ALIGN + + +CGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +CGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_E2 32,32,7,1 + blr + MY_ALIGN +CGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_E2 32,32,3,1 + blr + + +CGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +CGEMM_L2x1_LOOP: 
+/*----------------------------------------*/ + KERNEL2x1_L2 16,32,0,0 +CGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_L2 16,32,7,0 + KERNEL2x1_L2 16,32,8,0 + KERNEL2x1_L2 16,32,9,0 + KERNEL2x1_L2 16,32,10,0 + KERNEL2x1_L2 16,32,11,0 + KERNEL2x1_L2 16,32,12,0 + KERNEL2x1_L2 16,32,13,0 + KERNEL2x1_L2 16,32,14,0 + KERNEL2x1_L2 16,32,15,1 + bdnz CGEMM_L2x1_LOOP + MY_ALIGN +CGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +CGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_E2 16,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_E2 16,32,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L2: +/*----------------------------------------*/ + + andi. J, N, 2 + ble CGEMM_L2_END + + +CGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. 
T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble CGEMM_L2x8_SUB0 + bl CGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L2x8_SAVE + b CGEMM_L2x8_SUB2 + + +CGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD2x8O 64,16 + END2x8_WITHOUT_ADD + LOAD2x8_2O 128, 32 + mtctr T8 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8_2O 128,32 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + MY_ALIGN + + +CGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L2x8_SUB2_32 + bl CGEMM_2x8_L64_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L2x8_SUB2_16 + bl CGEMM_2x8_L32_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x8_SUB2_8 + bl CGEMM_2x8_L16_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x8_SUB2_4 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_L2 128,32, 1,0 + KERNEL2x8_L2 128,32, 2,0 + KERNEL2x8_E2 128,32, 3,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x8_SUB2_2 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_E2 128,32, 1,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. 
T1,L, 2 + ble CGEMM_L2x8_SUB2_1 + LOAD2x8_2 + KERNEL2x8_E2 128,32, 0,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +CGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt CGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END + b CGEMM_L2x4_BEGIN + MY_ALIGN + + +CGEMM_L2x8_END: +/*----------------------------------------*/ + + +CGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble CGEMM_L2x4_SUB0 + bl CGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x4_SAVE + b CGEMM_L2x4_SUB2 + + +CGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD2x4O 32,16 + END2x4_WITHOUT_ADD + LOAD2x4_2O 64, 32 + mtctr T8 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4_2O 64,32 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. 
T1,L, 16 + ble CGEMM_L2x4_SUB2_8 + bl CGEMM_2x4_L16_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x4_SUB2_4 + bl CGEMM_2x4_L8_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x4_SUB2_2 + LOAD2x4_2 + KERNEL2x4_L2 64,32, 0,0 + KERNEL2x4_E2 64,32, 1,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x4_SUB2_1 + LOAD2x4_2 + KERNEL2x4_E2 64,32, 0,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x4_SAVE + KERNEL2x4 + + +CGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif + + +CGEMM_L2x4_END: +/*----------------------------------------*/ + + +CGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble CGEMM_L2x2_SUB0 + bl CGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x2_SAVE + b CGEMM_L2x2_SUB2 + + +CGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD2x2O 16,16 + END2x2_WITHOUT_ADD + LOAD2x2_2O 32, 32 + mtctr T8 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2_2O 32,32 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x2_SUB2_8 + bl CGEMM_2x2_L16_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x2_SUB2_4 + bl CGEMM_2x2_L8_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x2_SUB2_2 + LOAD2x2_2 + KERNEL2x2_L2 32,32, 0,0 + KERNEL2x2_E2 32,32, 1,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x2_SUB2_1 + LOAD2x2_2 + KERNEL2x2_E2 32,32, 0,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +CGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + + +CGEMM_L2x2_END: +/*----------------------------------------*/ + + +CGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble CGEMM_L2x1_SUB0 + bl CGEMM_2x1_LMAIN_SUB + andi. 
L, T1, 31 + ble CGEMM_L2x1_SAVE + b CGEMM_L2x1_SUB2 + + +CGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-16 + addi AO,AO,-8 + LOAD2x1O 8,16 + END2x1_WITHOUT_ADD + LOAD2x1_2O 16, 32 + mtctr T8 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1_2O 16,32 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x1_SUB2_8 + bl CGEMM_2x1_L16_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x1_SUB2_4 + bl CGEMM_2x1_L8_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 16,32, 0,0 + KERNEL2x1_E2 16,32, 1,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 16,32, 0,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +CGEMM_L2x1_SAVE: +/*----------------------------------------*/ + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif + + +CGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 4 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + +CGEMM_L2_END: + + +b CGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +CGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +CGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 +CGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_L2 128,16,31,0 + KERNEL1x8_L2 128,16,32,0 + KERNEL1x8_L2 128,16,33,0 + KERNEL1x8_L2 128,16,34,0 + KERNEL1x8_L2 128,16,35,0 + KERNEL1x8_L2 128,16,36,0 + KERNEL1x8_L2 128,16,37,0 + KERNEL1x8_L2 128,16,38,0 + KERNEL1x8_L2 128,16,39,0 + KERNEL1x8_L2 128,16,40,0 + KERNEL1x8_L2 128,16,41,0 + KERNEL1x8_L2 128,16,42,0 + 
KERNEL1x8_L2 128,16,43,0 + KERNEL1x8_L2 128,16,44,0 + KERNEL1x8_L2 128,16,45,0 + KERNEL1x8_L2 128,16,46,0 + KERNEL1x8_L2 128,16,47,0 + KERNEL1x8_L2 128,16,48,0 + KERNEL1x8_L2 128,16,49,0 + KERNEL1x8_L2 128,16,50,0 + KERNEL1x8_L2 128,16,51,0 + KERNEL1x8_L2 128,16,52,0 + KERNEL1x8_L2 128,16,53,0 + KERNEL1x8_L2 128,16,54,0 + KERNEL1x8_L2 128,16,55,0 + KERNEL1x8_L2 128,16,56,0 + KERNEL1x8_L2 128,16,57,0 + KERNEL1x8_L2 128,16,58,0 + KERNEL1x8_L2 128,16,59,0 + KERNEL1x8_L2 128,16,60,0 + KERNEL1x8_L2 128,16,61,0 + KERNEL1x8_L2 128,16,62,0 + KERNEL1x8_L2 128,16,63,1 + bdnz CGEMM_L1x8_LOOP + MY_ALIGN +CGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + + +CGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_E2 128,16,31,1 + blr + MY_ALIGN + + +CGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 
128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_E2 128,16,15,1 + blr + MY_ALIGN + + +CGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_E2 128,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN +CGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,0,0 +CGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_L2 64,16,7,0 + KERNEL1x4_L2 64,16,8,0 + KERNEL1x4_L2 64,16,9,0 + KERNEL1x4_L2 64,16,10,0 + KERNEL1x4_L2 64,16,11,0 + KERNEL1x4_L2 64,16,12,0 + KERNEL1x4_L2 64,16,13,0 + KERNEL1x4_L2 64,16,14,0 + KERNEL1x4_L2 64,16,15,1 + bdnz CGEMM_L1x4_LOOP + MY_ALIGN +CGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +CGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_E2 64,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + 
KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_E2 64,16,3,1 + blr + + +CGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN +CGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,0,0 +CGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + KERNEL1x2_L2 32,16,7,0 + KERNEL1x2_L2 32,16,8,0 + KERNEL1x2_L2 32,16,9,0 + KERNEL1x2_L2 32,16,10,0 + KERNEL1x2_L2 32,16,11,0 + KERNEL1x2_L2 32,16,12,0 + KERNEL1x2_L2 32,16,13,0 + KERNEL1x2_L2 32,16,14,0 + KERNEL1x2_L2 32,16,15,1 + bdnz CGEMM_L1x2_LOOP + MY_ALIGN + + +CGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN +CGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + KERNEL1x2_E2 32,16,7,1 + blr + MY_ALIGN +CGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_E2 32,16,3,1 + blr + + +CGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN +CGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,0,0 +CGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_L2 16,16,7,0 + KERNEL1x1_L2 16,16,8,0 + KERNEL1x1_L2 16,16,9,0 + KERNEL1x1_L2 16,16,10,0 + KERNEL1x1_L2 16,16,11,0 + KERNEL1x1_L2 16,16,12,0 + KERNEL1x1_L2 16,16,13,0 + KERNEL1x1_L2 16,16,14,0 + KERNEL1x1_L2 16,16,15,1 + bdnz CGEMM_L1x1_LOOP + 
MY_ALIGN +CGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + + MY_ALIGN +CGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_E2 16,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_E2 16,16,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L1: +/*----------------------------------------*/ + + andi. J, N, 1 + ble CGEMM_L1_END + +CGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble CGEMM_L1x8_SUB0 + bl CGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L1x8_SAVE + b CGEMM_L1x8_SUB2 + + +CGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. 
L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-8 + addi AO,AO,-64 + LOAD1x8O 64,8 + END1x8_WITHOUT_ADD + LOAD1x8_2O 128, 16 + mtctr T8 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8_2O 128,16 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + MY_ALIGN + + +CGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L1x8_SUB2_32 + bl CGEMM_1x8_L64_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L1x8_SUB2_16 + bl CGEMM_1x8_L32_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x8_SUB2_8 + bl CGEMM_1x8_L16_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_L2 128,16, 1,0 + KERNEL1x8_L2 128,16, 2,0 + KERNEL1x8_E2 128,16, 3,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x8_SUB2_2 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_E2 128,16, 1,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x8_SUB2_1 + LOAD1x8_2 + KERNEL1x8_E2 128,16, 0,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x8_SAVE + KERNEL1x8 + + MY_ALIGN +CGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt CGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. 
T1, M, 4 + ble CGEMM_L1x4_END + b CGEMM_L1x4_BEGIN + MY_ALIGN + + +CGEMM_L1x8_END: +/*----------------------------------------*/ + + +CGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x4 + ble CGEMM_L1x4_SUB0 + bl CGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x4_SAVE + b CGEMM_L1x4_SUB2 + + +CGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-8 + addi AO,AO,-32 + LOAD1x4O 32,8 + END1x4_WITHOUT_ADD + LOAD1x4_2O 64, 16 + mtctr T8 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4_2O 64,16 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x4_SUB2_8 + bl CGEMM_1x4_L16_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x4_SUB2_4 + bl CGEMM_1x4_L8_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x4_SUB2_2 + LOAD1x4_2 + KERNEL1x4_L2 64,16, 0,0 + KERNEL1x4_E2 64,16, 1,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. 
T1,L, 2 + ble CGEMM_L1x4_SUB2_1 + LOAD1x4_2 + KERNEL1x4_E2 64,16, 0,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x4_SAVE + KERNEL1x4 + + +CGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif + + +CGEMM_L1x4_END: +/*----------------------------------------*/ + + +CGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x2 + ble CGEMM_L1x2_SUB0 + bl CGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x2_SAVE + b CGEMM_L1x2_SUB2 + + +CGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-8 + addi AO,AO,-16 + LOAD1x2O 16,8 + END1x2_WITHOUT_ADD + LOAD1x2_2O 32, 16 + mtctr T8 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2_2O 32,16 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x2_SUB2_8 + bl CGEMM_1x2_L16_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x2_SUB2_4 + bl CGEMM_1x2_L8_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. 
T1,L, 4 + ble CGEMM_L1x2_SUB2_2 + LOAD1x2_2 + KERNEL1x2_L2 32,16, 0,0 + KERNEL1x2_E2 32,16, 1,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x2_SUB2_1 + LOAD1x2_2 + KERNEL1x2_E2 32,16, 0,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x2_SAVE + KERNEL1x2 + + MY_ALIGN +CGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + + +CGEMM_L1x2_END: +/*----------------------------------------*/ + + +CGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x1 + ble CGEMM_L1x1_SUB0 + bl CGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x1_SAVE + b CGEMM_L1x1_SUB2 + + +CGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-8 + addi AO,AO,-8 + LOAD1x1O 8,8 + END1x1_WITHOUT_ADD + LOAD1x1_2O 16, 16 + mtctr T8 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1_2O 16,16 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x1_SUB2_8 + bl CGEMM_1x1_L16_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble CGEMM_L1x1_SUB2_4 + bl CGEMM_1x1_L8_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 16,16, 0,0 + KERNEL1x1_E2 16,16, 1,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 16,16, 0,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x1_SAVE + KERNEL1x1 + + MY_ALIGN +CGEMM_L1x1_SAVE: +/*----------------------------------------*/ + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif + + +CGEMM_L1x1_END: +/*----------------------------------------*/ + slwi T1, K, 3 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + +CGEMM_L1_END: + + + + diff --git a/kernel/power/cgemm_macros_power9.S b/kernel/power/cgemm_macros_power9.S index a256e1a01f..be2b74f013 100644 --- a/kernel/power/cgemm_macros_power9.S +++ b/kernel/power/cgemm_macros_power9.S @@ -1,3019 +1,3019 @@ - -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@gmail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ -#define unit_size 8 -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) -#define DISPX(disp) (disp) - -.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 -#else // CC || CR || RC 
|| RR - /*we will assume {-alpha_r,-alpha_i} for this case */ - /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ - xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1 - /*we will negate alpha image instead to fix sign*/ - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#endif -.endm - - -.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1,VSINR,VSINI_OUT2,VSINI -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#else // CC || CR || RC || RR - /*we will assume {-alpha_r,-alpha_i} for this case */ - /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ - xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1 - /*we will negate alpha image instead to fix sign*/ - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#endif -.endm - -/* {i0,i1} * {alpha_i,alpha_i} [- VSOUT1] ;[VSOUT2 +] {r0,r1}*{alpha_i,alpha_i} */ - -.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 - xvmulsp \VSOUT1,\VSINII, alpha_i - xvmulsp \VSOUT2,\VSINRR, alpha_i -.endm - -/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ - -.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 - xvmsubasp \VSOUT1,\VSINRR, alpha_r - xvmaddasp \VSOUT2,\VSINII, alpha_r -.endm - -/* macros for N=4 and M=8 -**********************************************************************************************/ - -.macro Zero4x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor 
vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endm - - -.macro LOAD4x8 - LOAD4x8O 0,0 -.endm - - -.macro LOAD4x8O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - lxv vs28, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x8_NORMAL - END4x8 AO,BO,64,32 -.endm - - -.macro END4x8_WITHOUT_ADD - END4x8 AO,BO,0,0 -.endm - - -.macro END4x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 
- xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 -.endm - - -.macro LOAD4x8_2 - LOAD4x8_2O 0,0 -.endm - - -.macro LOAD4x8_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs12, (16+\OffsetB)(BO) - lxv vs24, (32+\OffsetB)(BO) - lxv vs28, (32+16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - lxv vs6, (32+\OffsetA)(AO) - lxv vs7, (48+\OffsetA)(AO) - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - lxv vs0, (64+\OffsetA)(AO) - lxv vs1, (64+16+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - lxv vs2, (64+32+\OffsetA)(AO) - lxv vs3, (64+48+\OffsetA)(AO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x8_2 - /*for load2 offset will be 128 and 64*/ - KERNEL4x8_2 AO,BO, 128,64,0 ,1,1 -.endm - - -.macro KERNEL4x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 -.if \Complete==0 - lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, 
vs7,vs12 -.if \Complete==0 - lxv vs8, DISP8(\Index,\OffsetB)(\BREG) - lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask -.endif - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 -.if \Complete==0 - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.endif - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 -.endif -.if \Complete==0 - lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 -.if \Complete==0 - lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif - 
xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endif - -.if \Complete==0 - lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,\OffsetB) - addi \AREG, \AREG, DISP16(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP8(\Index,64) - addi \AREG, \AREG, DISP16(\Index,128) -.endif - -.endif -.endm - - -.macro KERNEL4x8 - LOAD4x8 - END4x8 AO, BO, 64,32 -.endm - - -.macro SAVE4x8 - add T4, LDC,LDC - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask -#ifndef TRMMKERNEL - lxv vs26 , 32(CO) - lxv vs27 , 48(CO) -#endif - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask -#ifndef TRMMKERNEL - lxv vs28 , 0(T1) - lxv vs29 , 16(T1) -#endif - xxperm vs2,vs34,permute_mask - xxperm vs6,vs42,permute_mask -#ifndef TRMMKERNEL - lxv vs30 , 32(T1) - lxv vs31 , 48(T1) -#endif - xxperm vs3,vs35,permute_mask - xxperm vs7,vs43,permute_mask - add T2,CO,T4 - add T3,T1,T4 - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 - xxperm vs10,vs38,permute_mask - xxperm vs14,vs46,permute_mask - AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 - xxperm vs11,vs39,permute_mask - xxperm vs15,vs47,permute_mask - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - xxperm vs0,vs48,permute_mask - xxperm vs4,vs56,permute_mask - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - xxperm vs1,vs49,permute_mask - xxperm vs5,vs57,permute_mask - AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 - xxperm vs2,vs50,permute_mask - xxperm vs6,vs58,permute_mask - AGGREGATE_REALS_IMAGES 
vs39,vs11,vs47,vs15 - xxperm vs3,vs51,permute_mask - xxperm vs7,vs59,permute_mask - AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 - xxperm vs8,vs52,permute_mask - xxperm vs12,vs60,permute_mask - AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 - xxperm vs9,vs53,permute_mask - xxperm vs13,vs61,permute_mask - AGGREGATE_REALS_IMAGES vs50,vs2,vs58,vs6 - xxperm vs10,vs54,permute_mask - xxperm vs14,vs62,permute_mask - AGGREGATE_REALS_IMAGES vs51,vs3,vs59,vs7 - xxperm vs11,vs55,permute_mask - xxperm vs15,vs63,permute_mask - AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 - AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - AGGREGATE_REALS_IMAGES vs54,vs10,vs62,vs14 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - AGGREGATE_REALS_IMAGES vs55,vs11,vs63,vs15 - MULT_APLHA_PART1 vs34,vs42,vs4,vs5 - MULT_APLHA_PART1 vs35,vs43,vs6,vs7 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs34,vs42,vs4,vs5 - MULT_APLHA_PART2 vs35,vs43,vs6,vs7 - #ifndef TRMMKERNEL - lxv vs32 , 0(T2) - lxv vs40 , 16(T2) -#endif - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 -#ifndef TRMMKERNEL - lxv vs33 , 32(T2) - lxv vs41 , 48(T2) -#endif - MULT_APLHA_PART1 vs38,vs46,vs12,vs13 - MULT_APLHA_PART1 vs39,vs47,vs14,vs15 -#ifndef TRMMKERNEL - lxv vs34 , 0(T3) - lxv vs42 , 16(T3) -#endif - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 -#ifndef TRMMKERNEL - lxv vs35 , 32(T3) - lxv vs43 , 48(T3) -#endif - MULT_APLHA_PART2 vs38,vs46,vs12,vs13 - MULT_APLHA_PART2 vs39,vs47,vs14,vs15 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi 
vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs26,vs26,vs5 - xvaddsp vs27,vs27,vs7 - xvaddsp vs28,vs28,vs9 - xvaddsp vs29,vs29,vs11 - xvaddsp vs30,vs30,vs13 - xvaddsp vs31,vs31,vs15 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs12,vs4,2 - xxpermdi vs27,vs14,vs6,2 - xxpermdi vs28,vs0,vs8,2 - xxpermdi vs29,vs2,vs10,2 - xxpermdi vs30,vs4,vs12,2 - xxpermdi vs31,vs6,vs14,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - MULT_APLHA_PART1 vs48,vs56,vs0,vs1 - MULT_APLHA_PART1 vs49,vs57,vs2,vs3 - stxv vs26 , 32(CO) - stxv vs27 , 48(CO) - MULT_APLHA_PART1 vs50,vs58,vs4,vs5 - MULT_APLHA_PART1 vs51,vs59,vs6,vs7 - stxv vs28 , 0(T1) - stxv vs29 , 16(T1) - MULT_APLHA_PART2 vs48,vs56,vs0,vs1 - MULT_APLHA_PART2 vs49,vs57,vs2,vs3 - stxv vs30 , 32(T1) - stxv vs31 , 48(T1) - MULT_APLHA_PART2 vs50,vs58,vs4,vs5 - MULT_APLHA_PART2 vs51,vs59,vs6,vs7 - MULT_APLHA_PART1 vs52,vs60,vs8,vs9 - MULT_APLHA_PART1 vs53,vs61,vs10,vs11 - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - MULT_APLHA_PART1 vs54,vs62,vs12,vs13 - MULT_APLHA_PART1 vs55,vs63,vs14,vs15 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - MULT_APLHA_PART2 vs52,vs60,vs8,vs9 - MULT_APLHA_PART2 vs53,vs61,vs10,vs11 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - MULT_APLHA_PART2 vs54,vs62,vs12,vs13 - MULT_APLHA_PART2 vs55,vs63,vs14,vs15 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs32,vs32,vs1 - xvaddsp vs40,vs40,vs3 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs33,vs33,vs5 - xvaddsp vs41,vs41,vs7 - xvaddsp vs34,vs34,vs9 - xvaddsp vs42,vs42,vs11 - 
xvaddsp vs35,vs35,vs13 - xvaddsp vs43,vs43,vs15 -#else - xxpermdi vs32,vs8,vs0,2 - xxpermdi vs40,vs10,vs2,2 - xxpermdi vs33,vs12,vs4,2 - xxpermdi vs41,vs14,vs6,2 - xxpermdi vs34,vs0,vs8,2 - xxpermdi vs42,vs2,vs10,2 - xxpermdi vs35,vs4,vs12,2 - xxpermdi vs43,vs6,vs14,2 -#endif - stxv vs32 , 0(T2) - stxv vs40 , 16(T2) - stxv vs33 , 32(T2) - stxv vs41 , 48(T2) - stxv vs34 , 0(T3) - stxv vs42 , 16(T3) - stxv vs35 , 32(T3) - stxv vs43 , 48(T3) - addi CO, CO, 64 -.endm - -/* macros for N=4 and M=4 -**********************************************************************************************/ - -.macro Zero4x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 -.endm - - -.macro LOAD4x4 - LOAD4x4O 0,0 -.endm - - -.macro LOAD4x4O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - lxv vs28, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x4_NORMAL - END4x4 AO,BO,32,32 -.endm - - -.macro END4x4_WITHOUT_ADD - END4x4 AO,BO,0,0 -.endm - - -.macro END4x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs52, vs0,vs29 
- xvmaddasp vs53, vs1,vs29 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.endm - - -.macro LOAD4x4_2 - LOAD4x4_2O 0,0 -.endm - - -.macro LOAD4x4_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs12, (16+\OffsetB)(BO) - lxv vs24, (32+\OffsetB)(BO) - lxv vs28, (32+16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - lxv vs0, (32+\OffsetA)(AO) - lxv vs1, (32+16+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x4_2 - /*for load2 offset will be 64 and 64*/ - KERNEL4x4_2 AO,BO, 64,64,0 ,1,1 -.endm - - -.macro KERNEL4x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 -.if \Complete==0 - lxv vs8, DISP8(\Index,\OffsetB)(\BREG) - lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask -.endif - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 -.if \Complete==0 - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.endif -.if \Complete==0 - 
lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 -.if \Complete==0 - lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,\OffsetB) - addi \AREG, \AREG, DISP8(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP8(\Index,64) - addi \AREG, \AREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL4x4 - LOAD4x4 - END4x4 AO, BO, 32,32 -.endm - - -.macro SAVE4x4 - add T4, LDC,LDC - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - add T2,CO,T4 - add T3,T1,T4 -#ifndef TRMMKERNEL - lxv vs26 , 0(T1) - lxv vs27 , 16(T1) -#endif - #ifndef TRMMKERNEL - lxv vs28 , 0(T2) - lxv vs29 , 16(T2) -#endif -#ifndef TRMMKERNEL - lxv vs30 , 0(T3) - lxv vs31 , 16(T3) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - xxperm vs9,vs37,permute_mask - xxperm 
vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - xxperm vs0,vs48,permute_mask - xxperm vs4,vs56,permute_mask - xxperm vs1,vs49,permute_mask - xxperm vs5,vs57,permute_mask - xxperm vs8,vs52,permute_mask - xxperm vs12,vs60,permute_mask - xxperm vs9,vs53,permute_mask - xxperm vs13,vs61,permute_mask - AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 - AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 - AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 - AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART1 vs48,vs56,vs4,vs5 - MULT_APLHA_PART1 vs49,vs57,vs6,vs7 - MULT_APLHA_PART1 vs52,vs60,vs12,vs13 - MULT_APLHA_PART1 vs53,vs61,vs14,vs15 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs48,vs56,vs4,vs5 - MULT_APLHA_PART2 vs49,vs57,vs6,vs7 - MULT_APLHA_PART2 vs52,vs60,vs12,vs13 - MULT_APLHA_PART2 vs53,vs61,vs14,vs15 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xvaddsp vs26,vs26,vs9 - xvaddsp vs27,vs27,vs11 - xvaddsp vs28,vs28,vs5 - xvaddsp vs29,vs29,vs7 - xvaddsp vs30,vs30,vs13 - xvaddsp vs31,vs31,vs15 -#else 
- xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs0,vs8,2 - xxpermdi vs27,vs2,vs10,2 - xxpermdi vs28,vs12,vs4,2 - xxpermdi vs29,vs14,vs6,2 - xxpermdi vs30,vs4,vs12,2 - xxpermdi vs31,vs6,vs14,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 0(T1) - stxv vs27 , 16(T1) - stxv vs28 , 0(T2) - stxv vs29 , 16(T2) - stxv vs30 , 0(T3) - stxv vs31 , 16(T3) - addi CO, CO, 32 -.endm - -/* macros for N=4 and M=2 -**********************************************************************************************/ - -.macro Zero4x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 -.endm - - -.macro LOAD4x2 - LOAD4x2O 0,0 -.endm - - -.macro LOAD4x2O OffsetA,OffsetB - lxv vs24, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - lxv vs1, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END4x2_NORMAL - END4x2 AO,BO,16,32 -.endm - - -.macro END4x2_WITHOUT_ADD - END4x2 AO,BO,0,0 -.endm - - -.macro END4x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.endm - - -.macro LOAD4x2_2 - LOAD4x2_2O 0,0 -.endm - - -.macro LOAD4x2_2O OffsetA,OffsetB - lxv vs8, (\OffsetA)(AO) - lxv vs24, (16+\OffsetA)(AO) - lxv vs4, (0+\OffsetB)(BO) - lxv vs5, (16+\OffsetB)(BO) - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - lxv vs0, (32+\OffsetB)(BO) - lxv vs1, (32+16+\OffsetB)(BO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro 
END4x2_2 - /*for load2 offset will be 32 and 64*/ - KERNEL4x2_2 AO,BO, 32,64,0 ,1,1 -.endm - - -.macro KERNEL4x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) - lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP8(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,32) - addi \BREG, \BREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL4x2 - LOAD4x2 - END4x2 AO, BO, 16,32 -.endm - - -.macro SAVE4x2 - add T4, LDC,LDC - add T1, CO ,LDC - add T2,CO,T4 - add T3,T1,T4 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxv vs25 , 0(T1) -#endif -#ifndef TRMMKERNEL - lxv vs26 
, 0(T2) -#endif -#ifndef TRMMKERNEL - lxv vs27 , 0(T3) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs37,vs9,vs45,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,0 - xxpermdi vs9,vs10,vs2,0 - xxpermdi vs3,vs0,vs8,3 - xxpermdi vs11,vs2,vs10,3 - xvaddsp vs24,vs24,vs1 - xvaddsp vs26,vs26,vs9 - xvaddsp vs25,vs25,vs3 - xvaddsp vs27,vs27,vs11 -#else - xxpermdi vs24,vs8,vs0,0 - xxpermdi vs26,vs10,vs2,0 - xxpermdi vs25,vs0,vs8,3 - xxpermdi vs27,vs2,vs10,3 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 0(T1) - stxv vs26 , 0(T2) - stxv vs27 , 0(T3) - addi CO, CO, 16 -.endm - -/* macros for N=4 and M=2 -**********************************************************************************************/ - -.macro Zero4x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 -.endm - - -.macro LOAD4x1 - LOAD4x1O 0,0 -.endm - - -.macro LOAD4x1O OffsetA,OffsetB - lxsd v4, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - lxv vs1, (\OffsetB+16)(BO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END4x1_NORMAL - END4x1 
AO,BO,8,32 -.endm - - -.macro END4x1_WITHOUT_ADD - END4x1 AO,BO,0,0 -.endm - - -.macro END4x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.endm - - -.macro LOAD4x1_2 - LOAD4x1_2O 0,0 -.endm - - -.macro LOAD4x1_2O OffsetA,OffsetB - lxv vs27, (\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - lxv vs4, (0+\OffsetB)(BO) - lxv vs5, (16+\OffsetB)(BO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask - lxv vs0, (32+\OffsetB)(BO) - lxv vs1, (32+16+\OffsetB)(BO) -.endm - - -.macro END4x1_2 - /*for load2 offset will be 16 and 64*/ - KERNEL4x1_2 AO,BO, 16,64,0 ,1,1 -.endm - - -.macro KERNEL4x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetA)(\AREG) - xxspltd vs8,vs27,1 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) - lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP8(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,16) - addi \BREG, \BREG, 
DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL4x1 - LOAD4x1 - END4x1 AO, BO, 8,32 -.endm - - -.macro SAVE4x1 - add T4, LDC,LDC - add T1, CO ,LDC - add T2,CO,T4 - add T3,T1,T4 -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxsd v5 , 0(T1) -#endif -#ifndef TRMMKERNEL - lxsd v6 , 0(T2) -#endif -#ifndef TRMMKERNEL - lxsd v7 , 0(T3) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxspltd vs1,vs0,0 - xxspltd vs3,vs0,1 - xxspltd vs9,vs2,0 - xxspltd vs11,vs2,1 - /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ - xvaddsp vs36,vs36,vs1 - xvaddsp vs37,vs37,vs3 - xvaddsp vs38,vs38,vs9 - xvaddsp vs39,vs39,vs11 -#else - /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ - xxspltd vs36,vs0,0 - xxspltd vs37,vs0,1 - xxspltd vs38,vs2,0 - xxspltd vs39,vs2,1 -#endif - stxsd v4 , 0(CO) - stxsd v5 , 0(T1) - stxsd v6 , 0(T2) - stxsd v7 , 0(T3) - addi CO, CO, 8 -.endm - -/* macros for N=2 and M=8 -**********************************************************************************************/ - -.macro Zero2x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 -.endm - - -.macro LOAD2x8 - LOAD2x8O 0,0 -.endm - - 
-.macro LOAD2x8O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - xxperm vs26, vs24, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x8_NORMAL - END2x8 AO,BO,64,16 -.endm - - -.macro END2x8_WITHOUT_ADD - END2x8 AO,BO,0,0 -.endm - - -.macro END2x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 -.endm - - -.macro LOAD2x8_2 - LOAD2x8_2O 0,0 -.endm - - -.macro LOAD2x8_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs24, (16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask - lxv vs6, (32+\OffsetA)(AO) - lxv vs7, (48+\OffsetA)(AO) - lxv vs0, (64+\OffsetA)(AO) - lxv vs1, (64+16+\OffsetA)(AO) - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs25, vs24, vs24,2 - lxv vs2, (64+32+\OffsetA)(AO) - lxv vs3, (64+48+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x8_2 - /*for load2 offset will be 128 and 32*/ - KERNEL2x8_2 AO,BO, 128,32,0 ,1,1 -.endm - - -.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp 
vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 -.if \Complete==0 - lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif -.if \Complete==0 - lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \Complete==0 - lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP4(\Index,\OffsetB) - addi \AREG, \AREG, DISP16(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP4(\Index,32) - addi \AREG, \AREG, DISP16(\Index,128) -.endif - -.endif -.endm - - -.macro 
KERNEL2x8 - LOAD2x8 - END2x8 AO, BO, 64,16 -.endm - - -.macro SAVE2x8 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask -#ifndef TRMMKERNEL - lxv vs26 , 32(CO) - lxv vs27 , 48(CO) -#endif - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask -#ifndef TRMMKERNEL - lxv vs28 , 0(T1) - lxv vs29 , 16(T1) -#endif - xxperm vs2,vs34,permute_mask - xxperm vs6,vs42,permute_mask -#ifndef TRMMKERNEL - lxv vs30 , 32(T1) - lxv vs31 , 48(T1) -#endif - xxperm vs3,vs35,permute_mask - xxperm vs7,vs43,permute_mask - add T2,CO,T4 - add T3,T1,T4 - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 - xxperm vs10,vs38,permute_mask - xxperm vs14,vs46,permute_mask - AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 - xxperm vs11,vs39,permute_mask - xxperm vs15,vs47,permute_mask - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 - AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs34,vs42,vs4,vs5 - MULT_APLHA_PART1 vs35,vs43,vs6,vs7 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs34,vs42,vs4,vs5 - MULT_APLHA_PART2 vs35,vs43,vs6,vs7 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART1 vs38,vs46,vs12,vs13 - MULT_APLHA_PART1 vs39,vs47,vs14,vs15 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs38,vs46,vs12,vs13 - MULT_APLHA_PART2 vs39,vs47,vs14,vs15 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs4,vs5, 
save_permute_1 - xxperm vs6,vs7, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs26,vs26,vs5 - xvaddsp vs27,vs27,vs7 - xvaddsp vs28,vs28,vs9 - xvaddsp vs29,vs29,vs11 - xvaddsp vs30,vs30,vs13 - xvaddsp vs31,vs31,vs15 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs12,vs4,2 - xxpermdi vs27,vs14,vs6,2 - xxpermdi vs28,vs0,vs8,2 - xxpermdi vs29,vs2,vs10,2 - xxpermdi vs30,vs4,vs12,2 - xxpermdi vs31,vs6,vs14,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 32(CO) - stxv vs27 , 48(CO) - stxv vs28 , 0(T1) - stxv vs29 , 16(T1) - stxv vs30 , 32(T1) - stxv vs31 , 48(T1) - addi CO, CO, 64 -.endm - -/* macros for N=2 and M=4 -**********************************************************************************************/ - -.macro Zero2x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 -.endm - - -.macro LOAD2x4 - LOAD2x4O 0,0 -.endm - - -.macro LOAD2x4O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x4_NORMAL - END2x4 AO,BO,32,16 -.endm - - -.macro END2x4_WITHOUT_ADD - END2x4 AO,BO,0,0 -.endm - - -.macro END2x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - 
xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.endm - - -.macro LOAD2x4_2 - LOAD2x4_2O 0,0 -.endm - - -.macro LOAD2x4_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs24, (16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs25, vs24, vs24,2 - lxv vs0, (32+\OffsetA)(AO) - lxv vs1, (32+16+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x4_2 - /*for load2 offset will be 64 and 32*/ - KERNEL2x4_2 AO,BO, 64,32,0 ,1,1 -.endm - - -.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \Complete==0 - lxv vs0, 
DISP8(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP4(\Index,\OffsetB) - addi \AREG, \AREG, DISP8(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP4(\Index,32) - addi \AREG, \AREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL2x4 - LOAD2x4 - END2x4 AO, BO, 32,16 -.endm - - -.macro SAVE2x4 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif -#ifndef TRMMKERNEL - lxv vs26 , 0(T1) - lxv vs27 , 16(T1) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xvaddsp vs26,vs26,vs9 - xvaddsp vs27,vs27,vs11 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs0,vs8,2 - xxpermdi vs27,vs2,vs10,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 0(T1) - stxv vs27 , 16(T1) - addi CO, CO, 32 -.endm - -/* 
macros for N=2 and M=2 -**********************************************************************************************/ - -.macro Zero2x2 - xxlxor vs32, vs32, vs32 - xxlxor vs36, vs36, vs36 - xxlxor vs40, vs40, vs40 - xxlxor vs44, vs44, vs44 -.endm - - -.macro LOAD2x2 - LOAD2x2O 0,0 -.endm - - -.macro LOAD2x2O OffsetA,OffsetB - lxv vs24, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x2_NORMAL - END2x2 AO,BO,16,16 -.endm - - -.macro END2x2_WITHOUT_ADD - END2x2 AO,BO,0,0 -.endm - - -.macro END2x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs44, vs0,vs27 -.endm - - -.macro LOAD2x2_2 - LOAD2x2_2O 0,0 -.endm - - -.macro LOAD2x2_2O OffsetA,OffsetB - lxv vs8, (\OffsetA)(AO) - lxv vs24, (16+\OffsetA)(AO) - lxv vs4, (0+\OffsetB)(BO) - lxv vs0, (16+\OffsetB)(BO) - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x2_2 - /*for load2 offset will be 32 and 32*/ - KERNEL2x2_2 AO,BO, 32,32,0 ,1,1 -.endm - - -.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs44, vs4,vs11 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif -.if \Complete==0 - lxv vs4, 
DISP4(\Index,0+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs44, vs0,vs27 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \Complete==0 - lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,32) - addi \BREG, \BREG, DISP4(\Index,32) -.endif - -.endif -.endm - - -.macro KERNEL2x2 - LOAD2x2 - END2x2 AO, BO, 16,16 -.endm - - -.macro SAVE2x2 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxv vs26 , 0(T1) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs8,vs9, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,0 - xxpermdi vs9,vs0,vs8,3 - xvaddsp vs24,vs24,vs1 - xvaddsp vs26,vs26,vs9 -#else - xxpermdi vs24,vs8,vs0,0 - xxpermdi vs26,vs0,vs8,3 -#endif - stxv vs24 , 0(CO) - stxv vs26 , 0(T1) - addi CO, CO, 16 -.endm - -/* macros for N=2 and M=1 -**********************************************************************************************/ - -.macro Zero2x1 - xxlxor vs32, vs32, vs32 - xxlxor vs40, vs40, vs40 -.endm - - -.macro LOAD2x1 - LOAD2x1O 0,0 -.endm - - -.macro LOAD2x1O OffsetA,OffsetB - lxsd v4, 
(\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END2x1_NORMAL - END2x1 AO,BO,8,16 -.endm - - -.macro END2x1_WITHOUT_ADD - END2x1 AO,BO,0,0 -.endm - - -.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.endm - - -.macro LOAD2x1_2 - LOAD2x1_2O 0,0 -.endm - - -.macro LOAD2x1_2O OffsetA,OffsetB - lxv vs27, (\OffsetA)(AO) - lxv vs4, (0+\OffsetB)(BO) - lxv vs0, (16+\OffsetB)(BO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END2x1_2 - /*for load2 offset will be 16 and 32*/ - KERNEL2x1_2 AO,BO, 16,32,0 ,1,1 -.endm - - -.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetA)(\AREG) - xxspltd vs8,vs27,1 -.endif -.if \Complete==0 - lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \Complete==0 - lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,16) - addi \BREG, \BREG, DISP4(\Index,32) -.endif - -.endif -.endm - - -.macro KERNEL2x1 - LOAD2x1 - END2x1 AO, BO, 8,16 -.endm - - -.macro SAVE2x1 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) 
-#endif -#ifndef TRMMKERNEL - lxsd v5 , 0(T1) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxspltd vs1,vs0,0 - xxspltd vs3,vs0,1 - /*--v4==vs36 v5==vs37---*/ - xvaddsp vs36,vs36,vs1 - xvaddsp vs37,vs37,vs3 -#else - /*--v4==vs36 v5==vs37---*/ - xxspltd vs36,vs0,0 - xxspltd vs37,vs0,1 -#endif - stxsd v4 , 0(CO) - stxsd v5 , 0(T1) - addi CO, CO, 8 -.endm - -/* macros for N=1 and M=8 -**********************************************************************************************/ - -.macro Zero1x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 -.endm - - -.macro LOAD1x8 - LOAD1x8O 0,0 -.endm - - -.macro LOAD1x8O OffsetA,OffsetB - lxsd vs4, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x8_NORMAL - END1x8 AO,BO,64,8 -.endm - - -.macro END1x8_WITHOUT_ADD - END1x8 AO,BO,0,0 -.endm - - -.macro END1x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 -.endm - - -.macro LOAD1x8_2 - LOAD1x8_2O 0,0 -.endm - - -.macro LOAD1x8_2O OffsetA,OffsetB - lxv vs27, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, 
(16+\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - lxv vs6, (32+\OffsetA)(AO) - lxv vs7, (48+\OffsetA)(AO) - lxv vs0, (64+\OffsetA)(AO) - lxv vs1, (64+16+\OffsetA)(AO) - lxv vs2, (64+32+\OffsetA)(AO) - lxv vs3, (64+48+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x8_2 - /*for load2 offset will be 128 and 16*/ - KERNEL1x8_2 AO,BO, 128,16,0 ,1,1 -.endm - - -.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 -.if \Complete==0 - lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) -.endif -.if \Complete==0 - xxspltd vs8,vs27,1 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \Complete==0 - lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP16(\Index,\OffsetA) 
-.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP16(\Index,128) -.endif - -.endif -.endm - - -.macro KERNEL1x8 - LOAD1x8 - END1x8 AO, BO, 64,8 -.endm - - -.macro SAVE1x8 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask -#ifndef TRMMKERNEL - lxv vs26 , 32(CO) - lxv vs27 , 48(CO) -#endif - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs2,vs34,permute_mask - xxperm vs6,vs42,permute_mask - xxperm vs3,vs35,permute_mask - xxperm vs7,vs43,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 - AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs34,vs42,vs4,vs5 - MULT_APLHA_PART1 vs35,vs43,vs6,vs7 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs34,vs42,vs4,vs5 - MULT_APLHA_PART2 vs35,vs43,vs6,vs7 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, vs28 - xxperm vs2,vs3, vs28 - xxperm vs4,vs5, vs28 - xxperm vs6,vs7, vs28 -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs24,vs24,vs0 - xvaddsp vs25,vs25,vs2 - xvaddsp vs26,vs26,vs4 - xvaddsp vs27,vs27,vs6 - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 32(CO) - stxv vs27 , 48(CO) -#else -/* reconstruct r,i pairs*/ - stxv vs0 , 0(CO) - stxv vs2 , 16(CO) - stxv vs4 , 32(CO) - stxv vs6 , 48(CO) -#endif - addi CO, CO, 64 -.endm - -/* macros for N=1 and M=4 -**********************************************************************************************/ - -.macro Zero1x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 -.endm - - -.macro LOAD1x4 - LOAD1x4O 0,0 -.endm - - -.macro LOAD1x4O OffsetA,OffsetB - lxsd vs4, 
(\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x4_NORMAL - END1x4 AO,BO,32,8 -.endm - - -.macro END1x4_WITHOUT_ADD - END1x4 AO,BO,0,0 -.endm - - -.macro END1x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.endm - - -.macro LOAD1x4_2 - LOAD1x4_2O 0,0 -.endm - - -.macro LOAD1x4_2O OffsetA,OffsetB - lxv vs27, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - lxv vs0, (32+\OffsetA)(AO) - lxv vs1, (32+16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x4_2 - /*for load2 offset will be 64 and 16*/ - KERNEL1x4_2 AO,BO, 64,16,0 ,1,1 -.endm - - -.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs8,vs27,1 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \IsLast==1 -.if 
\Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP8(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL1x4 - LOAD1x4 - END1x4 AO, BO, 32,8 -.endm - - -.macro SAVE1x4 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, vs28 - xxperm vs2,vs3, vs28 -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs24,vs24,vs0 - xvaddsp vs25,vs25,vs2 - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) -#else -/* reconstruct r,i pairs*/ - stxv vs0 , 0(CO) - stxv vs2 , 16(CO) -#endif - addi CO, CO, 32 -.endm - -/* macros for N=1 and M=2 -**********************************************************************************************/ - -.macro Zero1x2 - xxlxor vs32, vs32, vs32 - xxlxor vs40, vs40, vs40 -.endm - - -.macro LOAD1x2 - LOAD1x2O 0,0 -.endm - - -.macro LOAD1x2O OffsetA,OffsetB - lxsd vs4, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x2_NORMAL - END1x2 AO,BO,16,8 -.endm - - -.macro END1x2_WITHOUT_ADD - END1x2 AO,BO,0,0 -.endm - - -.macro END1x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.endm - - -.macro LOAD1x2_2 - LOAD1x2_2O 0,0 -.endm - - -.macro LOAD1x2_2O OffsetA,OffsetB - lxv vs27, (\OffsetB)(BO) 
- lxv vs4, (0+\OffsetA)(AO) - lxv vs0, (16+\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x2_2 - /*for load2 offset will be 32 and 16*/ - KERNEL1x2_2 AO,BO, 32,16,0 ,1,1 -.endm - - -.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs4, DISP4(\Index,0+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs8,vs27,1 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.if \Complete==0 - lxv vs0, DISP4(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP4(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP4(\Index,32) -.endif - -.endif -.endm - - -.macro KERNEL1x2 - LOAD1x2 - END1x2 AO, BO, 16,8 -.endm - - -.macro SAVE1x2 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, vs28 -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs24,vs24,vs0 - stxv vs24 , 0(CO) -#else -/* reconstruct r,i pairs*/ - stxv vs0 , 0(CO) -#endif - addi CO, CO, 16 -.endm - -/* macros for N=1 and M=1 
-**********************************************************************************************/ -.macro Zero1x1 - xxlxor vs32, vs32, vs32 - xxlxor vs40, vs40, vs40 -.endm - - -.macro LOAD1x1 - LOAD1x1O 0,0 -.endm - - -.macro LOAD1x1O OffsetA,OffsetB - lxsd v4, (\OffsetB+0)(BO) - lxsd v5, (\OffsetA+0)(AO) - xxperm vs38, vs36, permute_mask -.endm - - -.macro END1x1_NORMAL - END1x1 AO,BO,8,8 -.endm - - -.macro END1x1_WITHOUT_ADD - END1x1 AO,BO,0,0 -.endm - - -.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs37,vs36 - xvmaddasp vs40, vs37,vs38 -.endm - - -.macro LOAD1x1_2 - LOAD1x1_2O 0,0 -.endm - - -.macro LOAD1x1_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask -.endm - - -.macro END1x1_2 - /*for load2 offset will be 16 and 16*/ - KERNEL1x1_2 AO,BO, 16,16,0 ,1,1 -.endm - - -.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs8, DISP2(\Index,\OffsetB)(\BREG) - lxv vs4, DISP2(\Index,\OffsetB)(\AREG) - xxperm vs10, vs8, permute_mask -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP2(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP2(\Index,16) -.endif - -.endif -.endm - - -.macro KERNEL1x1 - LOAD1x1 - END1x1 AO, BO, 8,8 -.endm - - -.macro SAVE1x1 -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif - /*aggregate x2*/ - xxpermdi vs33,vs32,vs32,2 - xxpermdi vs41,vs40,vs40,2 - xvaddsp vs32,vs32,vs33 - xvaddsp vs40,vs40,vs41 - - xxperm vs0,vs32,permute_mask - 
xxperm vs4,vs40,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs37,vs1 - MULT_APLHA_PART2 vs32,vs40,vs37,vs1 - -/* reconstruct r,i pairs*/ - xxperm vs37,vs1, vs28 - -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs36,vs36,vs37 - stxsd v4 , 0(CO) -#else - -/* vs37 is v5 */ - stxsd v5 , 0(CO) -#endif - addi CO, CO, 8 -.endm - - - - -/****************************TRMM POINTER REFRESH MACROSES*************************/ - - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 7 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 4 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 3 - .endif -.endm - -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*8; -// ptrbb = bb + off*4; -// #endif -*/ -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+8; // number of values in A -// #else -// temp = off+4; // number of values in B -// #endif -*/ -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp = bk-off;*/ - sub 
\TEMP_BK,\BK_VAL,\OFF_VAL - - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif - -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 8; // number of values in A -// #else -// temp -= 4; // number of values in B -// #endif -// ptrba += temp*8; -// ptrbb += temp*4; -// #endif - -// #ifdef LEFT -// off += 8; // number of values in A -// #endif -*/ - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - - #endif - - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif + +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@gmail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ +#define unit_size 8 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) +#define DISPX(disp) (disp) + +.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubsp 
\VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1 + /*we will negate alpha image instead to fix sign*/ + xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#endif +.endm + + +.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1,VSINR,VSINI_OUT2,VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1 + /*we will negate alpha image instead to fix sign*/ + xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#endif +.endm + +/* {i0,i1} * {alpha_i,alpha_i} [- VSOUT1] ;[VSOUT2 +] {r0,r1}*{alpha_i,alpha_i} */ + +.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 + xvmulsp \VSOUT1,\VSINII, alpha_i + xvmulsp \VSOUT2,\VSINRR, alpha_i +.endm + +/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ + +.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 + xvmsubasp \VSOUT1,\VSINRR, alpha_r + xvmaddasp \VSOUT2,\VSINII, alpha_r +.endm + +/* macros for N=4 and M=8 +**********************************************************************************************/ + +.macro Zero4x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, 
vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + + +.macro LOAD4x8 + LOAD4x8O 0,0 +.endm + + +.macro LOAD4x8O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x8_NORMAL + END4x8 AO,BO,64,32 +.endm + + +.macro END4x8_WITHOUT_ADD + END4x8 AO,BO,0,0 +.endm + + +.macro END4x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + xvmaddasp 
vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.endm + + +.macro LOAD4x8_2 + LOAD4x8_2O 0,0 +.endm + + +.macro LOAD4x8_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs12, (16+\OffsetB)(BO) + lxv vs24, (32+\OffsetB)(BO) + lxv vs28, (32+16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, (64+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x8_2 + /*for load2 offset will be 128 and 64*/ + KERNEL4x8_2 AO,BO, 128,64,0 ,1,1 +.endm + + +.macro KERNEL4x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 +.if 
\Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 +.if \Complete==0 + lxv vs8, DISP8(\Index,\OffsetB)(\BREG) + lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 +.if \Complete==0 + lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, 
permute_mask +.endif + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif + +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index,64) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL4x8 + LOAD4x8 + END4x8 AO, BO, 64,32 +.endm + + +.macro SAVE4x8 + add T4, LDC,LDC + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask +#ifndef TRMMKERNEL + lxv vs28 , 0(T1) + lxv vs29 , 16(T1) +#endif + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask +#ifndef TRMMKERNEL + lxv vs30 , 32(T1) + lxv vs31 , 48(T1) +#endif + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + add T2,CO,T4 + add T3,T1,T4 + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + xxperm vs10,vs38,permute_mask + xxperm vs14,vs46,permute_mask + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + xxperm vs11,vs39,permute_mask + xxperm vs15,vs47,permute_mask + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + xxperm vs0,vs48,permute_mask + xxperm vs4,vs56,permute_mask + AGGREGATE_REALS_IMAGES 
vs37,vs9,vs45,vs13 + xxperm vs1,vs49,permute_mask + xxperm vs5,vs57,permute_mask + AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 + xxperm vs2,vs50,permute_mask + xxperm vs6,vs58,permute_mask + AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 + xxperm vs3,vs51,permute_mask + xxperm vs7,vs59,permute_mask + AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 + xxperm vs8,vs52,permute_mask + xxperm vs12,vs60,permute_mask + AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 + xxperm vs9,vs53,permute_mask + xxperm vs13,vs61,permute_mask + AGGREGATE_REALS_IMAGES vs50,vs2,vs58,vs6 + xxperm vs10,vs54,permute_mask + xxperm vs14,vs62,permute_mask + AGGREGATE_REALS_IMAGES vs51,vs3,vs59,vs7 + xxperm vs11,vs55,permute_mask + xxperm vs15,vs63,permute_mask + AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 + AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + AGGREGATE_REALS_IMAGES vs54,vs10,vs62,vs14 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + AGGREGATE_REALS_IMAGES vs55,vs11,vs63,vs15 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 + #ifndef TRMMKERNEL + lxv vs32 , 0(T2) + lxv vs40 , 16(T2) +#endif + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 +#ifndef TRMMKERNEL + lxv vs33 , 32(T2) + lxv vs41 , 48(T2) +#endif + MULT_APLHA_PART1 vs38,vs46,vs12,vs13 + MULT_APLHA_PART1 vs39,vs47,vs14,vs15 +#ifndef TRMMKERNEL + lxv vs34 , 0(T3) + lxv vs42 , 16(T3) +#endif + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +#ifndef TRMMKERNEL + lxv vs35 , 32(T3) + lxv vs43 , 48(T3) +#endif + MULT_APLHA_PART2 vs38,vs46,vs12,vs13 + MULT_APLHA_PART2 vs39,vs47,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs8,vs9, 
save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs26,vs26,vs5 + xvaddsp vs27,vs27,vs7 + xvaddsp vs28,vs28,vs9 + xvaddsp vs29,vs29,vs11 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs12,vs4,2 + xxpermdi vs27,vs14,vs6,2 + xxpermdi vs28,vs0,vs8,2 + xxpermdi vs29,vs2,vs10,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + MULT_APLHA_PART1 vs48,vs56,vs0,vs1 + MULT_APLHA_PART1 vs49,vs57,vs2,vs3 + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) + MULT_APLHA_PART1 vs50,vs58,vs4,vs5 + MULT_APLHA_PART1 vs51,vs59,vs6,vs7 + stxv vs28 , 0(T1) + stxv vs29 , 16(T1) + MULT_APLHA_PART2 vs48,vs56,vs0,vs1 + MULT_APLHA_PART2 vs49,vs57,vs2,vs3 + stxv vs30 , 32(T1) + stxv vs31 , 48(T1) + MULT_APLHA_PART2 vs50,vs58,vs4,vs5 + MULT_APLHA_PART2 vs51,vs59,vs6,vs7 + MULT_APLHA_PART1 vs52,vs60,vs8,vs9 + MULT_APLHA_PART1 vs53,vs61,vs10,vs11 + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + MULT_APLHA_PART1 vs54,vs62,vs12,vs13 + MULT_APLHA_PART1 vs55,vs63,vs14,vs15 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + MULT_APLHA_PART2 vs52,vs60,vs8,vs9 + MULT_APLHA_PART2 vs53,vs61,vs10,vs11 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + MULT_APLHA_PART2 vs54,vs62,vs12,vs13 + MULT_APLHA_PART2 vs55,vs63,vs14,vs15 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi 
vs11,vs2,vs10,2 + xvaddsp vs32,vs32,vs1 + xvaddsp vs40,vs40,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs33,vs33,vs5 + xvaddsp vs41,vs41,vs7 + xvaddsp vs34,vs34,vs9 + xvaddsp vs42,vs42,vs11 + xvaddsp vs35,vs35,vs13 + xvaddsp vs43,vs43,vs15 +#else + xxpermdi vs32,vs8,vs0,2 + xxpermdi vs40,vs10,vs2,2 + xxpermdi vs33,vs12,vs4,2 + xxpermdi vs41,vs14,vs6,2 + xxpermdi vs34,vs0,vs8,2 + xxpermdi vs42,vs2,vs10,2 + xxpermdi vs35,vs4,vs12,2 + xxpermdi vs43,vs6,vs14,2 +#endif + stxv vs32 , 0(T2) + stxv vs40 , 16(T2) + stxv vs33 , 32(T2) + stxv vs41 , 48(T2) + stxv vs34 , 0(T3) + stxv vs42 , 16(T3) + stxv vs35 , 32(T3) + stxv vs43 , 48(T3) + addi CO, CO, 64 +.endm + +/* macros for N=4 and M=4 +**********************************************************************************************/ + +.macro Zero4x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 +.endm + + +.macro LOAD4x4 + LOAD4x4O 0,0 +.endm + + +.macro LOAD4x4O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x4_NORMAL + END4x4 AO,BO,32,32 +.endm + + +.macro END4x4_WITHOUT_ADD + END4x4 AO,BO,0,0 +.endm + + +.macro END4x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 
+ xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.endm + + +.macro LOAD4x4_2 + LOAD4x4_2O 0,0 +.endm + + +.macro LOAD4x4_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs12, (16+\OffsetB)(BO) + lxv vs24, (32+\OffsetB)(BO) + lxv vs28, (32+16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x4_2 + /*for load2 offset will be 64 and 64*/ + KERNEL4x4_2 AO,BO, 64,64,0 ,1,1 +.endm + + +.macro KERNEL4x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 +.if \Complete==0 + lxv vs8, DISP8(\Index,\OffsetB)(\BREG) + lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask 
+.endif + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 +.if \Complete==0 + lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index,64) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x4 + LOAD4x4 + END4x4 AO, BO, 32,32 +.endm + + +.macro SAVE4x4 + add T4, LDC,LDC + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) + lxv vs27 , 16(T1) +#endif + #ifndef TRMMKERNEL + lxv vs28 , 0(T2) + lxv vs29 , 16(T2) +#endif +#ifndef TRMMKERNEL + lxv vs30 , 0(T3) + lxv vs31 , 16(T3) +#endif + xxperm 
vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + xxperm vs0,vs48,permute_mask + xxperm vs4,vs56,permute_mask + xxperm vs1,vs49,permute_mask + xxperm vs5,vs57,permute_mask + xxperm vs8,vs52,permute_mask + xxperm vs12,vs60,permute_mask + xxperm vs9,vs53,permute_mask + xxperm vs13,vs61,permute_mask + AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 + AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 + AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 + AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART1 vs48,vs56,vs4,vs5 + MULT_APLHA_PART1 vs49,vs57,vs6,vs7 + MULT_APLHA_PART1 vs52,vs60,vs12,vs13 + MULT_APLHA_PART1 vs53,vs61,vs14,vs15 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs48,vs56,vs4,vs5 + MULT_APLHA_PART2 vs49,vs57,vs6,vs7 + MULT_APLHA_PART2 vs52,vs60,vs12,vs13 + MULT_APLHA_PART2 vs53,vs61,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs13,vs4,vs12,2 + xxpermdi 
vs15,vs6,vs14,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xvaddsp vs26,vs26,vs9 + xvaddsp vs27,vs27,vs11 + xvaddsp vs28,vs28,vs5 + xvaddsp vs29,vs29,vs7 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs0,vs8,2 + xxpermdi vs27,vs2,vs10,2 + xxpermdi vs28,vs12,vs4,2 + xxpermdi vs29,vs14,vs6,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 0(T1) + stxv vs27 , 16(T1) + stxv vs28 , 0(T2) + stxv vs29 , 16(T2) + stxv vs30 , 0(T3) + stxv vs31 , 16(T3) + addi CO, CO, 32 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro Zero4x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 +.endm + + +.macro LOAD4x2 + LOAD4x2O 0,0 +.endm + + +.macro LOAD4x2O OffsetA,OffsetB + lxv vs24, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + lxv vs1, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END4x2_NORMAL + END4x2 AO,BO,16,32 +.endm + + +.macro END4x2_WITHOUT_ADD + END4x2 AO,BO,0,0 +.endm + + +.macro END4x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.endm + + +.macro LOAD4x2_2 + LOAD4x2_2O 0,0 +.endm + + +.macro LOAD4x2_2O OffsetA,OffsetB + lxv vs8, (\OffsetA)(AO) + lxv vs24, (16+\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv vs5, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxpermdi 
vs9, vs8, vs8,2 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + lxv vs0, (32+\OffsetB)(BO) + lxv vs1, (32+16+\OffsetB)(BO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END4x2_2 + /*for load2 offset will be 32 and 64*/ + KERNEL4x2_2 AO,BO, 32,64,0 ,1,1 +.endm + + +.macro KERNEL4x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) + lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP8(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,32) + addi \BREG, \BREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x2 + LOAD4x2 + END4x2 AO, BO, 
16,32 +.endm + + +.macro SAVE4x2 + add T4, LDC,LDC + add T1, CO ,LDC + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs25 , 0(T1) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T2) +#endif +#ifndef TRMMKERNEL + lxv vs27 , 0(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs37,vs9,vs45,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,0 + xxpermdi vs9,vs10,vs2,0 + xxpermdi vs3,vs0,vs8,3 + xxpermdi vs11,vs2,vs10,3 + xvaddsp vs24,vs24,vs1 + xvaddsp vs26,vs26,vs9 + xvaddsp vs25,vs25,vs3 + xvaddsp vs27,vs27,vs11 +#else + xxpermdi vs24,vs8,vs0,0 + xxpermdi vs26,vs10,vs2,0 + xxpermdi vs25,vs0,vs8,3 + xxpermdi vs27,vs2,vs10,3 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 0(T1) + stxv vs26 , 0(T2) + stxv vs27 , 0(T3) + addi CO, CO, 16 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro Zero4x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 +.endm + + +.macro LOAD4x1 + LOAD4x1O 0,0 
+.endm + + +.macro LOAD4x1O OffsetA,OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + lxv vs1, (\OffsetB+16)(BO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END4x1_NORMAL + END4x1 AO,BO,8,32 +.endm + + +.macro END4x1_WITHOUT_ADD + END4x1 AO,BO,0,0 +.endm + + +.macro END4x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.endm + + +.macro LOAD4x1_2 + LOAD4x1_2O 0,0 +.endm + + +.macro LOAD4x1_2O OffsetA,OffsetB + lxv vs27, (\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs4, (0+\OffsetB)(BO) + lxv vs5, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + lxv vs0, (32+\OffsetB)(BO) + lxv vs1, (32+16+\OffsetB)(BO) +.endm + + +.macro END4x1_2 + /*for load2 offset will be 16 and 64*/ + KERNEL4x1_2 AO,BO, 16,64,0 ,1,1 +.endm + + +.macro KERNEL4x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetA)(\AREG) + xxspltd vs8,vs27,1 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) + lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs1, 
DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP8(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,16) + addi \BREG, \BREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x1 + LOAD4x1 + END4x1 AO, BO, 8,32 +.endm + + +.macro SAVE4x1 + add T4, LDC,LDC + add T1, CO ,LDC + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) +#endif +#ifndef TRMMKERNEL + lxsd v6 , 0(T2) +#endif +#ifndef TRMMKERNEL + lxsd v7 , 0(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1,vs0,0 + xxspltd vs3,vs0,1 + xxspltd vs9,vs2,0 + xxspltd vs11,vs2,1 + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xvaddsp vs36,vs36,vs1 + xvaddsp vs37,vs37,vs3 + xvaddsp vs38,vs38,vs9 + xvaddsp vs39,vs39,vs11 +#else + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xxspltd vs36,vs0,0 + xxspltd vs37,vs0,1 + xxspltd vs38,vs2,0 + xxspltd vs39,vs2,1 +#endif + stxsd v4 , 0(CO) + stxsd v5 , 0(T1) + stxsd v6 , 0(T2) + stxsd v7 , 0(T3) + addi CO, CO, 8 +.endm + +/* macros for N=2 and M=8 +**********************************************************************************************/ + +.macro Zero2x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + 
xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + + +.macro LOAD2x8 + LOAD2x8O 0,0 +.endm + + +.macro LOAD2x8O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + xxperm vs26, vs24, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x8_NORMAL + END2x8 AO,BO,64,16 +.endm + + +.macro END2x8_WITHOUT_ADD + END2x8 AO,BO,0,0 +.endm + + +.macro END2x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 +.endm + + +.macro LOAD2x8_2 + LOAD2x8_2O 0,0 +.endm + + +.macro LOAD2x8_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs24, (16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, (64+16+\OffsetA)(AO) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs25, vs24, vs24,2 + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x8_2 + /*for load2 offset will be 128 and 32*/ + KERNEL2x8_2 AO,BO, 128,32,0 ,1,1 +.endm + + +.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, 
\Index,\IsLast ,1 +.endm + + +.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 
+.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index,32) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL2x8 + LOAD2x8 + END2x8 AO, BO, 64,16 +.endm + + +.macro SAVE2x8 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask +#ifndef TRMMKERNEL + lxv vs28 , 0(T1) + lxv vs29 , 16(T1) +#endif + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask +#ifndef TRMMKERNEL + lxv vs30 , 32(T1) + lxv vs31 , 48(T1) +#endif + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + add T2,CO,T4 + add T3,T1,T4 + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + xxperm vs10,vs38,permute_mask + xxperm vs14,vs46,permute_mask + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + xxperm vs11,vs39,permute_mask + xxperm vs15,vs47,permute_mask + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 + AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART1 vs38,vs46,vs12,vs13 + MULT_APLHA_PART1 vs39,vs47,vs14,vs15 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + 
MULT_APLHA_PART2 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs38,vs46,vs12,vs13 + MULT_APLHA_PART2 vs39,vs47,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs26,vs26,vs5 + xvaddsp vs27,vs27,vs7 + xvaddsp vs28,vs28,vs9 + xvaddsp vs29,vs29,vs11 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs12,vs4,2 + xxpermdi vs27,vs14,vs6,2 + xxpermdi vs28,vs0,vs8,2 + xxpermdi vs29,vs2,vs10,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) + stxv vs28 , 0(T1) + stxv vs29 , 16(T1) + stxv vs30 , 32(T1) + stxv vs31 , 48(T1) + addi CO, CO, 64 +.endm + +/* macros for N=2 and M=4 +**********************************************************************************************/ + +.macro Zero2x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 +.endm + + +.macro LOAD2x4 + LOAD2x4O 0,0 +.endm + + +.macro LOAD2x4O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x4_NORMAL + END2x4 AO,BO,32,16 +.endm + + +.macro END2x4_WITHOUT_ADD + END2x4 AO,BO,0,0 
+.endm + + +.macro END2x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.endm + + +.macro LOAD2x4_2 + LOAD2x4_2O 0,0 +.endm + + +.macro LOAD2x4_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs24, (16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs25, vs24, vs24,2 + lxv vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x4_2 + /*for load2 offset will be 64 and 32*/ + KERNEL2x4_2 AO,BO, 64,32,0 ,1,1 +.endm + + +.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, 
vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index,32) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL2x4 + LOAD2x4 + END2x4 AO, BO, 32,16 +.endm + + +.macro SAVE2x4 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) + lxv vs27 , 16(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xvaddsp vs26,vs26,vs9 + xvaddsp vs27,vs27,vs11 +#else + xxpermdi 
vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs0,vs8,2 + xxpermdi vs27,vs2,vs10,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 0(T1) + stxv vs27 , 16(T1) + addi CO, CO, 32 +.endm + +/* macros for N=2 and M=2 +**********************************************************************************************/ + +.macro Zero2x2 + xxlxor vs32, vs32, vs32 + xxlxor vs36, vs36, vs36 + xxlxor vs40, vs40, vs40 + xxlxor vs44, vs44, vs44 +.endm + + +.macro LOAD2x2 + LOAD2x2O 0,0 +.endm + + +.macro LOAD2x2O OffsetA,OffsetB + lxv vs24, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x2_NORMAL + END2x2 AO,BO,16,16 +.endm + + +.macro END2x2_WITHOUT_ADD + END2x2 AO,BO,0,0 +.endm + + +.macro END2x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs44, vs0,vs27 +.endm + + +.macro LOAD2x2_2 + LOAD2x2_2O 0,0 +.endm + + +.macro LOAD2x2_2O OffsetA,OffsetB + lxv vs8, (\OffsetA)(AO) + lxv vs24, (16+\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv vs0, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x2_2 + /*for load2 offset will be 32 and 32*/ + KERNEL2x2_2 AO,BO, 32,32,0 ,1,1 +.endm + + +.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs8, 
DISP4(\Index,\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs44, vs4,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs44, vs0,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,32) + addi \BREG, \BREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL2x2 + LOAD2x2 + END2x2 AO, BO, 16,16 +.endm + + +.macro SAVE2x2 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs8,vs9, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,0 + xxpermdi vs9,vs0,vs8,3 + xvaddsp vs24,vs24,vs1 + xvaddsp vs26,vs26,vs9 +#else + xxpermdi vs24,vs8,vs0,0 + xxpermdi vs26,vs0,vs8,3 +#endif + stxv vs24 , 0(CO) + stxv vs26 , 0(T1) + addi CO, CO, 16 +.endm + +/* macros for N=2 and M=1 
+**********************************************************************************************/ + +.macro Zero2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + + +.macro LOAD2x1O OffsetA,OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END2x1_NORMAL + END2x1 AO,BO,8,16 +.endm + + +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.endm + + +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm + + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs27, (\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv vs0, (16+\OffsetB)(BO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END2x1_2 + /*for load2 offset will be 16 and 32*/ + KERNEL2x1_2 AO,BO, 16,32,0 ,1,1 +.endm + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetA)(\AREG) + xxspltd vs8,vs27,1 +.endif +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, 
DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,16) + addi \BREG, \BREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 8,16 +.endm + + +.macro SAVE2x1 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1,vs0,0 + xxspltd vs3,vs0,1 + /*--v4==vs36 v5==vs37---*/ + xvaddsp vs36,vs36,vs1 + xvaddsp vs37,vs37,vs3 +#else + /*--v4==vs36 v5==vs37---*/ + xxspltd vs36,vs0,0 + xxspltd vs37,vs0,1 +#endif + stxsd v4 , 0(CO) + stxsd v5 , 0(T1) + addi CO, CO, 8 +.endm + +/* macros for N=1 and M=8 +**********************************************************************************************/ + +.macro Zero1x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 +.endm + + +.macro LOAD1x8 + LOAD1x8O 0,0 +.endm + + +.macro LOAD1x8O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x8_NORMAL + END1x8 AO,BO,64,8 +.endm + + +.macro END1x8_WITHOUT_ADD + END1x8 AO,BO,0,0 +.endm + + +.macro END1x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp 
vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 +.endm + + +.macro LOAD1x8_2 + LOAD1x8_2O 0,0 +.endm + + +.macro LOAD1x8_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, (64+16+\OffsetA)(AO) + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x8_2 + /*for load2 offset will be 128 and 16*/ + KERNEL1x8_2 AO,BO, 128,16,0 ,1,1 +.endm + + +.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask 
+.endif +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL1x8 + LOAD1x8 + END1x8 AO, BO, 64,8 +.endm + + +.macro SAVE1x8 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 + xxperm vs2,vs3, vs28 + xxperm vs4,vs5, vs28 + xxperm vs6,vs7, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + xvaddsp vs25,vs25,vs2 + xvaddsp vs26,vs26,vs4 + xvaddsp vs27,vs27,vs6 + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) + stxv vs2 , 16(CO) + stxv vs4 , 32(CO) + stxv vs6 , 48(CO) +#endif + addi CO, CO, 64 +.endm + +/* macros for N=1 and M=4 
+**********************************************************************************************/ + +.macro Zero1x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 +.endm + + +.macro LOAD1x4 + LOAD1x4O 0,0 +.endm + + +.macro LOAD1x4O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x4_NORMAL + END1x4 AO,BO,32,8 +.endm + + +.macro END1x4_WITHOUT_ADD + END1x4 AO,BO,0,0 +.endm + + +.macro END1x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.endm + + +.macro LOAD1x4_2 + LOAD1x4_2O 0,0 +.endm + + +.macro LOAD1x4_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x4_2 + /*for load2 offset will be 64 and 16*/ + KERNEL1x4_2 AO,BO, 64,16,0 ,1,1 +.endm + + +.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, 
vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL1x4 + LOAD1x4 + END1x4 AO, BO, 32,8 +.endm + + +.macro SAVE1x4 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 + xxperm vs2,vs3, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + xvaddsp vs25,vs25,vs2 + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) + stxv vs2 , 16(CO) +#endif + addi CO, CO, 32 +.endm + +/* macros for N=1 and M=2 +**********************************************************************************************/ + +.macro Zero1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD1x2 + LOAD1x2O 0,0 +.endm + + +.macro LOAD1x2O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x2_NORMAL + END1x2 AO,BO,16,8 +.endm + + +.macro END1x2_WITHOUT_ADD + END1x2 AO,BO,0,0 +.endm + + +.macro END1x2 AREG, 
BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.endm + + +.macro LOAD1x2_2 + LOAD1x2_2O 0,0 +.endm + + +.macro LOAD1x2_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs0, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x2_2 + /*for load2 offset will be 32 and 16*/ + KERNEL1x2_2 AO,BO, 32,16,0 ,1,1 +.endm + + +.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP4(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL1x2 + LOAD1x2 + END1x2 AO, BO, 16,8 +.endm + + +.macro SAVE1x2 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 
vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + stxv vs24 , 0(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) +#endif + addi CO, CO, 16 +.endm + +/* macros for N=1 and M=1 +**********************************************************************************************/ +.macro Zero1x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD1x1 + LOAD1x1O 0,0 +.endm + + +.macro LOAD1x1O OffsetA,OffsetB + lxsd v4, (\OffsetB+0)(BO) + lxsd v5, (\OffsetA+0)(AO) + xxperm vs38, vs36, permute_mask +.endm + + +.macro END1x1_NORMAL + END1x1 AO,BO,8,8 +.endm + + +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs37,vs36 + xvmaddasp vs40, vs37,vs38 +.endm + + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro LOAD1x1_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask +.endm + + +.macro END1x1_2 + /*for load2 offset will be 16 and 16*/ + KERNEL1x1_2 AO,BO, 16,16,0 ,1,1 +.endm + + +.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs8, DISP2(\Index,\OffsetB)(\BREG) + lxv vs4, DISP2(\Index,\OffsetB)(\AREG) + xxperm vs10, vs8, permute_mask +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP2(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP2(\Index,16) +.endif + 
+.endif +.endm + + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 8,8 +.endm + + +.macro SAVE1x1 +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif + /*aggregate x2*/ + xxpermdi vs33,vs32,vs32,2 + xxpermdi vs41,vs40,vs40,2 + xvaddsp vs32,vs32,vs33 + xvaddsp vs40,vs40,vs41 + + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs37,vs1 + MULT_APLHA_PART2 vs32,vs40,vs37,vs1 + +/* reconstruct r,i pairs*/ + xxperm vs37,vs1, vs28 + +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs36,vs36,vs37 + stxsd v4 , 0(CO) +#else + +/* vs37 is v5 */ + stxsd v5 , 0(CO) +#endif + addi CO, CO, 8 +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 3 + .endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*8; +// ptrbb = bb + off*4; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) 
+// temp = off+8; // number of values in A +// #else +// temp = off+4; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 8; // number of values in A +// #else +// temp -= 4; // number of values in B +// #endif +// ptrba += temp*8; +// ptrbb += temp*4; +// #endif + +// #ifdef LEFT +// off += 8; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/power/cgemv_n.c b/kernel/power/cgemv_n.c index 8663039c57..575847da2e 100644 --- a/kernel/power/cgemv_n.c +++ b/kernel/power/cgemv_n.c @@ -1,597 +1,597 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -#if !defined(__VEC__) || !defined(__ALTIVEC__) -#include "../arm/zgemv_n.c" -#else - -#include -#include -#include "common.h" -#include -#define NBMAX 1024 - - -static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; - - -static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; - register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; - register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; - register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; - register __vector float vx2_r = {x[4], x[4],x[4], x[4]}; - register __vector float vx2_i = {-x[5], x[5],-x[5], x[5]}; - register __vector float vx3_r = {x[6], x[6],x[6], x[6]}; - register __vector float vx3_i = {-x[7], x[7],-x[7], x[7]}; -#else - register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; - register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; - register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; - register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; - register __vector float vx2_r = {x[4], -x[4],x[4], -x[4]}; - register __vector float vx2_i = {x[5], x[5],x[5], x[5]}; - register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]}; - register __vector float vx3_i = {x[7], x[7],x[7], x[7]}; -#endif - register __vector float *vptr_y = (__vector float *) y; - register __vector float *vptr_a0 = (__vector float *) a0; - register __vector float *vptr_a1 = (__vector float *) a1; - register __vector float *vptr_a2 = (__vector float *) a2; - register __vector float *vptr_a3 = (__vector float *) a3; - BLASLONG i = 0; - 
BLASLONG i2=16; - for (;i< n * 8; i+=32,i2+=32) { - register __vector float vy_0 = vec_vsx_ld(i,vptr_y); - register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va2 = vec_vsx_ld(i ,vptr_a2); - register __vector float va3 = vec_vsx_ld(i ,vptr_a3); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); - register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); - - vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r; - vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r; - va0 = vec_perm(va0, va0,swap_mask); - va0_1 = vec_perm(va0_1, va0_1,swap_mask); - va1 = vec_perm(va1, va1,swap_mask); - va1_1 = vec_perm(va1_1, va1_1,swap_mask); - va2 = vec_perm(va2, va2,swap_mask); - va2_1 = vec_perm(va2_1, va2_1,swap_mask); - va3 = vec_perm(va3, va3,swap_mask); - va3_1 = vec_perm(va3_1, va3_1,swap_mask); - vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i; - vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i; - - vec_vsx_st(vy_0 ,i, vptr_y); - vec_vsx_st(vy_1,i2,vptr_y); - } - -} - - - -static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; - register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; - register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; - register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; -#else - register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; - register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; - register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; 
- register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; -#endif - register __vector float *vptr_y = (__vector float *) y; - register __vector float *vptr_a0 = (__vector float *) a0; - register __vector float *vptr_a1 = (__vector float *) a1; - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vy_0 = vec_vsx_ld(i,vptr_y); - register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - - register __vector float va0x = vec_perm(va0, va0,swap_mask); - register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); - register __vector float va1x = vec_perm(va1, va1,swap_mask); - register __vector float va1x_1 = vec_perm(va1_1, va1_1,swap_mask); - vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i; - vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; - - vec_vsx_st(vy_0 ,i, vptr_y); - vec_vsx_st(vy_1,i2,vptr_y); - } - -} - - - -static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; - register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; -#else - register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; - register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; -#endif - register __vector float *vptr_y = (__vector float *) y; - register __vector float *vptr_a0 = (__vector float *) ap; - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vy_0 = vec_vsx_ld(i,vptr_y); - register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector 
float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - - register __vector float va0x = vec_perm(va0, va0,swap_mask); - register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); - vy_0 += va0*vx0_r + va0x*vx0_i; - vy_1 += va0_1*vx0_r + va0x_1*vx0_i; - - vec_vsx_st(vy_0 ,i, vptr_y); - vec_vsx_st(vy_1,i2,vptr_y); - } -} - - - - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i=0; - - - if (inc_dest != 2) { - FLOAT temp_r; - FLOAT temp_i; - for ( i=0; i +#include +#include "common.h" +#include +#define NBMAX 1024 + + +static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + + +static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; + register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; + register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; + register __vector float vx2_r = {x[4], x[4],x[4], x[4]}; + register __vector float vx2_i = {-x[5], x[5],-x[5], x[5]}; + register __vector float vx3_r = {x[6], x[6],x[6], x[6]}; + register __vector float vx3_i = {-x[7], x[7],-x[7], x[7]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; + register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; + register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; + register __vector float vx2_r = {x[4], -x[4],x[4], -x[4]}; + register __vector float vx2_i = {x[5], x[5],x[5], x[5]}; + register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]}; + register __vector float vx3_i = {x[7], 
x[7],x[7], x[7]}; +#endif + register __vector float *vptr_y = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) a0; + register __vector float *vptr_a1 = (__vector float *) a1; + register __vector float *vptr_a2 = (__vector float *) a2; + register __vector float *vptr_a3 = (__vector float *) a3; + BLASLONG i = 0; + BLASLONG i2=16; + for (;i< n * 8; i+=32,i2+=32) { + register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va2 = vec_vsx_ld(i ,vptr_a2); + register __vector float va3 = vec_vsx_ld(i ,vptr_a3); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); + register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); + + vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r; + vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r; + va0 = vec_perm(va0, va0,swap_mask); + va0_1 = vec_perm(va0_1, va0_1,swap_mask); + va1 = vec_perm(va1, va1,swap_mask); + va1_1 = vec_perm(va1_1, va1_1,swap_mask); + va2 = vec_perm(va2, va2,swap_mask); + va2_1 = vec_perm(va2_1, va2_1,swap_mask); + va3 = vec_perm(va3, va3,swap_mask); + va3_1 = vec_perm(va3_1, va3_1,swap_mask); + vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i; + vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i; + + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); + } + +} + + + +static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + register __vector float 
vx0_i = {-x[1], x[1],-x[1], x[1]}; + register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; + register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; + register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; + register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; +#endif + register __vector float *vptr_y = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) a0; + register __vector float *vptr_a1 = (__vector float *) a1; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + + register __vector float va0x = vec_perm(va0, va0,swap_mask); + register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); + register __vector float va1x = vec_perm(va1, va1,swap_mask); + register __vector float va1x_1 = vec_perm(va1_1, va1_1,swap_mask); + vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i; + vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; + + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); + } + +} + + + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; +#endif + register __vector float *vptr_y 
= (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) ap; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + + register __vector float va0x = vec_perm(va0, va0,swap_mask); + register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); + vy_0 += va0*vx0_r + va0x*vx0_i; + vy_1 += va0_1*vx0_r + va0x_1*vx0_i; + + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); + } +} + + + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i=0; + + + if (inc_dest != 2) { + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i -static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; - -static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) - register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp2_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0}; - __vector float* vptr_a0 = (__vector float*) a0; - __vector float* vptr_a1 = (__vector float*) a1; - __vector float* vptr_a2 = (__vector float*) 
a2; - __vector float* vptr_a3 = (__vector float*) a3; - __vector float* v_x = (__vector float*) x; - - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vx_0 = vec_vsx_ld( i,v_x) ; - register __vector float vx_1 = vec_vsx_ld(i2, v_x); - - register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); - register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va2 = vec_vsx_ld(i ,vptr_a2); - register __vector float va3 = vec_vsx_ld(i ,vptr_a3); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); - register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); - - - vtemp0_p += vx_0*va0 + vx_1*va0_1 ; - vtemp0_r += vxr_0*va0 + vxr_1*va0_1; - vtemp1_p += vx_0*va1 + vx_1*va1_1; - vtemp1_r += vxr_0*va1 + vxr_1*va1_1; - vtemp2_p += vx_0*va2 + vx_1*va2_1; - vtemp2_r += vxr_0*va2 + vxr_1*va2_1; - vtemp3_p += vx_0*va3 + vx_1*va3_1; - vtemp3_r += vxr_0*va3 + vxr_1*va3_1; - - } - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; - - register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1] + vtemp2_p[2] - vtemp2_p[3]; - register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1] + vtemp2_r[2] + vtemp2_r[3]; - - register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1] + vtemp3_p[2] - vtemp3_p[3]; - register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1] + vtemp3_r[2] + vtemp3_r[3]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + 
vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; - - register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1] + vtemp2_p[2] + vtemp2_p[3]; - register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1] + vtemp2_r[2] - vtemp2_r[3]; - - register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1] + vtemp3_p[2] + vtemp3_p[3]; - register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1] + vtemp3_r[2] - vtemp3_r[3]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; - y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; - y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; - y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; - y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; - -#endif - -} - - -static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) - register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_p = {0.0, 
0.0,0.0,0.0}; - register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; - - - __vector float* vptr_a0 = (__vector float*) a0; - __vector float* vptr_a1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vx_0 = vec_vsx_ld( i,v_x) ; - register __vector float vx_1 = vec_vsx_ld(i2, v_x); - - register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); - register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - - - vtemp0_p += vx_0*va0 + vx_1*va0_1 ; - vtemp0_r += vxr_0*va0 + vxr_1*va0_1; - vtemp1_p += vx_0*va1 + vx_1*va1_1; - vtemp1_r += vxr_0*va1 + vxr_1*va1_1; - - } -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; - - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * 
temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - -#endif - -} - - -static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) - register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - __vector float* vptr_a0 = (__vector float*) ap; - __vector float* v_x = (__vector float*) x; - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vx_0 = vec_vsx_ld( i,v_x) ; - register __vector float vx_1 = vec_vsx_ld(i2, v_x); - - register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); - register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - - vtemp0_p += vx_0*va0 + vx_1*va0_1 ; - vtemp0_r += vxr_0*va0 + vxr_1*va0_1; - } - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - -#endif - - -} - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest = *src; 
- *(dest + 1) = *(src + 1); - dest += 2; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i=0; - BLASLONG j=0; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[8] __attribute__((aligned(16))); - FLOAT *xbuffer; - - if (m < 1) return (0); - if (n < 1) return (0); - - inc_x <<= 1; - inc_y <<= 1; - lda <<= 1; - - xbuffer = buffer; - - n1 = n >> 2; - n2 = n & 3; - - m3 = m & 3; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x != 2) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - if (inc_y == 2) { - - for (i = 0; i < n1; i++) { - cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 2; - y_ptr += 8; - - } - - if (n2 & 2) { - cgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 1; - y_ptr += 4; - - } - - if (n2 & 1) { - cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda; - y_ptr += 2; - - } - - } else { - - for (i = 0; i < n1; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - - a_ptr += lda << 2; - - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; - - } - - for (i = 0; i < n2; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += 
ybuffer[1]; - y_ptr += inc_y; - - } - - } - a += 2 * NB; - x += NB * inc_x; - } - - if (m3 == 0) return (0); - - x_ptr = x; - j = 0; - a_ptr = a; - y_ptr = y; - - if (m3 == 3) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - if (m3 == 2) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - - while (j < (n & -2)) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; -#else - - 
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } - - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - - return (0); - } - - if (m3 == 1) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - - while (j < (n & -2)) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; 
- a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } - - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - return (0); - -} -#endif +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zgemv_t.c" +#else + +#include "common.h" + +#define NBMAX 1024 +#include +static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + +static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp2_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0}; + __vector float* vptr_a0 = (__vector float*) a0; + __vector float* vptr_a1 = (__vector float*) a1; + __vector float* vptr_a2 = (__vector float*) a2; + __vector float* vptr_a3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vx_0 = vec_vsx_ld( i,v_x) ; + register __vector float vx_1 = vec_vsx_ld(i2, v_x); + + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va2 = vec_vsx_ld(i ,vptr_a2); + register __vector float va3 = vec_vsx_ld(i ,vptr_a3); + 
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); + register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); + + + vtemp0_p += vx_0*va0 + vx_1*va0_1 ; + vtemp0_r += vxr_0*va0 + vxr_1*va0_1; + vtemp1_p += vx_0*va1 + vx_1*va1_1; + vtemp1_r += vxr_0*va1 + vxr_1*va1_1; + vtemp2_p += vx_0*va2 + vx_1*va2_1; + vtemp2_r += vxr_0*va2 + vxr_1*va2_1; + vtemp3_p += vx_0*va3 + vx_1*va3_1; + vtemp3_r += vxr_0*va3 + vxr_1*va3_1; + + } + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; + + register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1] + vtemp2_p[2] - vtemp2_p[3]; + register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1] + vtemp2_r[2] + vtemp2_r[3]; + + register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1] + vtemp3_p[2] - vtemp3_p[3]; + register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1] + vtemp3_r[2] + vtemp3_r[3]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; + + register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1] + vtemp2_p[2] + vtemp2_p[3]; + register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1] + vtemp2_r[2] - vtemp2_r[3]; + + register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1] + vtemp3_p[2] + vtemp3_p[3]; + register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1] + vtemp3_r[2] - vtemp3_r[3]; + +#endif + +#if 
!defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; + +#endif + +} + + +static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; + + + __vector float* vptr_a0 = (__vector float*) a0; + __vector float* vptr_a1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vx_0 = vec_vsx_ld( i,v_x) ; + register __vector float vx_1 = vec_vsx_ld(i2, v_x); + + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, 
vptr_a1); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + + + vtemp0_p += vx_0*va0 + vx_1*va0_1 ; + vtemp0_r += vxr_0*va0 + vxr_1*va0_1; + vtemp1_p += vx_0*va1 + vx_1*va1_1; + vtemp1_r += vxr_0*va1 + vxr_1*va1_1; + + } +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; + + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + +#endif + +} + + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + __vector float* vptr_a0 = (__vector float*) ap; + __vector float* v_x = 
(__vector float*) x; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vx_0 = vec_vsx_ld( i,v_x) ; + register __vector float vx_1 = vec_vsx_ld(i2, v_x); + + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + + vtemp0_p += vx_0*va0 + vx_1*va0_1 ; + vtemp0_r += vxr_0*va0 + vxr_1*va0_1; + } + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + +#endif + + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest = *src; + *(dest + 1) = *(src + 1); + dest += 2; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i=0; + BLASLONG j=0; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + + xbuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m - 
m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 2) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + if (inc_y == 2) { + + for (i = 0; i < n1; i++) { + cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 2; + y_ptr += 8; + + } + + if (n2 & 2) { + cgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 1; + y_ptr += 4; + + } + + if (n2 & 1) { + cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda; + y_ptr += 2; + + } + + } else { + + for (i = 0; i < n1; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + + a_ptr += lda << 2; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for (i = 0; i < n2; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + if (m3 == 0) return (0); + + x_ptr = x; + j = 0; + a_ptr = a; + y_ptr = y; + + if (m3 == 3) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] 
* x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + if (m3 == 2) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + + while (j < (n & -2)) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * 
temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } + + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + + return (0); + } + + if (m3 == 1) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + + while (j < (n & -2)) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - 
alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } + + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + return (0); + +} +#endif diff --git a/kernel/power/crot.c b/kernel/power/crot.c index 84ba5d913a..dbd7e34822 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -1,233 +1,233 @@ -/*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if defined(POWER8) || defined(POWER9) || defined(POWER10) -#if defined(__VEC__) || defined(__ALTIVEC__) - -static void crot_kernel_8 (long n, float *x, float *y, float c, float s) -{ - __vector float t0; - __vector float t1; - __vector float t2; - __vector float t3; - __vector float t4; - __vector float t5; - __vector float t6; - __vector float t7; - __asm__ - ( - "xscvdpspn 36, %x[cos] \n\t" // load c to all words - "xxspltw 36, 36, 0 \n\t" - "xscvdpspn 37, %x[sin] \n\t" // load s to all words - "xxspltw 37, 37, 0 \n\t" - "lxvd2x 32, 0, %[x_ptr] \n\t" // load x - "lxvd2x 33, %[i16], %[x_ptr] \n\t" - "lxvd2x 34, %[i32], %[x_ptr] \n\t" - "lxvd2x 35, %[i48], %[x_ptr] \n\t" - "lxvd2x 48, 0, %[y_ptr] \n\t" // load y - "lxvd2x 49, %[i16], %[y_ptr] \n\t" - "lxvd2x 50, %[i32], %[y_ptr] \n\t" - "lxvd2x 51, %[i48], %[y_ptr] \n\t" - "addi %[x_ptr], %[x_ptr], 64 \n\t" - "addi %[y_ptr], %[y_ptr], 64 \n\t" - "addic. 
%[temp_n], %[temp_n], -8 \n\t" - "ble two%= \n\t" - ".align 5 \n\t" - "one%=: \n\t" - "xvmulsp 40, 32, 36 \n\t" // c * x - "xvmulsp 41, 33, 36 \n\t" - "xvmulsp 42, 34, 36 \n\t" - "xvmulsp 43, 35, 36 \n\t" - "xvmulsp %x[x0], 48, 36 \n\t" // c * y - "xvmulsp %x[x2], 49, 36 \n\t" - "xvmulsp %x[x1], 50, 36 \n\t" - "xvmulsp %x[x3], 51, 36 \n\t" - "xvmulsp 44, 32, 37 \n\t" // s * x - "xvmulsp 45, 33, 37 \n\t" - "lxvd2x 32, 0, %[x_ptr] \n\t" // load x - "lxvd2x 33, %[i16], %[x_ptr] \n\t" - "xvmulsp 46, 34, 37 \n\t" - "xvmulsp 47, 35, 37 \n\t" - "lxvd2x 34, %[i32], %[x_ptr] \n\t" - "lxvd2x 35, %[i48], %[x_ptr] \n\t" - "xvmulsp %x[x4], 48, 37 \n\t" // s * y - "xvmulsp %x[x5], 49, 37 \n\t" - "lxvd2x 48, 0, %[y_ptr] \n\t" // load y - "lxvd2x 49, %[i16], %[y_ptr] \n\t" - "xvmulsp %x[x6], 50, 37 \n\t" - "xvmulsp %x[x7], 51, 37 \n\t" - "lxvd2x 50, %[i32], %[y_ptr] \n\t" - "lxvd2x 51, %[i48], %[y_ptr] \n\t" - "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y - "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y - "addi %[x_ptr], %[x_ptr], -64 \n\t" - "addi %[y_ptr], %[y_ptr], -64 \n\t" - "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y - "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y - "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x - "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x - "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x - "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x - "stxvd2x 40, 0, %[x_ptr] \n\t" // store x - "stxvd2x 41, %[i16], %[x_ptr] \n\t" - "stxvd2x 42, %[i32], %[x_ptr] \n\t" - "stxvd2x 43, %[i48], %[x_ptr] \n\t" - "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y - "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" - "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" - "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" - "addi %[x_ptr], %[x_ptr], 128 \n\t" - "addi %[y_ptr], %[y_ptr], 128 \n\t" - "addic. 
%[temp_n], %[temp_n], -8 \n\t" - "bgt one%= \n\t" - "two%=: \n\t" - "xvmulsp 40, 32, 36 \n\t" // c * x - "xvmulsp 41, 33, 36 \n\t" - "xvmulsp 42, 34, 36 \n\t" - "xvmulsp 43, 35, 36 \n\t" - "xvmulsp %x[x0], 48, 36 \n\t" // c * y - "xvmulsp %x[x2], 49, 36 \n\t" - "xvmulsp %x[x1], 50, 36 \n\t" - "xvmulsp %x[x3], 51, 36 \n\t" - "xvmulsp 44, 32, 37 \n\t" // s * x - "xvmulsp 45, 33, 37 \n\t" - "xvmulsp 46, 34, 37 \n\t" - "xvmulsp 47, 35, 37 \n\t" - "xvmulsp %x[x4], 48, 37 \n\t" // s * y - "xvmulsp %x[x5], 49, 37 \n\t" - "xvmulsp %x[x6], 50, 37 \n\t" - "xvmulsp %x[x7], 51, 37 \n\t" - "addi %[x_ptr], %[x_ptr], -64 \n\t" - "addi %[y_ptr], %[y_ptr], -64 \n\t" - "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y - "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y - "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y - "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y - "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x - "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x - "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x - "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x - "stxvd2x 40, 0, %[x_ptr] \n\t" // store x - "stxvd2x 41, %[i16], %[x_ptr] \n\t" - "stxvd2x 42, %[i32], %[x_ptr] \n\t" - "stxvd2x 43, %[i48], %[x_ptr] \n\t" - "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y - "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" - "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" - "stxvd2x %x[x3], %[i48], %[y_ptr] " - : - [mem_x] "+m" (*(float (*)[2*n])x), - [mem_y] "+m" (*(float (*)[2*n])y), - [temp_n] "+r" (n), - [x_ptr] "+&b" (x), - [y_ptr] "+&b" (y), - [x0] "=wa" (t0), - [x1] "=wa" (t2), - [x2] "=wa" (t1), - [x3] "=wa" (t3), - [x4] "=wa" (t4), - [x5] "=wa" (t5), - [x6] "=wa" (t6), - [x7] "=wa" (t7) - : - [cos] "f" (c), - [sin] "f" (s), - [i16] "b" (16), - [i32] "b" (32), - [i48] "b" (48) - : - "cr0", - "vs32","vs33","vs34","vs35","vs36","vs37", - "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", - "vs48","vs49","vs50","vs51" - ); -} - -#endif -#endif - - -int CNAME(BLASLONG n, FLOAT *x, BLASLONG 
inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1) ) - { -#if defined(__VEC__) || defined(__ALTIVEC__) - BLASLONG n1 = n & -8; - if ( n1 > 0 ) - { - crot_kernel_8(n1, x, y, c, s); - i=n1; - ix=2*n1; - } -#endif - while(i < n) - { - temp[0] = c*x[ix] + s*y[ix] ; - temp[1] = c*x[ix+1] + s*y[ix+1] ; - y[ix] = c*y[ix] - s*x[ix] ; - y[ix+1] = c*y[ix+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += 2 ; - i++ ; - - } - - } - else - { - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - while(i < n) - { - temp[0] = c*x[ix] + s*y[iy] ; - temp[1] = c*x[ix+1] + s*y[iy+1] ; - y[iy] = c*y[iy] - s*x[ix] ; - y[iy+1] = c*y[iy+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - } - return(0); -} - +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) + +static void crot_kernel_8 (long n, float *x, float *y, float c, float s) +{ + __vector float t0; + __vector float t1; + __vector float t2; + __vector float t3; + __vector float t4; + __vector float t5; + __vector float t6; + __vector float t7; + __asm__ + ( + "xscvdpspn 36, %x[cos] \n\t" // load c to all words + "xxspltw 36, 36, 0 \n\t" + "xscvdpspn 37, %x[sin] \n\t" // load s to all words + "xxspltw 37, 37, 0 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 64 \n\t" + "addi %[y_ptr], %[y_ptr], 64 \n\t" + "addic. 
%[temp_n], %[temp_n], -8 \n\t" + "ble two%= \n\t" + ".align 5 \n\t" + "one%=: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 128 \n\t" + "addi %[y_ptr], %[y_ptr], 128 \n\t" + "addic. 
%[temp_n], %[temp_n], -8 \n\t" + "bgt one%= \n\t" + "two%=: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] " + : + [mem_x] "+m" (*(float (*)[2*n])x), + [mem_y] "+m" (*(float (*)[2*n])y), + [temp_n] "+r" (n), + [x_ptr] "+&b" (x), + [y_ptr] "+&b" (y), + [x0] "=wa" (t0), + [x1] "=wa" (t2), + [x2] "=wa" (t1), + [x3] "=wa" (t3), + [x4] "=wa" (t4), + [x5] "=wa" (t5), + [x6] "=wa" (t6), + [x7] "=wa" (t7) + : + [cos] "f" (c), + [sin] "f" (s), + [i16] "b" (16), + [i32] "b" (32), + [i48] "b" (48) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); +} + +#endif +#endif + + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG 
inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { +#if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -8; + if ( n1 > 0 ) + { + crot_kernel_8(n1, x, y, c, s); + i=n1; + ix=2*n1; + } +#endif + while(i < n) + { + temp[0] = c*x[ix] + s*y[ix] ; + temp[1] = c*x[ix+1] + s*y[ix+1] ; + y[ix] = c*y[ix] - s*x[ix] ; + y[ix+1] = c*y[ix+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += 2 ; + i++ ; + + } + + } + else + { + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + } + return(0); +} + diff --git a/kernel/power/dgemm_kernel_power9.S b/kernel/power/dgemm_kernel_power9.S index 2fb1b27ef8..86108f20cd 100644 --- a/kernel/power/dgemm_kernel_power9.S +++ b/kernel/power/dgemm_kernel_power9.S @@ -1,249 +1,249 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - - -#define LOAD ld - - - - -#define STACKSIZE (512 ) -#define ALPHA_SP (296+192)(SP) -#define FZERO (304+192)(SP) - - - -#define M r3 -#define N r4 -#define K r5 - -#define A r7 -#define B r8 -#define C r9 -#define LDC r10 -#define OFFSET r6 - - - -#define alpha_r vs18 - -#define o0 0 - - -#define T4 r12 -#define T3 r11 -#define C4 r14 -#define o8 r15 -#define o24 r16 -#define C2 r17 -#define L r18 -#define T1 r19 -#define C3 r20 -#define TEMP_REG r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define o16 r27 -#define o32 r28 -#define o48 r29 - -#define PRE r30 -#define T2 r31 - -#include "dgemm_macros_power9.S" - - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - addi SP, SP, -STACKSIZE - li r0, 0 - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd 
f31, 136(SP) - - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - - - stfd f1, ALPHA_SP - stw r0, FZERO - - slwi LDC, LDC, BASE_SHIFT - -#if defined(TRMMKERNEL) - ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) -#endif - - - cmpwi cr0, M, 0 - ble .L999_H1 - cmpwi cr0, N, 0 - ble .L999_H1 - cmpwi cr0, K, 0 - ble .L999_H1 - - - - addi T1, SP, 296+192 - - - li PRE, 384 - li o8 , 8 - li o16, 16 - li o24, 24 - li o32, 32 - li o48, 48 - - - lxvdsx alpha_r, 0, T1 - -#include "dgemm_logic_power9.S" - -.L999: - addi r3, 0, 0 - - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE 
- blr - - EPILOGUE -#endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld + + + + +#define STACKSIZE (512 ) +#define ALPHA_SP (296+192)(SP) +#define FZERO (304+192)(SP) + + + +#define M r3 +#define N r4 +#define K r5 + +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 + + + +#define alpha_r vs18 + +#define o0 0 + + +#define T4 r12 +#define T3 r11 +#define C4 r14 +#define o8 r15 +#define o24 r16 +#define C2 r17 +#define L r18 +#define T1 r19 +#define C3 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "dgemm_macros_power9.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + + + stfd f1, ALPHA_SP + stw r0, FZERO + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + + + 
cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 + + + + addi T1, SP, 296+192 + + + li PRE, 384 + li o8 , 8 + li o16, 16 + li o24, 24 + li o32, 32 + li o48, 48 + + + lxvdsx alpha_r, 0, T1 + +#include "dgemm_logic_power9.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/dgemm_logic_power9.S b/kernel/power/dgemm_logic_power9.S index 251839d19e..a48bc685a7 100644 --- a/kernel/power/dgemm_logic_power9.S +++ b/kernel/power/dgemm_logic_power9.S @@ -1,1981 +1,1981 @@ -/*************************************************************************** -Copyright (c) 2013-2019 The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. 
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -#define MY_ALIGN .align 3 - -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - - srawi. J, N, 2 - ble LDGEMM_L4_END - -LDGEMM_L4_BEGIN: - - - li T1, 128 - li T2, 256 - - mr AO, A - mr CO, C - slwi T3, LDC , 2 - add C, C, T3 - - - dcbt A, T1 - dcbt A, T2 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. 
I, M, 4 - ble LDGEMM_L4x16_END - - MY_ALIGN -LDGEMM_L4x16_BEGIN: - - li L, -128 - - - SAVE4x16_REGS - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 -#else - mr BO, B -#endif - - and T1, CO, L - and T2, C2, L - and T3, C3, L - and T4, C4, L - - dcbt T1, r0 - dcbt T2, r0 - dcbt T3, r0 - dcbt T4, r0 - - - addi T1, T1, 128 - addi T2, T2, 128 - addi T3, T3, 128 - addi T4, T4, 128 - - dcbt T1, r0 - dcbt T2, r0 - dcbt T3, r0 - dcbt T4, r0 - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T3,K,TEMP_REG,16,4 - srawi. L, T3, 5 -#else - srawi. L, K, 5 -#endif - - ble LDGEMM_L4x16_SUB0 - - - MY_ALIGN -LDGEMM_L4x16_LOOP_START: - - li T2, 512 - - - LOAD4x16_1 - ##OffsetA=128 OffsetB=32 - addi AO,AO,2176 - # addi BO,BO,32 - addic. L, L, -1 - - ble LDGEMM_L4x16_LOOP_END - - - mtctr L - - MY_ALIGN - -LDGEMM_L4x16_LOOP: - - #dcbt AO, PRE - KERNEL4x16_I1_L2_2 -2048,32, 0,0 - KERNEL4x16_I1_L2_2 -2048,32, 1,0 - KERNEL4x16_I1_L2_2 -2048,32, 2,0 - KERNEL4x16_I1_L2_2 -2048,32, 3,0 - KERNEL4x16_I1_L2_2 -2048,32, 4,0 - KERNEL4x16_I1_L2_2 -2048,32, 5,0 - KERNEL4x16_I1_L2_2 -2048,32, 6,0 - KERNEL4x16_I1_L2_2 -2048,32, 7,0 - KERNEL4x16_I1_L2_2 -2048,32, 8,0 - KERNEL4x16_I1_L2_2 -2048,32, 9,0 - KERNEL4x16_I1_L2_2 -2048,32, 10,0 - KERNEL4x16_I1_L2_2 -2048,32, 11,0 - KERNEL4x16_I1_L2_2 -2048,32, 12,0 - KERNEL4x16_I1_L2_2 -2048,32, 13,0 - KERNEL4x16_I1_L2_2 -2048,32, 14,0 - KERNEL4x16_I1_L2_2 -2048,32, 15,1 - - - bdnz LDGEMM_L4x16_LOOP - - MY_ALIGN - MY_ALIGN -LDGEMM_L4x16_LOOP_END: - - KERNEL4x16_I1_L2_2 -2048,32, 0,0 - KERNEL4x16_I1_L2_2 -2048,32, 1,0 - KERNEL4x16_I1_L2_2 -2048,32, 2,0 - KERNEL4x16_I1_L2_2 -2048,32, 3,0 - KERNEL4x16_I1_L2_2 -2048,32, 4,0 - KERNEL4x16_I1_L2_2 -2048,32, 5,0 - KERNEL4x16_I1_L2_2 -2048,32, 6,0 - KERNEL4x16_I1_L2_2 -2048,32, 7,0 - KERNEL4x16_I1_L2_2 -2048,32, 8,0 - KERNEL4x16_I1_L2_2 -2048,32, 9,0 - KERNEL4x16_I1_L2_2 -2048,32, 10,0 - KERNEL4x16_I1_L2_2 -2048,32, 11,0 - KERNEL4x16_I1_L2_2 -2048,32, 12,0 - KERNEL4x16_I1_L2_2 -2048,32, 13,0 - 
KERNEL4x16_I1_L2_2 -2048,32, 14,0 - KERNEL4x16_I1_L2_3 -2048,32, 15,1 - b LDGEMM_L4x16_SUB1 - - - MY_ALIGN -LDGEMM_L4x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 31 -#else - andi. L, K, 31 -#endif - KERNEL4x16 1 - - addic. L, L, -1 - ble LDGEMM_L4x16_SAVE - b LDGEMM_L4x16_SUB2 - MY_ALIGN -LDGEMM_L4x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 31 -#else - andi. L, K, 31 -#endif - ble LDGEMM_L4x16_SAVE - MY_ALIGN -LDGEMM_L4x16_SUB2: - - andi. T1,L, 16 - ble LDGEMM_L4x16_SUB2_8 - LOAD4x16_0 - KERNEL4x16_I1_L2_2 128,32, 0,0 - KERNEL4x16_I1_L2_2 128,32, 1,0 - KERNEL4x16_I1_L2_2 128,32, 2,0 - KERNEL4x16_I1_L2_2 128,32, 3,0 - KERNEL4x16_I1_L2_2 128,32, 4,0 - KERNEL4x16_I1_L2_2 128,32, 5,0 - KERNEL4x16_I1_L2_2 128,32, 6,0 - KERNEL4x16_I1_L2_3 128,32, 7,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_8: - andi. T1,L, 8 - ble LDGEMM_L4x16_SUB2_4 - LOAD4x16_0 - KERNEL4x16_I1_L2_2 128,32, 0,0 - KERNEL4x16_I1_L2_2 128,32, 1,0 - KERNEL4x16_I1_L2_2 128,32, 2,0 - KERNEL4x16_I1_L2_3 128,32, 3,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_4: - andi. T1,L, 4 - ble LDGEMM_L4x16_SUB2_2 - LOAD4x16_0 - KERNEL4x16_I1_L2_2 128,32, 0,0 - KERNEL4x16_I1_L2_3 128,32, 1,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_2: - andi. T1,L, 2 - ble LDGEMM_L4x16_SUB2_1 - LOAD4x16_0 - KERNEL4x16_I1_L2_3 128,32, 0,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_1: - andi. T1,L, 1 - ble LDGEMM_L4x16_SAVE - KERNEL4x16 0 -# addic. L, L, -1 -# bgt LDGEMM_L4x16_SUB2 - - MY_ALIGN -LDGEMM_L4x16_SAVE: - SAVE4x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,4 -#endif - addic. I, I, -1 - bgt+ LDGEMM_L4x16_BEGIN - -LDGEMM_L4x16_END: - -LDGEMM_L4x8_BEGIN: - - andi. T2, M, 15 - ble LDGEMM_L4x1_END - - andi. T1, M, 8 - ble LDGEMM_L4x8_END - - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,8,4 - srawi. L, T3, 4 -#else - mr BO, B - srawi. L, K, 4 -#endif - - - ble LDGEMM_L4x8_SUB0 - -LDGEMM_L4x8_LOOP_START: - - - LOAD4x8_1 - ##OffsetA=64 OffsetB=32 - - - addic. 
L, L, -1 - - ble LDGEMM_L4x8_LOOP_END - - mtctr L - MY_ALIGN - -LDGEMM_L4x8_LOOP: - - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_2 64,32, 1,0 - KERNEL4x8_I1_L2_2 64,32, 2,0 - KERNEL4x8_I1_L2_2 64,32, 3,0 - KERNEL4x8_I1_L2_2 64,32, 4,0 - KERNEL4x8_I1_L2_2 64,32, 5,0 - KERNEL4x8_I1_L2_2 64,32, 6,0 - KERNEL4x8_I1_L2_2 64,32, 7,1 - - bdnz LDGEMM_L4x8_LOOP - MY_ALIGN -LDGEMM_L4x8_LOOP_END: - - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_2 64,32, 1,0 - KERNEL4x8_I1_L2_2 64,32, 2,0 - KERNEL4x8_I1_L2_2 64,32, 3,0 - KERNEL4x8_I1_L2_2 64,32, 4,0 - KERNEL4x8_I1_L2_2 64,32, 5,0 - KERNEL4x8_I1_L2_2 64,32, 6,0 - KERNEL4x8_I1_L2_3 64,32, 7,1 - - b LDGEMM_L4x8_SUB1 - MY_ALIGN -LDGEMM_L4x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 15 -#else - andi. L, K, 15 -#endif - KERNEL4x8 1 - - addic. L, L, -1 - ble LDGEMM_L4x8_SAVE - b LDGEMM_L4x8_SUB2 - MY_ALIGN -LDGEMM_L4x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 15 -#else - andi. L, K, 15 -#endif - ble LDGEMM_L4x8_SAVE - MY_ALIGN -LDGEMM_L4x8_SUB2: - - andi. T1,L, 8 - ble LDGEMM_L4x8_SUB2_4 - LOAD4x8_0 - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_2 64,32, 1,0 - KERNEL4x8_I1_L2_2 64,32, 2,0 - KERNEL4x8_I1_L2_3 64,32, 3,1 - MY_ALIGN -LDGEMM_L4x8_SUB2_4: - andi. T1,L, 4 - ble LDGEMM_L4x8_SUB2_2 - LOAD4x8_0 - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_3 64,32, 1,1 - MY_ALIGN -LDGEMM_L4x8_SUB2_2: - andi. T1,L, 2 - ble LDGEMM_L4x8_SUB2_1 - LOAD4x8_0 - KERNEL4x8_I1_L2_3 64,32, 0,1 - MY_ALIGN -LDGEMM_L4x8_SUB2_1: - andi. T1,L, 1 - ble LDGEMM_L4x8_SAVE - KERNEL4x8 0 - - MY_ALIGN -LDGEMM_L4x8_SAVE: - SAVE4x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,4 -#endif -LDGEMM_L4x8_END: - -LDGEMM_L4x4_BEGIN: - - - andi. T1, M, 4 - ble LDGEMM_L4x4_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,4,4 - srawi. L, T3, 3 -#else - mr BO, B - srawi. 
L, K, 3 -#endif - ble LDGEMM_L4x4_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L4x4_SUB4 - -LDGEMM_L4x4_LOOP_START: - - #dcbt AO, PRE - LOAD4x4_1 - KERNEL4x4_I1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - addic. L, L, -2 - ble LDGEMM_L4x4_LOOP_END - - MY_ALIGN - -LDGEMM_L4x4_LOOP: - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - addic. L, L, -1 - bgt LDGEMM_L4x4_LOOP - -LDGEMM_L4x4_LOOP_END: - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - KERNEL4x4_2 - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - KERNEL4x4_E2 - - b LDGEMM_L4x4_SUB1 - -LDGEMM_L4x4_SUB4: - - KERNEL4x4_SUBI1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - - b LDGEMM_L4x4_SUB1 - -LDGEMM_L4x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL4x4_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L4x4_SAVE - b LDGEMM_L4x4_SUB2 - -LDGEMM_L4x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L4x4_SAVE - -LDGEMM_L4x4_SUB2: - - KERNEL4x4_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L4x4_SUB2 - -LDGEMM_L4x4_SAVE: - - SAVE4x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,4 -#endif -LDGEMM_L4x4_END: - -LDGEMM_L4x2_BEGIN: - - - andi. T1, M, 2 - ble LDGEMM_L4x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,2,4 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L4x2_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L4x2_SUB4 - -LDGEMM_L4x2_LOOP_START: - - LOAD4x2_1 - KERNEL4x2_I1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - addic. 
L, L, -2 - ble LDGEMM_L4x2_LOOP_END - - MY_ALIGN - -LDGEMM_L4x2_LOOP: - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - addic. L, L, -1 - bgt LDGEMM_L4x2_LOOP - -LDGEMM_L4x2_LOOP_END: - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_E2 - - b LDGEMM_L4x2_SUB1 - -LDGEMM_L4x2_SUB4: - - KERNEL4x2_SUBI1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - - b LDGEMM_L4x2_SUB1 - -LDGEMM_L4x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL4x2_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L4x2_SAVE - b LDGEMM_L4x2_SUB2 - -LDGEMM_L4x2_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L4x2_SAVE - -LDGEMM_L4x2_SUB2: - - KERNEL4x2_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L4x2_SUB2 - -LDGEMM_L4x2_SAVE: - - SAVE4x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,4 -#endif -LDGEMM_L4x2_END: - -LDGEMM_L4x1_BEGIN: - - - andi. T1, M, 1 - ble LDGEMM_L4x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,1,4 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L4x1_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L4x1_SUB4 - -LDGEMM_L4x1_LOOP_START: - - LOAD4x1_1 - KERNEL4x1_I1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - addic. L, L, -2 - ble LDGEMM_L4x1_LOOP_END - - MY_ALIGN - -LDGEMM_L4x1_LOOP: - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - addic. 
L, L, -1 - bgt LDGEMM_L4x1_LOOP - -LDGEMM_L4x1_LOOP_END: - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_E2 - - b LDGEMM_L4x1_SUB1 - -LDGEMM_L4x1_SUB4: - - KERNEL4x1_SUBI1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - - b LDGEMM_L4x1_SUB1 - -LDGEMM_L4x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL4x1_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L4x1_SAVE - b LDGEMM_L4x1_SUB2 - -LDGEMM_L4x1_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L4x1_SAVE - -LDGEMM_L4x1_SUB2: - - KERNEL4x1_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L4x1_SUB2 - -LDGEMM_L4x1_SAVE: - - SAVE4x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,4 -#endif -LDGEMM_L4x1_END: - - slwi T1, K, 5 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 4 -#endif - addic. J, J, -1 - bgt LDGEMM_L4_BEGIN - - andi. T2, N, 3 - ble .L999 - -LDGEMM_L4_END: - - b LDGEMM_L2_BEGIN - -.L999_H1: - - b .L999 - -LDGEMM_L2_BEGIN: - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - andi. T1, N, 2 - ble LDGEMM_L2_END - mr CO, C - mr AO, A - slwi T1, LDC , 1 - add C, C, T1 - srawi. I, M, 4 - ble LDGEMM_L2x16_END - -LDGEMM_L2x16_BEGIN: - - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,16,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x16_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x16_SUB4 - -LDGEMM_L2x16_LOOP_START: - - #dcbt AO, PRE - LOAD2x16_1 - #dcbt AO, PRE - KERNEL2x16_I1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - addic. 
L, L, -2 - ble LDGEMM_L2x16_LOOP_END - - MY_ALIGN - -LDGEMM_L2x16_LOOP: - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - addic. L, L, -1 - bgt LDGEMM_L2x16_LOOP - -LDGEMM_L2x16_LOOP_END: - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - KERNEL2x16_E2 - - b LDGEMM_L2x16_SUB1 - -LDGEMM_L2x16_SUB4: - - #dcbt AO, PRE - KERNEL2x16_SUBI1 - #dcbt AO, PRE - KERNEL2x16_SUB1 - #dcbt AO, PRE - KERNEL2x16_SUB1 - #dcbt AO, PRE - KERNEL2x16_SUB1 - - KERNEL2x16_SUB1 - KERNEL2x16_SUB1 - KERNEL2x16_SUB1 - KERNEL2x16_SUB1 - - b LDGEMM_L2x16_SUB1 - -LDGEMM_L2x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x16_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x16_SAVE - b LDGEMM_L2x16_SUB2 - -LDGEMM_L2x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x16_SAVE - -LDGEMM_L2x16_SUB2: - - KERNEL2x16_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x16_SUB2 - -LDGEMM_L2x16_SAVE: - - SAVE2x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,2 -#endif - addic. I, I, -1 - bgt LDGEMM_L2x16_BEGIN - -LDGEMM_L2x16_END: - -LDGEMM_L2x8_BEGIN: - - andi. T2, M, 15 - ble LDGEMM_L2x1_END - - andi. T1, M, 8 - ble LDGEMM_L2x8_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,8,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. 
L, K, 3 -#endif - ble LDGEMM_L2x8_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x8_SUB4 - -LDGEMM_L2x8_LOOP_START: - - #dcbt AO, PRE - LOAD2x8_1 - KERNEL2x8_I1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - addic. L, L, -2 - ble LDGEMM_L2x8_LOOP_END - - MY_ALIGN - -LDGEMM_L2x8_LOOP: - - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - addic. L, L, -1 - bgt LDGEMM_L2x8_LOOP - -LDGEMM_L2x8_LOOP_END: - - KERNEL2x8_1 - KERNEL2x8_2 - KERNEL2x8_1 - KERNEL2x8_2 - - KERNEL2x8_1 - KERNEL2x8_2 - KERNEL2x8_1 - KERNEL2x8_E2 - - b LDGEMM_L2x8_SUB1 - -LDGEMM_L2x8_SUB4: - - KERNEL2x8_SUBI1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - - b LDGEMM_L2x8_SUB1 - -LDGEMM_L2x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x8_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x8_SAVE - b LDGEMM_L2x8_SUB2 - -LDGEMM_L2x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x8_SAVE - -LDGEMM_L2x8_SUB2: - - KERNEL2x8_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x8_SUB2 - -LDGEMM_L2x8_SAVE: - - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,2 -#endif -LDGEMM_L2x8_END: - -LDGEMM_L2x4_BEGIN: - - - andi. T1, M, 4 - ble LDGEMM_L2x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,4,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x4_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x4_SUB4 - -LDGEMM_L2x4_LOOP_START: - - LOAD2x4_1 - KERNEL2x4_I1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - addic. 
L, L, -2 - ble LDGEMM_L2x4_LOOP_END - - MY_ALIGN - -LDGEMM_L2x4_LOOP: - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - addic. L, L, -1 - bgt LDGEMM_L2x4_LOOP - -LDGEMM_L2x4_LOOP_END: - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_E2 - - b LDGEMM_L2x4_SUB1 - -LDGEMM_L2x4_SUB4: - - KERNEL2x4_SUBI1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - - b LDGEMM_L2x4_SUB1 - -LDGEMM_L2x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x4_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x4_SAVE - b LDGEMM_L2x4_SUB2 - -LDGEMM_L2x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x4_SAVE - -LDGEMM_L2x4_SUB2: - - KERNEL2x4_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x4_SUB2 - -LDGEMM_L2x4_SAVE: - - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,2 -#endif -LDGEMM_L2x4_END: - -LDGEMM_L2x2_BEGIN: - - - andi. T1, M, 2 - ble LDGEMM_L2x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,2,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x2_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x2_SUB4 - -LDGEMM_L2x2_LOOP_START: - - LOAD2x2_1 - KERNEL2x2_I1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - addic. L, L, -2 - ble LDGEMM_L2x2_LOOP_END - - MY_ALIGN - -LDGEMM_L2x2_LOOP: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - addic. 
L, L, -1 - bgt LDGEMM_L2x2_LOOP - -LDGEMM_L2x2_LOOP_END: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_E2 - - b LDGEMM_L2x2_SUB1 - -LDGEMM_L2x2_SUB4: - - KERNEL2x2_SUBI1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - - b LDGEMM_L2x2_SUB1 - -LDGEMM_L2x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x2_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x2_SAVE - b LDGEMM_L2x2_SUB2 - -LDGEMM_L2x2_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x2_SAVE - -LDGEMM_L2x2_SUB2: - - KERNEL2x2_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x2_SUB2 - -LDGEMM_L2x2_SAVE: - - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,2 -#endif -LDGEMM_L2x2_END: - -LDGEMM_L2x1_BEGIN: - - - andi. T1, M, 1 - ble LDGEMM_L2x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,1,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x1_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x1_SUB4 - -LDGEMM_L2x1_LOOP_START: - - LOAD2x1_1 - KERNEL2x1_I1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - addic. L, L, -2 - ble LDGEMM_L2x1_LOOP_END - - MY_ALIGN - -LDGEMM_L2x1_LOOP: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - addic. L, L, -1 - bgt LDGEMM_L2x1_LOOP - -LDGEMM_L2x1_LOOP_END: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_E2 - - b LDGEMM_L2x1_SUB1 - -LDGEMM_L2x1_SUB4: - - KERNEL2x1_SUBI1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - - b LDGEMM_L2x1_SUB1 - -LDGEMM_L2x1_SUB0: -#if defined(TRMMKERNEL) - andi. 
L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x1_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x1_SAVE - b LDGEMM_L2x1_SUB2 - -LDGEMM_L2x1_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x1_SAVE - -LDGEMM_L2x1_SUB2: - - KERNEL2x1_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x1_SUB2 - -LDGEMM_L2x1_SAVE: - - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,2 -#endif -LDGEMM_L2x1_END: - - slwi T1, K, 4 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif -LDGEMM_L2_END: -LDGEMM_L1_BEGIN: - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - andi. T1, N, 1 - ble LDGEMM_L1_END - mr CO, C - mr AO, A - srawi. I, M, 4 - ble LDGEMM_L1x16_END - -LDGEMM_L1x16_BEGIN: - - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,16,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x16_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x16_SUB4 - -LDGEMM_L1x16_LOOP_START: - - #dcbt AO, PRE - LOAD1x16_1 - #dcbt AO, PRE - KERNEL1x16_I1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - addic. L, L, -2 - ble LDGEMM_L1x16_LOOP_END - - MY_ALIGN - -LDGEMM_L1x16_LOOP: - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - addic. 
L, L, -1 - bgt LDGEMM_L1x16_LOOP - -LDGEMM_L1x16_LOOP_END: - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - KERNEL1x16_E2 - - b LDGEMM_L1x16_SUB1 - -LDGEMM_L1x16_SUB4: - - #dcbt AO, PRE - KERNEL1x16_SUBI1 - #dcbt AO, PRE - KERNEL1x16_SUB1 - #dcbt AO, PRE - KERNEL1x16_SUB1 - #dcbt AO, PRE - KERNEL1x16_SUB1 - - KERNEL1x16_SUB1 - KERNEL1x16_SUB1 - KERNEL1x16_SUB1 - KERNEL1x16_SUB1 - - b LDGEMM_L1x16_SUB1 - -LDGEMM_L1x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x16_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x16_SAVE - b LDGEMM_L1x16_SUB2 - -LDGEMM_L1x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x16_SAVE - -LDGEMM_L1x16_SUB2: - - KERNEL1x16_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x16_SUB2 - -LDGEMM_L1x16_SAVE: - - SAVE1x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,1 -#endif - addic. I, I, -1 - bgt LDGEMM_L1x16_BEGIN - -LDGEMM_L1x16_END: - -LDGEMM_L1x8_BEGIN: - - andi. T2, M, 15 - ble LDGEMM_L1x1_END - - andi. T1, M, 8 - ble LDGEMM_L1x8_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,8,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x8_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x8_SUB4 - -LDGEMM_L1x8_LOOP_START: - - #dcbt AO, PRE - LOAD1x8_1 - KERNEL1x8_I1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - addic. L, L, -2 - ble LDGEMM_L1x8_LOOP_END - - MY_ALIGN - -LDGEMM_L1x8_LOOP: - - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - addic. 
L, L, -1 - bgt LDGEMM_L1x8_LOOP - -LDGEMM_L1x8_LOOP_END: - - KERNEL1x8_1 - KERNEL1x8_2 - KERNEL1x8_1 - KERNEL1x8_2 - - KERNEL1x8_1 - KERNEL1x8_2 - KERNEL1x8_1 - KERNEL1x8_E2 - - b LDGEMM_L1x8_SUB1 - -LDGEMM_L1x8_SUB4: - - KERNEL1x8_SUBI1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - - b LDGEMM_L1x8_SUB1 - -LDGEMM_L1x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x8_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x8_SAVE - b LDGEMM_L1x8_SUB2 - -LDGEMM_L1x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x8_SAVE - -LDGEMM_L1x8_SUB2: - - KERNEL1x8_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x8_SUB2 - -LDGEMM_L1x8_SAVE: - - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,1 -#endif -LDGEMM_L1x8_END: - -LDGEMM_L1x4_BEGIN: - - - andi. T1, M, 4 - ble LDGEMM_L1x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,4,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x4_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x4_SUB4 - -LDGEMM_L1x4_LOOP_START: - - LOAD1x4_1 - KERNEL1x4_I1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - addic. L, L, -2 - ble LDGEMM_L1x4_LOOP_END - - MY_ALIGN - -LDGEMM_L1x4_LOOP: - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - addic. L, L, -1 - bgt LDGEMM_L1x4_LOOP - -LDGEMM_L1x4_LOOP_END: - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_E2 - - b LDGEMM_L1x4_SUB1 - -LDGEMM_L1x4_SUB4: - - KERNEL1x4_SUBI1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - - b LDGEMM_L1x4_SUB1 - -LDGEMM_L1x4_SUB0: -#if defined(TRMMKERNEL) - andi. 
L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x4_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x4_SAVE - b LDGEMM_L1x4_SUB2 - -LDGEMM_L1x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x4_SAVE - -LDGEMM_L1x4_SUB2: - - KERNEL1x4_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x4_SUB2 - -LDGEMM_L1x4_SAVE: - - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,1 -#endif -LDGEMM_L1x4_END: - -LDGEMM_L1x2_BEGIN: - - - andi. T1, M, 2 - ble LDGEMM_L1x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,2,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x2_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x2_SUB4 - -LDGEMM_L1x2_LOOP_START: - - LOAD1x2_1 - KERNEL1x2_I1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - addic. L, L, -2 - ble LDGEMM_L1x2_LOOP_END - - MY_ALIGN - -LDGEMM_L1x2_LOOP: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - addic. L, L, -1 - bgt LDGEMM_L1x2_LOOP - -LDGEMM_L1x2_LOOP_END: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_E2 - - b LDGEMM_L1x2_SUB1 - -LDGEMM_L1x2_SUB4: - - KERNEL1x2_SUBI1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - - b LDGEMM_L1x2_SUB1 - -LDGEMM_L1x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x2_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x2_SAVE - b LDGEMM_L1x2_SUB2 - -LDGEMM_L1x2_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x2_SAVE - -LDGEMM_L1x2_SUB2: - - KERNEL1x2_SUB1 - - addic. 
L, L, -1 - bgt LDGEMM_L1x2_SUB2 - -LDGEMM_L1x2_SAVE: - - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,1 -#endif -LDGEMM_L1x2_END: - -LDGEMM_L1x1_BEGIN: - - - andi. T1, M, 1 - ble LDGEMM_L1x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,1,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x1_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x1_SUB4 - -LDGEMM_L1x1_LOOP_START: - - LOAD1x1_1 - KERNEL1x1_I1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - addic. L, L, -2 - ble LDGEMM_L1x1_LOOP_END - - MY_ALIGN - -LDGEMM_L1x1_LOOP: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - addic. L, L, -1 - bgt LDGEMM_L1x1_LOOP - -LDGEMM_L1x1_LOOP_END: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_E2 - - b LDGEMM_L1x1_SUB1 - -LDGEMM_L1x1_SUB4: - - KERNEL1x1_SUBI1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - - b LDGEMM_L1x1_SUB1 - -LDGEMM_L1x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x1_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x1_SAVE - b LDGEMM_L1x1_SUB2 - -LDGEMM_L1x1_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x1_SAVE - -LDGEMM_L1x1_SUB2: - - KERNEL1x1_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x1_SUB2 - -LDGEMM_L1x1_SAVE: - - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,1 -#endif -LDGEMM_L1x1_END: -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif -LDGEMM_L1_END: +/*************************************************************************** +Copyright (c) 2013-2019 The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define MY_ALIGN .align 3 + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + + srawi. J, N, 2 + ble LDGEMM_L4_END + +LDGEMM_L4_BEGIN: + + + li T1, 128 + li T2, 256 + + mr AO, A + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + + + dcbt A, T1 + dcbt A, T2 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. 
I, M, 4 + ble LDGEMM_L4x16_END + + MY_ALIGN +LDGEMM_L4x16_BEGIN: + + li L, -128 + + + SAVE4x16_REGS + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 +#else + mr BO, B +#endif + + and T1, CO, L + and T2, C2, L + and T3, C3, L + and T4, C4, L + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + + addi T1, T1, 128 + addi T2, T2, 128 + addi T3, T3, 128 + addi T4, T4, 128 + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T3,K,TEMP_REG,16,4 + srawi. L, T3, 5 +#else + srawi. L, K, 5 +#endif + + ble LDGEMM_L4x16_SUB0 + + + MY_ALIGN +LDGEMM_L4x16_LOOP_START: + + li T2, 512 + + + LOAD4x16_1 + ##OffsetA=128 OffsetB=32 + addi AO,AO,2176 + # addi BO,BO,32 + addic. L, L, -1 + + ble LDGEMM_L4x16_LOOP_END + + + mtctr L + + MY_ALIGN + +LDGEMM_L4x16_LOOP: + + #dcbt AO, PRE + KERNEL4x16_I1_L2_2 -2048,32, 0,0 + KERNEL4x16_I1_L2_2 -2048,32, 1,0 + KERNEL4x16_I1_L2_2 -2048,32, 2,0 + KERNEL4x16_I1_L2_2 -2048,32, 3,0 + KERNEL4x16_I1_L2_2 -2048,32, 4,0 + KERNEL4x16_I1_L2_2 -2048,32, 5,0 + KERNEL4x16_I1_L2_2 -2048,32, 6,0 + KERNEL4x16_I1_L2_2 -2048,32, 7,0 + KERNEL4x16_I1_L2_2 -2048,32, 8,0 + KERNEL4x16_I1_L2_2 -2048,32, 9,0 + KERNEL4x16_I1_L2_2 -2048,32, 10,0 + KERNEL4x16_I1_L2_2 -2048,32, 11,0 + KERNEL4x16_I1_L2_2 -2048,32, 12,0 + KERNEL4x16_I1_L2_2 -2048,32, 13,0 + KERNEL4x16_I1_L2_2 -2048,32, 14,0 + KERNEL4x16_I1_L2_2 -2048,32, 15,1 + + + bdnz LDGEMM_L4x16_LOOP + + MY_ALIGN + MY_ALIGN +LDGEMM_L4x16_LOOP_END: + + KERNEL4x16_I1_L2_2 -2048,32, 0,0 + KERNEL4x16_I1_L2_2 -2048,32, 1,0 + KERNEL4x16_I1_L2_2 -2048,32, 2,0 + KERNEL4x16_I1_L2_2 -2048,32, 3,0 + KERNEL4x16_I1_L2_2 -2048,32, 4,0 + KERNEL4x16_I1_L2_2 -2048,32, 5,0 + KERNEL4x16_I1_L2_2 -2048,32, 6,0 + KERNEL4x16_I1_L2_2 -2048,32, 7,0 + KERNEL4x16_I1_L2_2 -2048,32, 8,0 + KERNEL4x16_I1_L2_2 -2048,32, 9,0 + KERNEL4x16_I1_L2_2 -2048,32, 10,0 + KERNEL4x16_I1_L2_2 -2048,32, 11,0 + KERNEL4x16_I1_L2_2 -2048,32, 12,0 + KERNEL4x16_I1_L2_2 -2048,32, 13,0 + 
KERNEL4x16_I1_L2_2 -2048,32, 14,0 + KERNEL4x16_I1_L2_3 -2048,32, 15,1 + b LDGEMM_L4x16_SUB1 + + + MY_ALIGN +LDGEMM_L4x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 31 +#else + andi. L, K, 31 +#endif + KERNEL4x16 1 + + addic. L, L, -1 + ble LDGEMM_L4x16_SAVE + b LDGEMM_L4x16_SUB2 + MY_ALIGN +LDGEMM_L4x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 31 +#else + andi. L, K, 31 +#endif + ble LDGEMM_L4x16_SAVE + MY_ALIGN +LDGEMM_L4x16_SUB2: + + andi. T1,L, 16 + ble LDGEMM_L4x16_SUB2_8 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_2 128,32, 1,0 + KERNEL4x16_I1_L2_2 128,32, 2,0 + KERNEL4x16_I1_L2_2 128,32, 3,0 + KERNEL4x16_I1_L2_2 128,32, 4,0 + KERNEL4x16_I1_L2_2 128,32, 5,0 + KERNEL4x16_I1_L2_2 128,32, 6,0 + KERNEL4x16_I1_L2_3 128,32, 7,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_8: + andi. T1,L, 8 + ble LDGEMM_L4x16_SUB2_4 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_2 128,32, 1,0 + KERNEL4x16_I1_L2_2 128,32, 2,0 + KERNEL4x16_I1_L2_3 128,32, 3,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_4: + andi. T1,L, 4 + ble LDGEMM_L4x16_SUB2_2 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_3 128,32, 1,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_2: + andi. T1,L, 2 + ble LDGEMM_L4x16_SUB2_1 + LOAD4x16_0 + KERNEL4x16_I1_L2_3 128,32, 0,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_1: + andi. T1,L, 1 + ble LDGEMM_L4x16_SAVE + KERNEL4x16 0 +# addic. L, L, -1 +# bgt LDGEMM_L4x16_SUB2 + + MY_ALIGN +LDGEMM_L4x16_SAVE: + SAVE4x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,4 +#endif + addic. I, I, -1 + bgt+ LDGEMM_L4x16_BEGIN + +LDGEMM_L4x16_END: + +LDGEMM_L4x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L4x1_END + + andi. T1, M, 8 + ble LDGEMM_L4x8_END + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,4 + srawi. L, T3, 4 +#else + mr BO, B + srawi. L, K, 4 +#endif + + + ble LDGEMM_L4x8_SUB0 + +LDGEMM_L4x8_LOOP_START: + + + LOAD4x8_1 + ##OffsetA=64 OffsetB=32 + + + addic. 
L, L, -1 + + ble LDGEMM_L4x8_LOOP_END + + mtctr L + MY_ALIGN + +LDGEMM_L4x8_LOOP: + + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_2 64,32, 3,0 + KERNEL4x8_I1_L2_2 64,32, 4,0 + KERNEL4x8_I1_L2_2 64,32, 5,0 + KERNEL4x8_I1_L2_2 64,32, 6,0 + KERNEL4x8_I1_L2_2 64,32, 7,1 + + bdnz LDGEMM_L4x8_LOOP + MY_ALIGN +LDGEMM_L4x8_LOOP_END: + + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_2 64,32, 3,0 + KERNEL4x8_I1_L2_2 64,32, 4,0 + KERNEL4x8_I1_L2_2 64,32, 5,0 + KERNEL4x8_I1_L2_2 64,32, 6,0 + KERNEL4x8_I1_L2_3 64,32, 7,1 + + b LDGEMM_L4x8_SUB1 + MY_ALIGN +LDGEMM_L4x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 15 +#else + andi. L, K, 15 +#endif + KERNEL4x8 1 + + addic. L, L, -1 + ble LDGEMM_L4x8_SAVE + b LDGEMM_L4x8_SUB2 + MY_ALIGN +LDGEMM_L4x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 15 +#else + andi. L, K, 15 +#endif + ble LDGEMM_L4x8_SAVE + MY_ALIGN +LDGEMM_L4x8_SUB2: + + andi. T1,L, 8 + ble LDGEMM_L4x8_SUB2_4 + LOAD4x8_0 + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_3 64,32, 3,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_4: + andi. T1,L, 4 + ble LDGEMM_L4x8_SUB2_2 + LOAD4x8_0 + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_3 64,32, 1,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_2: + andi. T1,L, 2 + ble LDGEMM_L4x8_SUB2_1 + LOAD4x8_0 + KERNEL4x8_I1_L2_3 64,32, 0,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_1: + andi. T1,L, 1 + ble LDGEMM_L4x8_SAVE + KERNEL4x8 0 + + MY_ALIGN +LDGEMM_L4x8_SAVE: + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,4 +#endif +LDGEMM_L4x8_END: + +LDGEMM_L4x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L4x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x4_SUB4 + +LDGEMM_L4x4_LOOP_START: + + #dcbt AO, PRE + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + addic. L, L, -2 + ble LDGEMM_L4x4_LOOP_END + + MY_ALIGN + +LDGEMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + addic. L, L, -1 + bgt LDGEMM_L4x4_LOOP + +LDGEMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b LDGEMM_L4x4_SUB1 + +LDGEMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b LDGEMM_L4x4_SUB1 + +LDGEMM_L4x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x4_SAVE + b LDGEMM_L4x4_SUB2 + +LDGEMM_L4x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x4_SAVE + +LDGEMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x4_SUB2 + +LDGEMM_L4x4_SAVE: + + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,4 +#endif +LDGEMM_L4x4_END: + +LDGEMM_L4x2_BEGIN: + + + andi. T1, M, 2 + ble LDGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x2_SUB4 + +LDGEMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. 
L, L, -2 + ble LDGEMM_L4x2_LOOP_END + + MY_ALIGN + +LDGEMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt LDGEMM_L4x2_LOOP + +LDGEMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b LDGEMM_L4x2_SUB1 + +LDGEMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b LDGEMM_L4x2_SUB1 + +LDGEMM_L4x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x2_SAVE + b LDGEMM_L4x2_SUB2 + +LDGEMM_L4x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x2_SAVE + +LDGEMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x2_SUB2 + +LDGEMM_L4x2_SAVE: + + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,4 +#endif +LDGEMM_L4x2_END: + +LDGEMM_L4x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x1_SUB4 + +LDGEMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble LDGEMM_L4x1_LOOP_END + + MY_ALIGN + +LDGEMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. 
L, L, -1 + bgt LDGEMM_L4x1_LOOP + +LDGEMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b LDGEMM_L4x1_SUB1 + +LDGEMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b LDGEMM_L4x1_SUB1 + +LDGEMM_L4x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x1_SAVE + b LDGEMM_L4x1_SUB2 + +LDGEMM_L4x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x1_SAVE + +LDGEMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x1_SUB2 + +LDGEMM_L4x1_SAVE: + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,4 +#endif +LDGEMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + addic. J, J, -1 + bgt LDGEMM_L4_BEGIN + + andi. T2, N, 3 + ble .L999 + +LDGEMM_L4_END: + + b LDGEMM_L2_BEGIN + +.L999_H1: + + b .L999 + +LDGEMM_L2_BEGIN: + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + andi. T1, N, 2 + ble LDGEMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 4 + ble LDGEMM_L2x16_END + +LDGEMM_L2x16_BEGIN: + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,16,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x16_SUB4 + +LDGEMM_L2x16_LOOP_START: + + #dcbt AO, PRE + LOAD2x16_1 + #dcbt AO, PRE + KERNEL2x16_I1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + addic. 
L, L, -2 + ble LDGEMM_L2x16_LOOP_END + + MY_ALIGN + +LDGEMM_L2x16_LOOP: + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt LDGEMM_L2x16_LOOP + +LDGEMM_L2x16_LOOP_END: + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + KERNEL2x16_E2 + + b LDGEMM_L2x16_SUB1 + +LDGEMM_L2x16_SUB4: + + #dcbt AO, PRE + KERNEL2x16_SUBI1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b LDGEMM_L2x16_SUB1 + +LDGEMM_L2x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x16_SAVE + b LDGEMM_L2x16_SUB2 + +LDGEMM_L2x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x16_SAVE + +LDGEMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x16_SUB2 + +LDGEMM_L2x16_SAVE: + + SAVE2x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,2 +#endif + addic. I, I, -1 + bgt LDGEMM_L2x16_BEGIN + +LDGEMM_L2x16_END: + +LDGEMM_L2x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L2x1_END + + andi. T1, M, 8 + ble LDGEMM_L2x8_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x8_SUB4 + +LDGEMM_L2x8_LOOP_START: + + #dcbt AO, PRE + LOAD2x8_1 + KERNEL2x8_I1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -2 + ble LDGEMM_L2x8_LOOP_END + + MY_ALIGN + +LDGEMM_L2x8_LOOP: + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -1 + bgt LDGEMM_L2x8_LOOP + +LDGEMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b LDGEMM_L2x8_SUB1 + +LDGEMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b LDGEMM_L2x8_SUB1 + +LDGEMM_L2x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x8_SAVE + b LDGEMM_L2x8_SUB2 + +LDGEMM_L2x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x8_SAVE + +LDGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x8_SUB2 + +LDGEMM_L2x8_SAVE: + + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,2 +#endif +LDGEMM_L2x8_END: + +LDGEMM_L2x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x4_SUB4 + +LDGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. 
L, L, -2 + ble LDGEMM_L2x4_LOOP_END + + MY_ALIGN + +LDGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt LDGEMM_L2x4_LOOP + +LDGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b LDGEMM_L2x4_SUB1 + +LDGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b LDGEMM_L2x4_SUB1 + +LDGEMM_L2x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x4_SAVE + b LDGEMM_L2x4_SUB2 + +LDGEMM_L2x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x4_SAVE + +LDGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x4_SUB2 + +LDGEMM_L2x4_SAVE: + + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,2 +#endif +LDGEMM_L2x4_END: + +LDGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble LDGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x2_SUB4 + +LDGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble LDGEMM_L2x2_LOOP_END + + MY_ALIGN + +LDGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. 
L, L, -1 + bgt LDGEMM_L2x2_LOOP + +LDGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b LDGEMM_L2x2_SUB1 + +LDGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b LDGEMM_L2x2_SUB1 + +LDGEMM_L2x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x2_SAVE + b LDGEMM_L2x2_SUB2 + +LDGEMM_L2x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x2_SAVE + +LDGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x2_SUB2 + +LDGEMM_L2x2_SAVE: + + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,2 +#endif +LDGEMM_L2x2_END: + +LDGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x1_SUB4 + +LDGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble LDGEMM_L2x1_LOOP_END + + MY_ALIGN + +LDGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt LDGEMM_L2x1_LOOP + +LDGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b LDGEMM_L2x1_SUB1 + +LDGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b LDGEMM_L2x1_SUB1 + +LDGEMM_L2x1_SUB0: +#if defined(TRMMKERNEL) + andi. 
L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x1_SAVE + b LDGEMM_L2x1_SUB2 + +LDGEMM_L2x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x1_SAVE + +LDGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x1_SUB2 + +LDGEMM_L2x1_SAVE: + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,2 +#endif +LDGEMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif +LDGEMM_L2_END: +LDGEMM_L1_BEGIN: + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + andi. T1, N, 1 + ble LDGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 4 + ble LDGEMM_L1x16_END + +LDGEMM_L1x16_BEGIN: + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,16,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x16_SUB4 + +LDGEMM_L1x16_LOOP_START: + + #dcbt AO, PRE + LOAD1x16_1 + #dcbt AO, PRE + KERNEL1x16_I1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble LDGEMM_L1x16_LOOP_END + + MY_ALIGN + +LDGEMM_L1x16_LOOP: + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + addic. 
L, L, -1 + bgt LDGEMM_L1x16_LOOP + +LDGEMM_L1x16_LOOP_END: + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + KERNEL1x16_E2 + + b LDGEMM_L1x16_SUB1 + +LDGEMM_L1x16_SUB4: + + #dcbt AO, PRE + KERNEL1x16_SUBI1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b LDGEMM_L1x16_SUB1 + +LDGEMM_L1x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x16_SAVE + b LDGEMM_L1x16_SUB2 + +LDGEMM_L1x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x16_SAVE + +LDGEMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x16_SUB2 + +LDGEMM_L1x16_SAVE: + + SAVE1x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,1 +#endif + addic. I, I, -1 + bgt LDGEMM_L1x16_BEGIN + +LDGEMM_L1x16_END: + +LDGEMM_L1x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L1x1_END + + andi. T1, M, 8 + ble LDGEMM_L1x8_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x8_SUB4 + +LDGEMM_L1x8_LOOP_START: + + #dcbt AO, PRE + LOAD1x8_1 + KERNEL1x8_I1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -2 + ble LDGEMM_L1x8_LOOP_END + + MY_ALIGN + +LDGEMM_L1x8_LOOP: + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + addic. 
L, L, -1 + bgt LDGEMM_L1x8_LOOP + +LDGEMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b LDGEMM_L1x8_SUB1 + +LDGEMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b LDGEMM_L1x8_SUB1 + +LDGEMM_L1x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x8_SAVE + b LDGEMM_L1x8_SUB2 + +LDGEMM_L1x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x8_SAVE + +LDGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x8_SUB2 + +LDGEMM_L1x8_SAVE: + + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,1 +#endif +LDGEMM_L1x8_END: + +LDGEMM_L1x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x4_SUB4 + +LDGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble LDGEMM_L1x4_LOOP_END + + MY_ALIGN + +LDGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt LDGEMM_L1x4_LOOP + +LDGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b LDGEMM_L1x4_SUB1 + +LDGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b LDGEMM_L1x4_SUB1 + +LDGEMM_L1x4_SUB0: +#if defined(TRMMKERNEL) + andi. 
L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x4_SAVE + b LDGEMM_L1x4_SUB2 + +LDGEMM_L1x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x4_SAVE + +LDGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x4_SUB2 + +LDGEMM_L1x4_SAVE: + + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,1 +#endif +LDGEMM_L1x4_END: + +LDGEMM_L1x2_BEGIN: + + + andi. T1, M, 2 + ble LDGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x2_SUB4 + +LDGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble LDGEMM_L1x2_LOOP_END + + MY_ALIGN + +LDGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt LDGEMM_L1x2_LOOP + +LDGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b LDGEMM_L1x2_SUB1 + +LDGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b LDGEMM_L1x2_SUB1 + +LDGEMM_L1x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x2_SAVE + b LDGEMM_L1x2_SUB2 + +LDGEMM_L1x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x2_SAVE + +LDGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. 
L, L, -1 + bgt LDGEMM_L1x2_SUB2 + +LDGEMM_L1x2_SAVE: + + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,1 +#endif +LDGEMM_L1x2_END: + +LDGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x1_SUB4 + +LDGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble LDGEMM_L1x1_LOOP_END + + MY_ALIGN + +LDGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt LDGEMM_L1x1_LOOP + +LDGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b LDGEMM_L1x1_SUB1 + +LDGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b LDGEMM_L1x1_SUB1 + +LDGEMM_L1x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x1_SAVE + b LDGEMM_L1x1_SUB2 + +LDGEMM_L1x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x1_SAVE + +LDGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. 
L, L, -1 + bgt LDGEMM_L1x1_SUB2 + +LDGEMM_L1x1_SAVE: + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,1 +#endif +LDGEMM_L1x1_END: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif +LDGEMM_L1_END: diff --git a/kernel/power/dgemm_macros_power9.S b/kernel/power/dgemm_macros_power9.S index c4b8270b82..4eddab24fd 100644 --- a/kernel/power/dgemm_macros_power9.S +++ b/kernel/power/dgemm_macros_power9.S @@ -1,3623 +1,3623 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - -/********************************************************************* -* Macros for N=4, M=16 * -*********************************************************************/ -.macro LOAD4x16_1 - LOAD4x16 1 -.endm - -.macro LOAD4x16_0 - LOAD4x16 0 -.endm -.macro LOAD4x16 Zero - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - - lxv vs4, 64(AO) - lxv vs5, 80(AO) - lxv vs6, 96(AO) - lxv vs7, 112(AO) -.if \Zero==1 - xxlxor vs32,vs32,vs32 - xxlxor vs33,vs33,vs33 - xxlxor vs34,vs34,vs34 - xxlxor vs35,vs35,vs35 - xxlxor vs36,vs36,vs36 - xxlxor vs37,vs37,vs37 - xxlxor vs38,vs38,vs38 - xxlxor vs39,vs39,vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, 
vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endif -.endm - - -#define unit_size 8 -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) - -.macro KERNEL4x16_L1_L2 Index,IsLast - KERNEL4x16_L1_L2_I AO,BO, 0,0,0, \Index,\IsLast,0 -.endm - - - -.macro KERNEL4x16_I1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO,1,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L2_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x16_I2_L2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I \AREG,\BREG,1,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I2_L2_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I2_L2_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x16_L1_L2_I AREG,BREG, First, OffsetA,OffsetB, Index,IsLast ,Complete - -.if \First ==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 -.endif - lxv vs8, DISP32(\Index,0+\OffsetA)(\AREG) - lxv vs9, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs10, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs11, 
DISP32(\Index,48+\OffsetA)(\AREG) -.if \First ==1 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 -.else - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 -.endif - lxv vs28, DISP8(\Index,0 +\OffsetB)(\BREG) - lxv vs30, DISP8(\Index,16 +\OffsetB)(\BREG) - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs31, vs30, vs30,2 -.if \First ==1 - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - - - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - - -.else - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - -.endif - lxv vs12, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs13, DISP32(\Index,80+\OffsetA)(\AREG) -.if \First ==1 - xvmuldp vs52, vs4, vs26 - xvmuldp vs53, vs5, vs26 - xvmuldp vs54, vs6, vs26 - xvmuldp vs55, vs7, vs26 - -.else - xvmaddadp vs52, vs4, vs26 - xvmaddadp vs53, vs5, vs26 - xvmaddadp vs54, vs6, vs26 - xvmaddadp vs55, vs7, vs26 -.endif - lxv vs14, DISP32(\Index,96+\OffsetA)(\AREG) - lxv vs15, DISP32(\Index,112+\OffsetA)(\AREG) -.if \First ==1 - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - - - - xvmuldp vs60, vs4, vs27 - xvmuldp vs61, vs5, vs27 - xvmuldp vs62, vs6, vs27 - xvmuldp vs63, vs7, vs27 - -.else - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - - - - xvmaddadp vs60, vs4, vs27 - xvmaddadp vs61, vs5, vs27 - xvmaddadp vs62, 
vs6, vs27 - xvmaddadp vs63, vs7, vs27 -.endif - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 -.if \Complete==0 - lxv vs0, DISP32(\Index,128+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,144+\OffsetA)(\AREG) -.endif - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 -.if \Complete==0 - lxv vs24, DISP8(\Index,32 +\OffsetB)(\BREG) - lxv vs26, DISP8(\Index,48 +\OffsetB)(\BREG) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endif - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 -.if \Complete==0 - lxv vs2, DISP32(\Index,160+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,176+\OffsetA)(\AREG) -.endif - xvmaddadp vs44, vs12, vs29 - xvmaddadp vs45, vs13, vs29 - xvmaddadp vs46, vs14, vs29 - xvmaddadp vs47, vs15, vs29 - - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - xvmaddadp vs50, vs10, vs30 - xvmaddadp vs51, vs11, vs30 -.if \Complete==0 - lxv vs4, DISP32(\Index,192+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,208+\OffsetA)(\AREG) -.endif - xvmaddadp vs52, vs12, vs30 - xvmaddadp vs53, vs13, vs30 - xvmaddadp vs54, vs14, vs30 - xvmaddadp vs55, vs15, vs30 -.if \Complete==0 - lxv vs6, DISP32(\Index,224+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,240+\OffsetA)(\AREG) -.endif - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - xvmaddadp vs58, vs10, vs31 - xvmaddadp vs59, vs11, vs31 - - - xvmaddadp vs60, vs12, vs31 - - xvmaddadp vs61, vs13, vs31 - xvmaddadp vs62, vs14, vs31 - - xvmaddadp vs63, vs15, vs31 - .if \IsLast==1 - .if \Complete==1 - addi \AREG, \AREG, DISP32(\Index,128+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,32+\OffsetB) - .else - addi \AREG, \AREG, DISP32(\Index,256) - addi \BREG, \BREG, DISP8(\Index,64) - .endif - .endif - - -.endm - - - -.macro KERNEL4x16 First - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, 
vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - lxv vs4, 64(AO) - lxv vs5, 80(AO) - lxv vs6, 96(AO) - lxv vs7, 112(AO) - - - - addi BO, BO, 32 - addi AO, AO, 128 - -.if \First==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - xvmuldp vs52, vs4, vs26 - xvmuldp vs53, vs5, vs26 - xvmuldp vs54, vs6, vs26 - xvmuldp vs55, vs7, vs26 - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - xvmuldp vs60, vs4, vs27 - xvmuldp vs61, vs5, vs27 - xvmuldp vs62, vs6, vs27 - xvmuldp vs63, vs7, vs27 -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - - xvmaddadp vs52, vs4, vs26 - xvmaddadp vs53, vs5, vs26 - xvmaddadp vs54, vs6, vs26 - xvmaddadp vs55, vs7, vs26 - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - xvmaddadp vs60, vs4, vs27 - xvmaddadp vs61, vs5, vs27 - xvmaddadp vs62, vs6, vs27 - xvmaddadp vs63, vs7, vs27 - -.endif -.endm - -.macro 
SAVE4x16_REGS - add C2, CO, LDC - add C3, C2, LDC - add C4, C3, LDC -.endm - -.macro SAVE4x16 -#ifndef TRMMKERNEL - lxv vs0, 0(CO) - lxv vs2, 16(CO) - lxv vs4, 32(CO) - lxv vs6, 48(CO) -#endif - xxpermdi vs8, vs40,vs32,1 - xxpermdi vs9 ,vs32,vs40,1 -#ifndef TRMMKERNEL - lxv vs24, 64(CO) - lxv vs26, 80(CO) - lxv vs28, 96(CO) - lxv vs30, 112(CO) -#endif - xxpermdi vs10, vs41,vs33,1 - xxpermdi vs11 ,vs33,vs41,1 -#ifndef TRMMKERNEL - lxv vs1, 0(C2) - lxv vs3, 16(C2) - lxv vs5, 32(C2) - lxv vs7, 48(C2) -#endif - xxpermdi vs12, vs42,vs34,1 - xxpermdi vs13 ,vs34,vs42,1 -#ifndef TRMMKERNEL - lxv vs25, 64(C2) - lxv vs27, 80(C2) -#endif - xxpermdi vs14, vs43,vs35,1 - xxpermdi vs15 ,vs35,vs43,1 -#ifndef TRMMKERNEL - lxv vs29, 96(C2) - lxv vs31, 112(C2) -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - -#endif - xxpermdi vs8, vs44,vs36,1 - xxpermdi vs9 ,vs36,vs44,1 - xxpermdi vs10, vs45,vs37,1 - xxpermdi vs11 ,vs37,vs45,1 -#ifndef TRMMKERNEL - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r -#endif - xxpermdi vs12, vs46,vs38,1 - xxpermdi vs13 ,vs38,vs46,1 - xxpermdi vs14, vs47,vs39,1 - xxpermdi vs15 ,vs39,vs47,1 - -#ifndef TRMMKERNEL - xvmaddadp vs24, vs8, alpha_r - xvmaddadp vs25, vs9, alpha_r - xvmaddadp vs26, vs10, alpha_r - xvmaddadp vs27, vs11, alpha_r - - xvmaddadp vs28, vs12, alpha_r - xvmaddadp vs29, vs13, alpha_r - xvmaddadp vs30, vs14, alpha_r - xvmaddadp vs31, vs15, alpha_r -#else - xvmuldp vs24, vs8, alpha_r - xvmuldp vs25, vs9, alpha_r - xvmuldp vs26, vs10, alpha_r - xvmuldp vs27, vs11, alpha_r - - xvmuldp vs28, vs12, alpha_r - xvmuldp vs29, vs13, 
alpha_r - xvmuldp vs30, vs14, alpha_r - xvmuldp vs31, vs15, alpha_r - -#endif - stxv vs0, 0(CO) - stxv vs2, 16(CO) - stxv vs4, 32(CO) - stxv vs6, 48(CO) - - stxv vs24, 64(CO) - stxv vs26, 80(CO) - stxv vs28, 96(CO) - stxv vs30, 112(CO) - - stxv vs1, 0(C2) - stxv vs3, 16(C2) - stxv vs5, 32(C2) - stxv vs7, 48(C2) - - stxv vs25, 64(C2) - stxv vs27, 80(C2) - stxv vs29, 96(C2) - stxv vs31, 112(C2) -#ifndef TRMMKERNEL - lxv vs0, 0(C3) - lxv vs2, 16(C3) - lxv vs4, 32(C3) - lxv vs6, 48(C3) -#endif - xxpermdi vs8, vs56,vs48,1 - xxpermdi vs9 ,vs48,vs56,1 -#ifndef TRMMKERNEL - lxv vs24, 64(C3) - lxv vs26, 80(C3) -#endif - xxpermdi vs10, vs57,vs49,1 - xxpermdi vs11 ,vs49,vs57,1 -#ifndef TRMMKERNEL - lxv vs28, 96(C3) - lxv vs30, 112(C3) -#endif - xxpermdi vs12, vs58,vs50,1 - xxpermdi vs13 ,vs50,vs58,1 -#ifndef TRMMKERNEL - lxv vs1, 0(C4) - lxv vs3, 16(C4) -#endif - xxpermdi vs14, vs59,vs51,1 - xxpermdi vs15 ,vs51,vs59,1 -#ifndef TRMMKERNEL - lxv vs5, 32(C4) - lxv vs7, 48(C4) - - lxv vs25, 64(C4) - lxv vs27, 80(C4) - lxv vs29, 96(C4) - lxv vs31, 112(C4) -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - -#endif - - xxpermdi vs8, vs60,vs52,1 - xxpermdi vs9 ,vs52,vs60,1 - xxpermdi vs10, vs61,vs53,1 - xxpermdi vs11 ,vs53,vs61,1 -#ifndef TRMMKERNEL - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r -#endif - - - xxpermdi vs12, vs62,vs54,1 - xxpermdi vs13 ,vs54,vs62,1 - xxpermdi vs14, vs63,vs55,1 - xxpermdi vs15 ,vs55,vs63,1 -#ifndef TRMMKERNEL - xvmaddadp vs24, vs8, alpha_r - xvmaddadp vs25, vs9, alpha_r - xvmaddadp vs26, vs10, alpha_r - xvmaddadp vs27, vs11, alpha_r - - 
xvmaddadp vs28, vs12, alpha_r - xvmaddadp vs29, vs13, alpha_r - xvmaddadp vs30, vs14, alpha_r - xvmaddadp vs31, vs15, alpha_r -#else - xvmuldp vs24, vs8, alpha_r - xvmuldp vs25, vs9, alpha_r - xvmuldp vs26, vs10, alpha_r - xvmuldp vs27, vs11, alpha_r - - xvmuldp vs28, vs12, alpha_r - xvmuldp vs29, vs13, alpha_r - xvmuldp vs30, vs14, alpha_r - xvmuldp vs31, vs15, alpha_r -#endif - stxv vs0, 0(C3) - stxv vs2, 16(C3) - stxv vs4, 32(C3) - stxv vs6, 48(C3) - - stxv vs24, 64(C3) - stxv vs26, 80(C3) - stxv vs28, 96(C3) - stxv vs30, 112(C3) - - stxv vs1, 0(C4) - stxv vs3, 16(C4) - stxv vs5, 32(C4) - stxv vs7, 48(C4) - - stxv vs25, 64(C4) - stxv vs27, 80(C4) - stxv vs29, 96(C4) - stxv vs31, 112(C4) - - addi CO, CO, 128 -.endm - -/********************************************************************* -* Macros for N=4, M=8 * -*********************************************************************/ - -.macro LOAD4x8_1 - LOAD4x8 1 -.endm - -.macro LOAD4x8_0 - LOAD4x8 0 -.endm -.macro LOAD4x8 Zero - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - - -.if \Zero==1 - xxlxor vs32,vs32,vs32 - xxlxor vs33,vs33,vs33 - xxlxor vs34,vs34,vs34 - xxlxor vs35,vs35,vs35 - - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - -.endif -.endm - - - -.macro KERNEL4x8_L1_L2 Index,IsLast - KERNEL4x8_L1_L2_I 0,0,0, \Index,\IsLast,0 -.endm - - - -.macro KERNEL4x8_I1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I 1,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L2_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L2_3 
OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x8_L1_L2_I First, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index,0+\OffsetA)(AO) - lxv vs9, DISP16(\Index,16+\OffsetA)(AO) -.if \First ==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 -.endif - - lxv vs10, DISP16(\Index,32+\OffsetA)(AO) - lxv vs11, DISP16(\Index,48+\OffsetA)(AO) - - - -.if \First ==1 - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - - -.else - - lxv vs28, DISP8(\Index,0 +\OffsetB)(BO) - lxv vs30, DISP8(\Index,16 +\OffsetB)(BO) - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - -.endif - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs31, vs30, vs30,2 -.if \First ==1 - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - -.else - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - -.endif - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(AO) - lxv vs1, DISP16(\Index,80+\OffsetA)(AO) -.endif - - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - -.if \Complete==0 - lxv vs2, DISP16(\Index,96+\OffsetA)(AO) - lxv vs3, DISP16(\Index,112+\OffsetA)(AO) -.endif - - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - 
xvmaddadp vs50, vs10, vs30 - xvmaddadp vs51, vs11, vs30 -.if \Complete==0 - lxv vs24, DISP8(\Index,32 +\OffsetB)(BO) - lxv vs26, DISP8(\Index,48 +\OffsetB)(BO) -.endif - - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - xvmaddadp vs58, vs10, vs31 - xvmaddadp vs59, vs11, vs31 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endif - - .if \IsLast==1 - .if \Complete==1 - addi AO, AO, DISP16(\Index,64+\OffsetA) - addi BO, BO, DISP8(\Index,32+\OffsetB) - .else - addi AO, AO, DISP16(\Index,128) - addi BO, BO, DISP8(\Index,64) - .endif - .endif - - -.endm - - - -.macro KERNEL4x8 First - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - - - - addi BO, BO, 32 - addi AO, AO, 64 - -.if \First==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - - - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - - -.endif -.endm - - - -.macro SAVE4x8 - add T2, CO, LDC - add T3, T2, LDC - add T4, T3, LDC -#ifndef TRMMKERNEL - lxv vs0, 0(CO) - lxv vs2, 16(CO) -#endif - xxpermdi vs8, vs40,vs32,1 - xxpermdi vs9 ,vs32,vs40,1 -#ifndef TRMMKERNEL - lxv 
vs4, 32(CO) - lxv vs6, 48(CO) -#endif - xxpermdi vs10, vs41,vs33,1 - xxpermdi vs11 ,vs33,vs41,1 -#ifndef TRMMKERNEL - lxv vs1, 0(T2) - lxv vs3, 16(T2) -#endif - xxpermdi vs12, vs42,vs34,1 - xxpermdi vs13 ,vs34,vs42,1 -#ifndef TRMMKERNEL - lxv vs5, 32(T2) - lxv vs7, 48(T2) -#endif - xxpermdi vs14, vs43,vs35,1 - xxpermdi vs15 ,vs35,vs43,1 - - - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r - - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r - -#endif - - - stxv vs0, 0(CO) - stxv vs2, 16(CO) - stxv vs4, 32(CO) - stxv vs6, 48(CO) - - - stxv vs1, 0(T2) - stxv vs3, 16(T2) - stxv vs5, 32(T2) - stxv vs7, 48(T2) - - - xxpermdi vs8, vs56,vs48,1 - xxpermdi vs9 ,vs48,vs56,1 -#ifndef TRMMKERNEL - lxv vs0, 0(T3) - lxv vs2, 16(T3) -#endif - xxpermdi vs10, vs57,vs49,1 - xxpermdi vs11 ,vs49,vs57,1 -#ifndef TRMMKERNEL - lxv vs4, 32(T3) - lxv vs6, 48(T3) -#endif - xxpermdi vs12, vs58,vs50,1 - xxpermdi vs13 ,vs50,vs58,1 -#ifndef TRMMKERNEL - lxv vs1, 0(T4) - lxv vs3, 16(T4) -#endif - xxpermdi vs14, vs59,vs51,1 - xxpermdi vs15 ,vs51,vs59,1 -#ifndef TRMMKERNEL - lxv vs5, 32(T4) - lxv vs7, 48(T4) - - - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r - - - - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - - - - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r 
- -#endif - - - stxv vs0, 0(T3) - stxv vs2, 16(T3) - stxv vs4, 32(T3) - stxv vs6, 48(T3) - - - stxv vs1, 0(T4) - stxv vs3, 16(T4) - stxv vs5, 32(T4) - stxv vs7, 48(T4) - - - - addi CO, CO, 64 -.endm - - -/********************************************************************* -* Macros for N=4, M=4 * -*********************************************************************/ - -.macro LOAD4x4_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - -.endm - -.macro KERNEL4x4_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - -.endm - -.macro KERNEL4x4_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - -.endm - -.macro KERNEL4x4_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - -.endm - -.macro KERNEL4x4_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp 
vs41, vs9, vs29 - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - -.endm - -.macro KERNEL4x4_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - -.endm - -.macro KERNEL4x4_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - -.endm - -.macro SAVE4x4 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs48, alpha_r - xvmaddadp vs1, vs49, alpha_r -#else - xvmuldp vs0, vs48, alpha_r - xvmuldp vs1, vs49, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 -#endif - 
-#ifndef TRMMKERNEL - xvmaddadp vs8, vs56, alpha_r - xvmaddadp vs9, vs57, alpha_r -#else - xvmuldp vs8, vs56, alpha_r - xvmuldp vs9, vs57, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - - addi CO, CO, 32 - -.endm - -/********************************************************************* -* Macros for N=4, M=2 * -*********************************************************************/ - -.macro LOAD4x2_1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - -.endm - -.macro KERNEL4x2_I1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - - xvmuldp vs48, vs0, vs26 - - xvmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x2_1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - - xvmaddadp vs48, vs0, vs26 - - xvmaddadp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x2_2 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - - xvmaddadp vs48, vs8, vs30 - - xvmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x2_E2 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - - xvmaddadp vs48, vs8, vs30 - - xvmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x2_SUBI1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - - xvmuldp vs48, vs0, vs26 - - xvmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x2_SUB1 - - lxvd2x vs0, 0, AO - - lxvdsx 
vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - - xvmaddadp vs48, vs0, vs26 - - xvmaddadp vs56, vs0, vs27 - -.endm - -.macro SAVE4x2 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r -#else - xvmuldp vs0, vs32, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r -#else - xvmuldp vs8, vs40, alpha_r -#endif - - stxvd2x vs8, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs48, alpha_r -#else - xvmuldp vs0, vs48, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs56, alpha_r -#else - xvmuldp vs8, vs56, alpha_r -#endif - - stxvd2x vs8, 0, T1 - - addi CO, CO, 16 - -.endm - -/********************************************************************* -* Macros for N=4, M=1 * -*********************************************************************/ - -.macro LOAD4x1_1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - -.endm - -.macro KERNEL4x1_I1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - lxsdx vs30, o16, BO - lxsdx vs31, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - - xsmuldp vs48, vs0, vs26 - - xsmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x1_1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - lxsdx vs30, o16, BO - lxsdx vs31, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - - xsmaddadp vs48, vs0, vs26 - - xsmaddadp vs56, vs0, vs27 - -.endm - -.macro 
KERNEL4x1_2 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - - xsmaddadp vs48, vs8, vs30 - - xsmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x1_E2 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - - xsmaddadp vs48, vs8, vs30 - - xsmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x1_SUBI1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - - xsmuldp vs48, vs0, vs26 - - xsmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x1_SUB1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - - xsmaddadp vs48, vs0, vs26 - - xsmaddadp vs56, vs0, vs27 - -.endm - -.macro SAVE4x1 - - mr T1, CO - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs32, alpha_r -#else - xsmuldp vs0, vs32, alpha_r -#endif - - stxsdx vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs8, vs40, alpha_r -#else - xsmuldp vs8, vs40, alpha_r -#endif - - stxsdx vs8, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs48, alpha_r -#else - xsmuldp vs0, vs48, alpha_r -#endif - - stxsdx vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs8, vs56, alpha_r -#else - xsmuldp vs8, vs56, alpha_r -#endif - - stxsdx vs8, 0, T1 - - addi CO, CO, 8 - -.endm - -/********************************************************************* -* Macros for N=2, M=16 * 
-*********************************************************************/ - -.macro LOAD2x16_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - -.endm - -.macro KERNEL2x16_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - -.endm - -.macro KERNEL2x16_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - -.endm - -.macro KERNEL2x16_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, 
AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - xvmaddadp vs44, vs12, vs29 - xvmaddadp vs45, vs13, vs29 - xvmaddadp vs46, vs14, vs29 - xvmaddadp vs47, vs15, vs29 - -.endm - -.macro KERNEL2x16_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - xvmaddadp vs44, vs12, vs29 - xvmaddadp vs45, vs13, vs29 - xvmaddadp vs46, vs14, vs29 - xvmaddadp vs47, vs15, vs29 - -.endm - -.macro KERNEL2x16_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - -.endm - -.macro KERNEL2x16_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - 
lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - -.endm - -.macro SAVE2x16 - - mr T1, CO - addi T2, T1, 64 - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 - - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r - xvmaddadp vs4, vs36, alpha_r - xvmaddadp vs5, vs37, alpha_r - xvmaddadp vs6, vs38, alpha_r - xvmaddadp vs7, vs39, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r - xvmuldp vs4, vs36, alpha_r - xvmuldp vs5, vs37, alpha_r - xvmuldp vs6, vs38, alpha_r - xvmuldp vs7, vs39, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 - - add T1, T1, LDC - add T2, T2, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 - lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 - - lxvd2x vs12, 0, T2 - lxvd2x vs13, o16, T2 - lxvd2x vs14, o32, T2 - lxvd2x vs15, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r - xvmaddadp vs10, vs42, alpha_r - 
xvmaddadp vs11, vs43, alpha_r - xvmaddadp vs12, vs44, alpha_r - xvmaddadp vs13, vs45, alpha_r - xvmaddadp vs14, vs46, alpha_r - xvmaddadp vs15, vs47, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r - xvmuldp vs10, vs42, alpha_r - xvmuldp vs11, vs43, alpha_r - xvmuldp vs12, vs44, alpha_r - xvmuldp vs13, vs45, alpha_r - xvmuldp vs14, vs46, alpha_r - xvmuldp vs15, vs47, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 - - stxvd2x vs12, 0, T2 - stxvd2x vs13, o16, T2 - stxvd2x vs14, o32, T2 - stxvd2x vs15, o48, T2 - - addi CO, CO, 128 - -.endm - -/********************************************************************* -* Macros for N=4, M=8 * -*********************************************************************/ - -.macro LOAD2x8_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x8_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - -.endm - -.macro KERNEL2x8_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - -.endm - -.macro KERNEL2x8_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx 
vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - -.endm - -.macro KERNEL2x8_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - -.endm - -.macro KERNEL2x8_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - -.endm - -.macro KERNEL2x8_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - -.endm - -.macro SAVE2x8 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - 
lxvd2x vs9, o16, T1 - lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r - xvmaddadp vs10, vs42, alpha_r - xvmaddadp vs11, vs43, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r - xvmuldp vs10, vs42, alpha_r - xvmuldp vs11, vs43, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 - - addi CO, CO, 64 - -.endm - -/********************************************************************* -* Macros for N=2, M=4 * -*********************************************************************/ - -.macro LOAD2x4_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x4_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - -.endm - -.macro KERNEL2x4_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - -.endm - -.macro KERNEL2x4_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - -.endm - -.macro KERNEL2x4_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - -.endm - -.macro KERNEL2x4_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp 
vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - -.endm - -.macro KERNEL2x4_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - -.endm - -.macro SAVE2x4 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - - addi CO, CO, 32 - -.endm - -/********************************************************************* -* Macros for N=2, M=2 * -*********************************************************************/ - -.macro LOAD2x2_1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x2_I1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x2_1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x2_2 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x2_E2 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x2_SUBI1 - - lxvd2x vs0, 0, AO 
- - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x2_SUB1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - -.endm - -.macro SAVE2x2 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r -#else - xvmuldp vs0, vs32, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r -#else - xvmuldp vs8, vs40, alpha_r -#endif - - stxvd2x vs8, 0, T1 - - addi CO, CO, 16 - -.endm - -/********************************************************************* -* Macros for N=2, M=1 * -*********************************************************************/ - -.macro LOAD2x1_1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x1_I1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x1_1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x1_2 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x1_E2 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x1_SUBI1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x1_SUB1 - - lxsdx vs0, 0, AO - - lxsdx 
vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - -.endm - -.macro SAVE2x1 - - mr T1, CO - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs32, alpha_r -#else - xsmuldp vs0, vs32, alpha_r -#endif - - stxsdx vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs8, vs40, alpha_r -#else - xsmuldp vs8, vs40, alpha_r -#endif - - stxsdx vs8, 0, T1 - - addi CO, CO, 8 - -.endm - -/********************************************************************* -* Macros for N=1, M=16 * -*********************************************************************/ - -.macro LOAD1x16_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - -.endm - -.macro KERNEL1x16_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - -.endm - -.macro KERNEL1x16_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, 
vs6, vs24 - xvmaddadp vs39, vs7, vs24 - -.endm - -.macro KERNEL1x16_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - -.endm - -.macro KERNEL1x16_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - -.endm - -.macro KERNEL1x16_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - -.endm - -.macro KERNEL1x16_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - -.endm - -.macro SAVE1x16 - - mr T1, CO - addi T2, T1, 64 - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - 
lxvd2x vs3, o48, T1 - - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r - xvmaddadp vs4, vs36, alpha_r - xvmaddadp vs5, vs37, alpha_r - xvmaddadp vs6, vs38, alpha_r - xvmaddadp vs7, vs39, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r - xvmuldp vs4, vs36, alpha_r - xvmuldp vs5, vs37, alpha_r - xvmuldp vs6, vs38, alpha_r - xvmuldp vs7, vs39, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 - - addi CO, CO, 128 - -.endm - -/********************************************************************* -* Macros for N=4, M=8 * -*********************************************************************/ - -.macro LOAD1x8_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x8_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - -.endm - -.macro KERNEL1x8_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - -.endm - -.macro KERNEL1x8_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp 
vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - -.endm - -.macro KERNEL1x8_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - -.endm - -.macro KERNEL1x8_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - -.endm - -.macro KERNEL1x8_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - -.endm - -.macro SAVE1x8 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - addi CO, CO, 64 - -.endm - -/********************************************************************* -* Macros for N=1, M=4 * -*********************************************************************/ - -.macro LOAD1x4_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x4_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - -.endm - -.macro KERNEL1x4_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - 
xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - -.endm - -.macro KERNEL1x4_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - -.endm - -.macro KERNEL1x4_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - -.endm - -.macro KERNEL1x4_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - -.endm - -.macro KERNEL1x4_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - -.endm - -.macro SAVE1x4 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - addi CO, CO, 32 - -.endm - -/********************************************************************* -* Macros for N=1, M=2 * -*********************************************************************/ - -.macro LOAD1x2_1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x2_I1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x2_1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x2_2 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x2_E2 - - - xvmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x2_SUBI1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - 
-.endm - -.macro KERNEL1x2_SUB1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - -.endm - -.macro SAVE1x2 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r -#else - xvmuldp vs0, vs32, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - addi CO, CO, 16 - -.endm - -/********************************************************************* -* Macros for N=1, M=1 * -*********************************************************************/ - -.macro LOAD1x1_1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x1_I1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x1_1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmaddadp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x1_2 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x1_E2 - - - xsmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x1_SUBI1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x1_SUB1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmaddadp vs32, vs0, vs24 - -.endm - -.macro SAVE1x1 - - mr T1, CO - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs32, alpha_r -#else - xsmuldp vs0, vs32, alpha_r -#endif - - stxsdx vs0, 0, T1 - - addi CO, CO, 8 - -.endm - - - - -/****************************TRMM POINTER REFRESH MACROSES*************************/ - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 7 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 4 - .elseif 
\SHIFT_VAL==1 - slwi \REG1, \REG2, 3 - .endif -.endm - -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*16; -// ptrbb = bb + off*2; -// #endif -*/ -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+16; // number of values in A -// #else -// temp = off+2; // number of values in B -// #endif -*/ -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif - -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 16; // number of values in A -// #else -// temp -= 2; // number of values in B -// #endif -// ptrba += temp*16; -// ptrbb += temp*2; -// #endif - -// #ifdef LEFT -// off += 16; // number of values in A -// #endif -*/ - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 
8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - - #endif - - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/********************************************************************* +* Macros for N=4, M=16 * +*********************************************************************/ +.macro LOAD4x16_1 + LOAD4x16 1 +.endm + +.macro LOAD4x16_0 + LOAD4x16 0 +.endm +.macro LOAD4x16 Zero + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + lxv vs4, 64(AO) + lxv vs5, 80(AO) + lxv vs6, 96(AO) + lxv vs7, 112(AO) +.if \Zero==1 + xxlxor vs32,vs32,vs32 + xxlxor vs33,vs33,vs33 + xxlxor vs34,vs34,vs34 + xxlxor vs35,vs35,vs35 + xxlxor vs36,vs36,vs36 + xxlxor vs37,vs37,vs37 + xxlxor vs38,vs38,vs38 + xxlxor vs39,vs39,vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, 
vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endif +.endm + + +#define unit_size 8 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +.macro KERNEL4x16_L1_L2 Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,0,0, \Index,\IsLast,0 +.endm + + + +.macro KERNEL4x16_I1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO,1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_I2_L2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG,1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L2_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L2_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_L1_L2_I AREG,BREG, First, OffsetA,OffsetB, Index,IsLast ,Complete + +.if \First ==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 +.endif + lxv vs8, DISP32(\Index,0+\OffsetA)(\AREG) + lxv vs9, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs10, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs11, 
DISP32(\Index,48+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 +.else + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 +.endif + lxv vs28, DISP8(\Index,0 +\OffsetB)(\BREG) + lxv vs30, DISP8(\Index,16 +\OffsetB)(\BREG) + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs31, vs30, vs30,2 +.if \First ==1 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + +.else + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + +.endif + lxv vs12, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs13, DISP32(\Index,80+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + +.else + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 +.endif + lxv vs14, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs15, DISP32(\Index,112+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + + + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + +.else + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + + + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, 
vs6, vs27 + xvmaddadp vs63, vs7, vs27 +.endif + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 +.if \Complete==0 + lxv vs0, DISP32(\Index,128+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,144+\OffsetA)(\AREG) +.endif + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 +.if \Complete==0 + lxv vs24, DISP8(\Index,32 +\OffsetB)(\BREG) + lxv vs26, DISP8(\Index,48 +\OffsetB)(\BREG) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endif + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 +.if \Complete==0 + lxv vs2, DISP32(\Index,160+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,176+\OffsetA)(\AREG) +.endif + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 +.if \Complete==0 + lxv vs4, DISP32(\Index,192+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,208+\OffsetA)(\AREG) +.endif + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 +.if \Complete==0 + lxv vs6, DISP32(\Index,224+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,240+\OffsetA)(\AREG) +.endif + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + + xvmaddadp vs60, vs12, vs31 + + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + + xvmaddadp vs63, vs15, vs31 + .if \IsLast==1 + .if \Complete==1 + addi \AREG, \AREG, DISP32(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,32+\OffsetB) + .else + addi \AREG, \AREG, DISP32(\Index,256) + addi \BREG, \BREG, DISP8(\Index,64) + .endif + .endif + + +.endm + + + +.macro KERNEL4x16 First + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, 
vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + lxv vs4, 64(AO) + lxv vs5, 80(AO) + lxv vs6, 96(AO) + lxv vs7, 112(AO) + + + + addi BO, BO, 32 + addi AO, AO, 128 + +.if \First==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + +.endif +.endm + +.macro 
SAVE4x16_REGS + add C2, CO, LDC + add C3, C2, LDC + add C4, C3, LDC +.endm + +.macro SAVE4x16 +#ifndef TRMMKERNEL + lxv vs0, 0(CO) + lxv vs2, 16(CO) + lxv vs4, 32(CO) + lxv vs6, 48(CO) +#endif + xxpermdi vs8, vs40,vs32,1 + xxpermdi vs9 ,vs32,vs40,1 +#ifndef TRMMKERNEL + lxv vs24, 64(CO) + lxv vs26, 80(CO) + lxv vs28, 96(CO) + lxv vs30, 112(CO) +#endif + xxpermdi vs10, vs41,vs33,1 + xxpermdi vs11 ,vs33,vs41,1 +#ifndef TRMMKERNEL + lxv vs1, 0(C2) + lxv vs3, 16(C2) + lxv vs5, 32(C2) + lxv vs7, 48(C2) +#endif + xxpermdi vs12, vs42,vs34,1 + xxpermdi vs13 ,vs34,vs42,1 +#ifndef TRMMKERNEL + lxv vs25, 64(C2) + lxv vs27, 80(C2) +#endif + xxpermdi vs14, vs43,vs35,1 + xxpermdi vs15 ,vs35,vs43,1 +#ifndef TRMMKERNEL + lxv vs29, 96(C2) + lxv vs31, 112(C2) +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + +#endif + xxpermdi vs8, vs44,vs36,1 + xxpermdi vs9 ,vs36,vs44,1 + xxpermdi vs10, vs45,vs37,1 + xxpermdi vs11 ,vs37,vs45,1 +#ifndef TRMMKERNEL + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r +#endif + xxpermdi vs12, vs46,vs38,1 + xxpermdi vs13 ,vs38,vs46,1 + xxpermdi vs14, vs47,vs39,1 + xxpermdi vs15 ,vs39,vs47,1 + +#ifndef TRMMKERNEL + xvmaddadp vs24, vs8, alpha_r + xvmaddadp vs25, vs9, alpha_r + xvmaddadp vs26, vs10, alpha_r + xvmaddadp vs27, vs11, alpha_r + + xvmaddadp vs28, vs12, alpha_r + xvmaddadp vs29, vs13, alpha_r + xvmaddadp vs30, vs14, alpha_r + xvmaddadp vs31, vs15, alpha_r +#else + xvmuldp vs24, vs8, alpha_r + xvmuldp vs25, vs9, alpha_r + xvmuldp vs26, vs10, alpha_r + xvmuldp vs27, vs11, alpha_r + + xvmuldp vs28, vs12, alpha_r + xvmuldp vs29, vs13, 
alpha_r + xvmuldp vs30, vs14, alpha_r + xvmuldp vs31, vs15, alpha_r + +#endif + stxv vs0, 0(CO) + stxv vs2, 16(CO) + stxv vs4, 32(CO) + stxv vs6, 48(CO) + + stxv vs24, 64(CO) + stxv vs26, 80(CO) + stxv vs28, 96(CO) + stxv vs30, 112(CO) + + stxv vs1, 0(C2) + stxv vs3, 16(C2) + stxv vs5, 32(C2) + stxv vs7, 48(C2) + + stxv vs25, 64(C2) + stxv vs27, 80(C2) + stxv vs29, 96(C2) + stxv vs31, 112(C2) +#ifndef TRMMKERNEL + lxv vs0, 0(C3) + lxv vs2, 16(C3) + lxv vs4, 32(C3) + lxv vs6, 48(C3) +#endif + xxpermdi vs8, vs56,vs48,1 + xxpermdi vs9 ,vs48,vs56,1 +#ifndef TRMMKERNEL + lxv vs24, 64(C3) + lxv vs26, 80(C3) +#endif + xxpermdi vs10, vs57,vs49,1 + xxpermdi vs11 ,vs49,vs57,1 +#ifndef TRMMKERNEL + lxv vs28, 96(C3) + lxv vs30, 112(C3) +#endif + xxpermdi vs12, vs58,vs50,1 + xxpermdi vs13 ,vs50,vs58,1 +#ifndef TRMMKERNEL + lxv vs1, 0(C4) + lxv vs3, 16(C4) +#endif + xxpermdi vs14, vs59,vs51,1 + xxpermdi vs15 ,vs51,vs59,1 +#ifndef TRMMKERNEL + lxv vs5, 32(C4) + lxv vs7, 48(C4) + + lxv vs25, 64(C4) + lxv vs27, 80(C4) + lxv vs29, 96(C4) + lxv vs31, 112(C4) +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + +#endif + + xxpermdi vs8, vs60,vs52,1 + xxpermdi vs9 ,vs52,vs60,1 + xxpermdi vs10, vs61,vs53,1 + xxpermdi vs11 ,vs53,vs61,1 +#ifndef TRMMKERNEL + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r +#endif + + + xxpermdi vs12, vs62,vs54,1 + xxpermdi vs13 ,vs54,vs62,1 + xxpermdi vs14, vs63,vs55,1 + xxpermdi vs15 ,vs55,vs63,1 +#ifndef TRMMKERNEL + xvmaddadp vs24, vs8, alpha_r + xvmaddadp vs25, vs9, alpha_r + xvmaddadp vs26, vs10, alpha_r + xvmaddadp vs27, vs11, alpha_r + + 
xvmaddadp vs28, vs12, alpha_r + xvmaddadp vs29, vs13, alpha_r + xvmaddadp vs30, vs14, alpha_r + xvmaddadp vs31, vs15, alpha_r +#else + xvmuldp vs24, vs8, alpha_r + xvmuldp vs25, vs9, alpha_r + xvmuldp vs26, vs10, alpha_r + xvmuldp vs27, vs11, alpha_r + + xvmuldp vs28, vs12, alpha_r + xvmuldp vs29, vs13, alpha_r + xvmuldp vs30, vs14, alpha_r + xvmuldp vs31, vs15, alpha_r +#endif + stxv vs0, 0(C3) + stxv vs2, 16(C3) + stxv vs4, 32(C3) + stxv vs6, 48(C3) + + stxv vs24, 64(C3) + stxv vs26, 80(C3) + stxv vs28, 96(C3) + stxv vs30, 112(C3) + + stxv vs1, 0(C4) + stxv vs3, 16(C4) + stxv vs5, 32(C4) + stxv vs7, 48(C4) + + stxv vs25, 64(C4) + stxv vs27, 80(C4) + stxv vs29, 96(C4) + stxv vs31, 112(C4) + + addi CO, CO, 128 +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD4x8_1 + LOAD4x8 1 +.endm + +.macro LOAD4x8_0 + LOAD4x8 0 +.endm +.macro LOAD4x8 Zero + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + +.if \Zero==1 + xxlxor vs32,vs32,vs32 + xxlxor vs33,vs33,vs33 + xxlxor vs34,vs34,vs34 + xxlxor vs35,vs35,vs35 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + +.endif +.endm + + + +.macro KERNEL4x8_L1_L2 Index,IsLast + KERNEL4x8_L1_L2_I 0,0,0, \Index,\IsLast,0 +.endm + + + +.macro KERNEL4x8_I1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L2_3 
OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x8_L1_L2_I First, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index,0+\OffsetA)(AO) + lxv vs9, DISP16(\Index,16+\OffsetA)(AO) +.if \First ==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 +.endif + + lxv vs10, DISP16(\Index,32+\OffsetA)(AO) + lxv vs11, DISP16(\Index,48+\OffsetA)(AO) + + + +.if \First ==1 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + +.else + + lxv vs28, DISP8(\Index,0 +\OffsetB)(BO) + lxv vs30, DISP8(\Index,16 +\OffsetB)(BO) + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + +.endif + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs31, vs30, vs30,2 +.if \First ==1 + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + +.else + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + +.endif + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(AO) + lxv vs1, DISP16(\Index,80+\OffsetA)(AO) +.endif + + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.if \Complete==0 + lxv vs2, DISP16(\Index,96+\OffsetA)(AO) + lxv vs3, DISP16(\Index,112+\OffsetA)(AO) +.endif + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + 
xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 +.if \Complete==0 + lxv vs24, DISP8(\Index,32 +\OffsetB)(BO) + lxv vs26, DISP8(\Index,48 +\OffsetB)(BO) +.endif + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endif + + .if \IsLast==1 + .if \Complete==1 + addi AO, AO, DISP16(\Index,64+\OffsetA) + addi BO, BO, DISP8(\Index,32+\OffsetB) + .else + addi AO, AO, DISP16(\Index,128) + addi BO, BO, DISP8(\Index,64) + .endif + .endif + + +.endm + + + +.macro KERNEL4x8 First + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + + + addi BO, BO, 32 + addi AO, AO, 64 + +.if \First==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + +.endif +.endm + + + +.macro SAVE4x8 + add T2, CO, LDC + add T3, T2, LDC + add T4, T3, LDC +#ifndef TRMMKERNEL + lxv vs0, 0(CO) + lxv vs2, 16(CO) +#endif + xxpermdi vs8, vs40,vs32,1 + xxpermdi vs9 ,vs32,vs40,1 +#ifndef TRMMKERNEL + lxv 
vs4, 32(CO) + lxv vs6, 48(CO) +#endif + xxpermdi vs10, vs41,vs33,1 + xxpermdi vs11 ,vs33,vs41,1 +#ifndef TRMMKERNEL + lxv vs1, 0(T2) + lxv vs3, 16(T2) +#endif + xxpermdi vs12, vs42,vs34,1 + xxpermdi vs13 ,vs34,vs42,1 +#ifndef TRMMKERNEL + lxv vs5, 32(T2) + lxv vs7, 48(T2) +#endif + xxpermdi vs14, vs43,vs35,1 + xxpermdi vs15 ,vs35,vs43,1 + + + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r + + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r + +#endif + + + stxv vs0, 0(CO) + stxv vs2, 16(CO) + stxv vs4, 32(CO) + stxv vs6, 48(CO) + + + stxv vs1, 0(T2) + stxv vs3, 16(T2) + stxv vs5, 32(T2) + stxv vs7, 48(T2) + + + xxpermdi vs8, vs56,vs48,1 + xxpermdi vs9 ,vs48,vs56,1 +#ifndef TRMMKERNEL + lxv vs0, 0(T3) + lxv vs2, 16(T3) +#endif + xxpermdi vs10, vs57,vs49,1 + xxpermdi vs11 ,vs49,vs57,1 +#ifndef TRMMKERNEL + lxv vs4, 32(T3) + lxv vs6, 48(T3) +#endif + xxpermdi vs12, vs58,vs50,1 + xxpermdi vs13 ,vs50,vs58,1 +#ifndef TRMMKERNEL + lxv vs1, 0(T4) + lxv vs3, 16(T4) +#endif + xxpermdi vs14, vs59,vs51,1 + xxpermdi vs15 ,vs51,vs59,1 +#ifndef TRMMKERNEL + lxv vs5, 32(T4) + lxv vs7, 48(T4) + + + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r + + + + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + + + + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r 
+ +#endif + + + stxv vs0, 0(T3) + stxv vs2, 16(T3) + stxv vs4, 32(T3) + stxv vs6, 48(T3) + + + stxv vs1, 0(T4) + stxv vs3, 16(T4) + stxv vs5, 32(T4) + stxv vs7, 48(T4) + + + + addi CO, CO, 64 +.endm + + +/********************************************************************* +* Macros for N=4, M=4 * +*********************************************************************/ + +.macro LOAD4x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp 
vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + 
+#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=4, M=2 * +*********************************************************************/ + +.macro LOAD4x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx 
vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r +#else + xvmuldp vs0, vs48, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r +#else + xvmuldp vs8, vs56, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=4, M=1 * +*********************************************************************/ + +.macro LOAD4x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro 
KERNEL4x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs48, alpha_r +#else + xsmuldp vs0, vs48, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs56, alpha_r +#else + xsmuldp vs8, vs56, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=2, M=16 * 
+*********************************************************************/ + +.macro LOAD2x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL2x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, 
AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + 
lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro SAVE2x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + 
xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD2x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx 
vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + 
lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=2, M=4 * +*********************************************************************/ + +.macro LOAD2x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp 
vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=2, M=2 * +*********************************************************************/ + +.macro LOAD2x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, 0, AO 
+ + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=2, M=1 * +*********************************************************************/ + +.macro LOAD2x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx 
vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=1, M=16 * +*********************************************************************/ + +.macro LOAD1x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL1x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, 
vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro SAVE1x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + 
lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD1x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp 
vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=1, M=4 * +*********************************************************************/ + +.macro LOAD1x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + 
xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=1, M=2 * +*********************************************************************/ + +.macro LOAD1x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + 
+.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=1, M=1 * +*********************************************************************/ + +.macro LOAD1x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + addi CO, CO, 8 + +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 4 + .elseif 
\SHIFT_VAL==1 + slwi \REG1, \REG2, 3 + .endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif + +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 
8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/power/icamax.c b/kernel/power/icamax.c index bd74d20e50..58dcdec5af 100644 --- a/kernel/power/icamax.c +++ b/kernel/power/icamax.c @@ -1,328 +1,328 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -#include "common.h" -#include -#include -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) - -#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code - -#if !defined(USE_MASK_PERMUTATIONS) - -static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ - __vector float result; - __asm__ ( - "vmrgew %0,%1,%2;\n" - : "=v" (result) - : "v" (a), - "v" (b) - : ); - return result; -} - -static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){ - __vector float result; - __asm__ ( - "vmrgow %0,%1,%2;\n" - : "=v" (result) - : "v" (a), - "v" (b) - : ); - return result; -} - -#endif - -/** - * Find maximum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { - - BLASLONG index; - BLASLONG i=0; -#if defined(USE_MASK_PERMUTATIONS) - register __vector unsigned int static_index0 = {0,1,2,3}; -#else - register __vector unsigned int static_index0 = {2,0,3,1}; -#endif - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int 
temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0; - register __vector unsigned int static_index2=static_index0 +temp1; - register __vector unsigned int static_index3=static_index1 +temp1; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} - register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} - register __vector float quadruple_values={0,0,0,0}; - - register __vector float * v_ptrx=(__vector float *)x; -#if defined(USE_MASK_PERMUTATIONS) - register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; - register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; -#endif - for(; i31 - - //find final quadruple from 32 elements - r2=vec_cmpgt(vv0,vf0); - ind2 = vec_sel( indf0,indv0,r2); - vv0= vec_sel(vf0,vv0,r2); - //get asbolute index - ind2+=temp0; - //compare with old quadruple and update - r1=vec_cmpgt(vv0,quadruple_values); - quadruple_indices = vec_sel( quadruple_indices,ind2,r1); - quadruple_values= vec_sel(quadruple_values,vv0,r1); - - temp0+=temp_add; - } - - //now we have to chose from 4 values and 4 different indices - // we will compare pairwise if pairs are exactly the same we will choose minimum between index - // otherwise we will assign index of the maximum value - float a1,a2,a3,a4; - unsigned int i1,i2,i3,i4; - a1=vec_extract(quadruple_values,0); - a2=vec_extract(quadruple_values,1); - a3=vec_extract(quadruple_values,2); - a4=vec_extract(quadruple_values,3); - i1=vec_extract(quadruple_indices,0); - i2=vec_extract(quadruple_indices,1); - i3=vec_extract(quadruple_indices,2); - i4=vec_extract(quadruple_indices,3); - if(a1==a2){ - index=i1>i2?i2:i1; - }else if(a2>a1){ - index=i2; - a1=a2; - }else{ - index= i1; - } - - if(a4==a3){ - i1=i3>i4?i4:i3; - }else if(a4>a3){ - i1=i4; - a3=a4; - }else{ - i1= i3; - } - - if(a1==a3){ - 
index=i1>index?index:i1; - *maxf=a1; - }else if(a3>a1){ - index=i1; - *maxf=a3; - }else{ - *maxf=a1; - } - return index; - -} - - - - - - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0; - BLASLONG max = 0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(max); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - max = ciamax_kernel_32(n1, x, &maxf); - i = n1; - ix = n1 << 1; - } - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (max + 1); - - } else { - - inc_x2 = 2 * inc_x; - - maxf = CABS1(x,0); - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (max + 1); - } - -} - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code + +#if !defined(USE_MASK_PERMUTATIONS) + +static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ + __vector float result; + __asm__ ( + "vmrgew %0,%1,%2;\n" + : "=v" (result) + : "v" (a), + "v" (b) + : ); + return result; +} + +static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){ + __vector float result; + __asm__ ( + "vmrgow %0,%1,%2;\n" + : "=v" (result) + : "v" (a), + "v" (b) + : ); + return result; +} + +#endif + +/** + * Find maximum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { + + BLASLONG index; + BLASLONG i=0; +#if defined(USE_MASK_PERMUTATIONS) + register __vector unsigned int static_index0 = {0,1,2,3}; +#else + register __vector unsigned int static_index0 = {2,0,3,1}; +#endif + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int 
temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0; + register __vector unsigned int static_index2=static_index0 +temp1; + register __vector unsigned int static_index3=static_index1 +temp1; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + register __vector float quadruple_values={0,0,0,0}; + + register __vector float * v_ptrx=(__vector float *)x; +#if defined(USE_MASK_PERMUTATIONS) + register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; + register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; +#endif + for(; i31 + + //find final quadruple from 32 elements + r2=vec_cmpgt(vv0,vf0); + ind2 = vec_sel( indf0,indv0,r2); + vv0= vec_sel(vf0,vv0,r2); + //get asbolute index + ind2+=temp0; + //compare with old quadruple and update + r1=vec_cmpgt(vv0,quadruple_values); + quadruple_indices = vec_sel( quadruple_indices,ind2,r1); + quadruple_values= vec_sel(quadruple_values,vv0,r1); + + temp0+=temp_add; + } + + //now we have to chose from 4 values and 4 different indices + // we will compare pairwise if pairs are exactly the same we will choose minimum between index + // otherwise we will assign index of the maximum value + float a1,a2,a3,a4; + unsigned int i1,i2,i3,i4; + a1=vec_extract(quadruple_values,0); + a2=vec_extract(quadruple_values,1); + a3=vec_extract(quadruple_values,2); + a4=vec_extract(quadruple_values,3); + i1=vec_extract(quadruple_indices,0); + i2=vec_extract(quadruple_indices,1); + i3=vec_extract(quadruple_indices,2); + i4=vec_extract(quadruple_indices,3); + if(a1==a2){ + index=i1>i2?i2:i1; + }else if(a2>a1){ + index=i2; + a1=a2; + }else{ + index= i1; + } + + if(a4==a3){ + i1=i3>i4?i4:i3; + }else if(a4>a3){ + i1=i4; + a3=a4; + }else{ + i1= i3; + } + + if(a1==a3){ + 
index=i1>index?index:i1; + *maxf=a1; + }else if(a3>a1){ + index=i1; + *maxf=a3; + }else{ + *maxf=a1; + } + return index; + +} + + + + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = ciamax_kernel_32(n1, x, &maxf); + i = n1; + ix = n1 << 1; + } + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (max + 1); + + } else { + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (max + 1); + } + +} + + diff --git a/kernel/power/icamin.c b/kernel/power/icamin.c index 336766245f..843370c6c7 100644 --- a/kernel/power/icamin.c +++ b/kernel/power/icamin.c @@ -1,266 +1,266 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -#include "common.h" -#include -#include -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) - - - - -/** - * Find minimum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return index - */ -static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { - - BLASLONG index; - BLASLONG i=0; - register __vector unsigned int static_index0 = {0,1,2,3}; - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; - register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; - register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} - register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} - float 
first_min=CABS1(x,0); - register __vector float quadruple_values={first_min,first_min,first_min,first_min}; - - register __vector float * v_ptrx=(__vector float *)x; - register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; - register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; - for(; i31 - - //find final quadruple from 32 elements - r2=vec_cmpgt(vf0,vv0); - ind2 = vec_sel( indf0,indv0,r2); - vv0= vec_sel(vf0,vv0,r2); - //get asbolute index - ind2+=temp0; - //compare with old quadruple and update - r1=vec_cmpgt(quadruple_values,vv0); - quadruple_indices = vec_sel( quadruple_indices,ind2,r1); - quadruple_values= vec_sel(quadruple_values,vv0,r1); - - temp0+=temp_add; - } - - //now we have to chose from 4 values and 4 different indices - // we will compare pairwise if pairs are exactly the same we will choose minimum between index - // otherwise we will assign index of the minimum value - float a1,a2,a3,a4; - unsigned int i1,i2,i3,i4; - a1=vec_extract(quadruple_values,0); - a2=vec_extract(quadruple_values,1); - a3=vec_extract(quadruple_values,2); - a4=vec_extract(quadruple_values,3); - i1=vec_extract(quadruple_indices,0); - i2=vec_extract(quadruple_indices,1); - i3=vec_extract(quadruple_indices,2); - i4=vec_extract(quadruple_indices,3); - if(a1==a2){ - index=i1>i2?i2:i1; - }else if(a2i4?i4:i3; - }else if(a4index?index:i1; - *minf=a1; - }else if(a3 0) { - - min = ciamin_kernel_32(n1, x, &minf); - i = n1; - ix = n1 << 1; - } - - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (min + 1); - - } else { - - inc_x2 = 2 * inc_x; - - minf = CABS1(x,0); - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (min + 1); - } - -} - - +/*************************************************************************** +Copyright 
(c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + + + + +/** + * Find minimum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return index + */ +static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { + + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + float first_min=CABS1(x,0); + register __vector float quadruple_values={first_min,first_min,first_min,first_min}; + + register __vector float * v_ptrx=(__vector float *)x; + register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; + register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; + for(; i31 + + //find final quadruple from 32 elements + r2=vec_cmpgt(vf0,vv0); + ind2 = vec_sel( indf0,indv0,r2); + vv0= vec_sel(vf0,vv0,r2); + //get asbolute index + ind2+=temp0; + //compare with old quadruple and update + r1=vec_cmpgt(quadruple_values,vv0); + quadruple_indices = vec_sel( quadruple_indices,ind2,r1); + quadruple_values= vec_sel(quadruple_values,vv0,r1); + + 
temp0+=temp_add; + } + + //now we have to chose from 4 values and 4 different indices + // we will compare pairwise if pairs are exactly the same we will choose minimum between index + // otherwise we will assign index of the minimum value + float a1,a2,a3,a4; + unsigned int i1,i2,i3,i4; + a1=vec_extract(quadruple_values,0); + a2=vec_extract(quadruple_values,1); + a3=vec_extract(quadruple_values,2); + a4=vec_extract(quadruple_values,3); + i1=vec_extract(quadruple_indices,0); + i2=vec_extract(quadruple_indices,1); + i3=vec_extract(quadruple_indices,2); + i4=vec_extract(quadruple_indices,3); + if(a1==a2){ + index=i1>i2?i2:i1; + }else if(a2i4?i4:i3; + }else if(a4index?index:i1; + *minf=a1; + }else if(a3 0) { + + min = ciamin_kernel_32(n1, x, &minf); + i = n1; + ix = n1 << 1; + } + + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (min + 1); + + } else { + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (min + 1); + } + +} + + diff --git a/kernel/power/isamax.c b/kernel/power/isamax.c index bf1af78d6d..fb2dafec0f 100644 --- a/kernel/power/isamax.c +++ b/kernel/power/isamax.c @@ -1,288 +1,288 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. 
Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ -#include "common.h" -#include -#include - - -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif - -/** - * Find maximum index - * Warning: requirements n>0 and n % 64 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { - BLASLONG index; - BLASLONG i=0; - register __vector unsigned int static_index0 = {0,1,2,3}; - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; - register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; - register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - 
register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} - register __vector float quadruple_values={0,0,0,0}; - register __vector float * v_ptrx=(__vector float *)x; - for(; ii2?i2:i1; - }else if(a2>a1){ - index=i2; - a1=a2; - }else{ - index= i1; - } - - if(a4==a3){ - i1=i3>i4?i4:i3; - }else if(a4>a3){ - i1=i4; - a3=a4; - }else{ - i1= i3; - } - - if(a1==a3){ - index=i1>index?index:i1; - *maxf=a1; - }else if(a3>a1){ - index=i1; - *maxf=a3; - }else{ - *maxf=a1; - } - return index; - -} - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - - if (n <= 0 || inc_x <= 0) return (max); - - if (inc_x == 1) { - - BLASLONG n1 = n & -64; - if (n1 > 0) { - - max = siamax_kernel_64(n1, x, &maxf); - - i = n1; - } - - while (i < n) { - if (ABS(x[i]) > maxf) { - max = i; - maxf = ABS(x[i]); - } - i++; - } - return (max + 1); - - } else { - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - max = j + 1; - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - max = j + 2; - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - max = j + 3; - maxf = ABS(x[i + 3 * inc_x]); - } - - i += inc_x * 4; - - j += 4; - - } - - - while (j < n) { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (max + 1); - } -} +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#include "common.h" +#include +#include + + +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif + +/** + * Find maximum index + * Warning: requirements n>0 and n % 64 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + register __vector float quadruple_values={0,0,0,0}; + register __vector float * v_ptrx=(__vector float *)x; + for(; ii2?i2:i1; + }else if(a2>a1){ + index=i2; + a1=a2; + }else{ + index= i1; + } + + if(a4==a3){ + i1=i3>i4?i4:i3; + }else if(a4>a3){ + i1=i4; + a3=a4; + }else{ + i1= i3; + } + + if(a1==a3){ + index=i1>index?index:i1; + *maxf=a1; + }else if(a3>a1){ + index=i1; + *maxf=a3; + }else{ + *maxf=a1; + } + return index; + +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = siamax_kernel_64(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while 
(j < n1) { + + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/power/isamin.c b/kernel/power/isamin.c index 1c1f0ad788..60c843f580 100644 --- a/kernel/power/isamin.c +++ b/kernel/power/isamin.c @@ -1,288 +1,288 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ -#include "common.h" -#include -#include -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif -/** - * Find minimum index - * Warning: requirements n>0 and n % 64 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return index - */ -static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { - BLASLONG index; - BLASLONG i=0; - register __vector unsigned int static_index0 = {0,1,2,3}; - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; - register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; - register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; - register __vector float * v_ptrx=(__vector float *)x; - register __vector float quadruple_values=vec_abs(v_ptrx[0]); - for(; ii2?i2:i1; - }else if(a2i4?i4:i3; - }else if(a4index?index:i1; - *minf=a1; - }else if(a3 0) { - - min = siamin_kernel_64(n1, x, &minf); - i = n1; - } - - while (i < n) { - if (ABS(x[i]) < minf) { - min = i; - minf = ABS(x[i]); - } - i++; - } 
- return (min + 1); - - } else { - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - min = j + 1; - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - min = j + 2; - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - min = j + 3; - minf = ABS(x[i + 3 * inc_x]); - } - - i += inc_x * 4; - - j += 4; - - } - - - while (j < n) { - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (min + 1); - } -} +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +/** + * Find minimum index + * Warning: requirements n>0 and n % 64 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return index + */ +static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; + register __vector float * v_ptrx=(__vector float *)x; + register __vector float quadruple_values=vec_abs(v_ptrx[0]); + for(; ii2?i2:i1; + }else if(a2i4?i4:i3; + }else if(a4index?index:i1; + *minf=a1; + }else if(a3 0) { + + min = siamin_kernel_64(n1, x, &minf); + i = n1; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } 
+ return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S index 7a0f3143e6..5cdc83d870 100644 --- a/kernel/power/sgemm_kernel_power9.S +++ b/kernel/power/sgemm_kernel_power9.S @@ -1,272 +1,272 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - - -#define LOAD ld -#define STACKSIZE (512 ) -#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ -#define M r3 -#define N r4 -#define K r5 - - -#define A r7 -#define B r8 -#define C r9 -#define LDC r10 -#define OFFSET r6 - - - -#define alpha_r vs20 -#define save_permute_1 vs21 -#define save_permute_2 vs22 -#define permute_mask vs23 -#define o0 0 - - -#define T1 r11 -#define T2 r12 -#define T3 r14 -#define T4 r15 -#define T5 r16 -#define T6 r17 -#define L r18 -#define T7 r19 -#define T8 r20 -#define TEMP_REG r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define T9 r27 -#define T10 r28 -#define T11 r29 - -#define T12 r30 -#define T13 r31 - -#include "sgemm_macros_power9.S" - -.equ perm_const1, 0x0405060700010203 -.equ perm_const2, 0x0c0d0e0f08090a0b -.equ save_permute_11, 0x1415161718191a1b -.equ save_permute_12, 0x0405060708090a0b -.equ save_permute_21, 0x101112131c1d1e1f -.equ save_permute_22, 0x000102030c0d0e0f - - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - addi SP, SP, -STACKSIZE - mflr r0 - - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 
112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - std r0, FLINK_SAVE(SP) - - -#if defined(TRMMKERNEL) - ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) -#endif - slwi LDC, LDC, 2 - - - - /*alpha is stored in f1. convert to single and splat*/ - xscvdpspn alpha_r,vs1 - xxspltw alpha_r,alpha_r,0 - -/*load reverse permute mask for big endian - uint128 = 0xc0d0e0f08090a0b0405060700010203 -*/ - - lis T2, perm_const2@highest - lis T1, perm_const1@highest - lis T3, save_permute_12@highest - lis T4, save_permute_11@highest - lis T5, save_permute_22@highest - lis T6, save_permute_21@highest - ori T2, T2, perm_const2@higher - ori T1, T1, perm_const1@higher - ori T3, T3, save_permute_12@higher - ori T4, T4, save_permute_11@higher - ori T5, T5, save_permute_22@higher - ori T6, T6, save_permute_21@higher - rldicr T2, T2, 32, 31 - rldicr T1, T1, 32, 31 - rldicr T3, T3, 32, 31 - rldicr T4, T4, 32, 31 - rldicr T5, T5, 32, 31 - rldicr T6, T6, 32, 31 - oris T2, T2, perm_const2@h - oris T1, T1, perm_const1@h - oris T3, T3, save_permute_12@h - oris T4, T4, save_permute_11@h - oris T5, T5, save_permute_22@h - oris T6, T6, save_permute_21@h - ori T2, T2, perm_const2@l - ori T1, T1, perm_const1@l - ori T3, T3, save_permute_12@l - ori T4, T4, save_permute_11@l - ori T5, T5, save_permute_22@l - ori T6, T6, save_permute_21@l - li r0,0 - mtvsrdd permute_mask,T2,T1 - mtvsrdd 
save_permute_1,T3,T4 - mtvsrdd save_permute_2,T5,T6 - -#include "sgemm_logic_power9.S" - -.L999: - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - ld r0, FLINK_SAVE(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - mtlr r0 - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - - EPILOGUE -#endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ +#define M r3 +#define N r4 +#define K r5 + + +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 + + + +#define alpha_r vs20 +#define save_permute_1 vs21 +#define save_permute_2 vs22 +#define permute_mask vs23 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define T11 r29 + +#define T12 r30 +#define T13 r31 + +#include "sgemm_macros_power9.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_11, 0x1415161718191a1b +.equ save_permute_12, 0x0405060708090a0b +.equ save_permute_21, 0x101112131c1d1e1f +.equ save_permute_22, 0x000102030c0d0e0f + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + mflr r0 + + + stfd f14, 0(SP) + stfd f15, 8(SP) + 
stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) + + +#if defined(TRMMKERNEL) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + slwi LDC, LDC, 2 + + + + /*alpha is stored in f1. 
convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xxspltw alpha_r,alpha_r,0 + +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + lis T5, save_permute_22@highest + lis T6, save_permute_21@highest + ori T2, T2, perm_const2@higher + ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + ori T5, T5, save_permute_22@higher + ori T6, T6, save_permute_21@higher + rldicr T2, T2, 32, 31 + rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + rldicr T5, T5, 32, 31 + rldicr T6, T6, 32, 31 + oris T2, T2, perm_const2@h + oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + oris T5, T5, save_permute_22@h + oris T6, T6, save_permute_21@h + ori T2, T2, perm_const2@l + ori T1, T1, perm_const1@l + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + ori T5, T5, save_permute_22@l + ori T6, T6, save_permute_21@l + li r0,0 + mtvsrdd permute_mask,T2,T1 + mtvsrdd save_permute_1,T3,T4 + mtvsrdd save_permute_2,T5,T6 + +#include "sgemm_logic_power9.S" + +.L999: + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 
320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + + EPILOGUE +#endif diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S index a34ed32b8e..4022959e2b 100644 --- a/kernel/power/sgemm_logic_power9.S +++ b/kernel/power/sgemm_logic_power9.S @@ -1,2192 +1,2192 @@ -#define MY_ALIGN .align 3 -b L8 - - MY_ALIGN -LSGEMM_L8x16_LMAIN_SUB: - LOAD8x16_2 - MY_ALIGN - -LSGEMM_L8x16_LOOP: - KERNEL8x16_L2 128,64,0,0 -LSGEMM_L8x16_K128: - KERNEL8x16_L2 128,64,1,0 - KERNEL8x16_I1_L4_2 128,64, 1,0 - KERNEL8x16_I1_L4_2 128,64, 2,0 - KERNEL8x16_I1_L4_2 128,64, 3,0 - KERNEL8x16_I1_L4_2 128,64, 4,0 - KERNEL8x16_I1_L4_2 128,64, 5,0 - KERNEL8x16_I1_L4_2 128,64, 6,0 - KERNEL8x16_I1_L4_2 128,64, 7,0 - KERNEL8x16_I1_L4_2 128,64, 8,0 - KERNEL8x16_I1_L4_2 128,64, 9,0 - KERNEL8x16_I1_L4_2 128,64, 10,0 - KERNEL8x16_I1_L4_2 128,64, 11,0 - KERNEL8x16_I1_L4_2 128,64, 12,0 - KERNEL8x16_I1_L4_2 128,64, 13,0 - KERNEL8x16_I1_L4_2 128,64, 14,0 - KERNEL8x16_I1_L4_2 128,64, 15,0 - KERNEL8x16_I1_L4_2 128,64, 16,0 - KERNEL8x16_I1_L4_2 128,64, 17,0 - KERNEL8x16_I1_L4_2 128,64, 18,0 - KERNEL8x16_I1_L4_2 128,64, 19,0 - KERNEL8x16_I1_L4_2 128,64, 20,0 - KERNEL8x16_I1_L4_2 128,64, 21,0 - KERNEL8x16_I1_L4_2 128,64, 22,0 - KERNEL8x16_I1_L4_2 128,64, 23,0 - KERNEL8x16_I1_L4_2 128,64, 24,0 - KERNEL8x16_I1_L4_2 128,64, 25,0 - KERNEL8x16_I1_L4_2 128,64, 26,0 - KERNEL8x16_I1_L4_2 128,64, 27,0 - KERNEL8x16_I1_L4_2 128,64, 28,0 - KERNEL8x16_I1_L4_2 128,64, 29,0 - KERNEL8x16_I1_L4_2 128,64, 30,0 - KERNEL8x16_I1_L4_2 128,64, 31,1 - bdnz LSGEMM_L8x16_LOOP - - MY_ALIGN -LSGEMM_L8x16_LOOP_END: - END8x16_2 - blr - - MY_ALIGN -LSGEMM_L8x16_L64_SUB: - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64, 0,0 - KERNEL8x16_I1_L4_2 128,64, 1,0 - KERNEL8x16_I1_L4_2 128,64, 2,0 - KERNEL8x16_I1_L4_2 128,64,3,0 - KERNEL8x16_I1_L4_2 
128,64,4,0 - KERNEL8x16_I1_L4_2 128,64,5,0 - KERNEL8x16_I1_L4_2 128,64,6,0 - KERNEL8x16_I1_L4_2 128,64,7,0 - KERNEL8x16_I1_L4_2 128,64,8,0 - KERNEL8x16_I1_L4_2 128,64,9,0 - KERNEL8x16_I1_L4_2 128,64,10,0 - KERNEL8x16_I1_L4_2 128,64,11,0 - KERNEL8x16_I1_L4_2 128,64,12,0 - KERNEL8x16_I1_L4_2 128,64,13,0 - KERNEL8x16_I1_L4_2 128,64,14,0 - KERNEL8x16_I1_L4_3 128,64,15,1 - blr -LSGEMM_L8x16_L32_SUB: - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64,0,0 - KERNEL8x16_I1_L4_2 128,64,1,0 - KERNEL8x16_I1_L4_2 128,64,2,0 - KERNEL8x16_I1_L4_2 128,64,3,0 - KERNEL8x16_I1_L4_2 128,64,4,0 - KERNEL8x16_I1_L4_2 128,64,5,0 - KERNEL8x16_I1_L4_2 128,64,6,0 - KERNEL8x16_I1_L4_3 128,64,7,1 - blr - -LSGEMM_L8x16_L16_SUB: - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64,0,0 - KERNEL8x16_I1_L4_2 128,64,1,0 - KERNEL8x16_I1_L4_2 128,64,2,0 - KERNEL8x16_I1_L4_3 128,64,3,1 - blr - -L8: -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - - srawi. J, N, 3 - - ble LSGEMM_L8_END - -LSGEMM_L8_BEGIN: - - li T1, 128 - li T2, 256 - - mr AO, A - mr CO, C - slwi T3, LDC , 3 - add C, C, T3 - - dcbt A, T1 - dcbt A, T2 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_L8x16_END - - MY_ALIGN -LSGEMM_L8x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 - mr T12, T11 - addi T12,T12, -2 - srawi. L, T12, 7 /**(T11-2) % 128x */ -#else - mr T12, K - addi T12,T12, -2 - srawi. L, T12, 7 /**(K-2) % 128x */ -#endif - - ZERO8x16 - ble LSGEMM_L8x16_SUB0 - mtctr L - bl LSGEMM_L8x16_LMAIN_SUB - andi. L, T12, 127 - ble LSGEMM_L8x16_SAVE - b LSGEMM_L8x16_SUB2 - MY_ALIGN -LSGEMM_L8x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 255 - cmpwi T11,129 -#else - andi. 
L, K, 255 - cmpwi K,129 -#endif - li T10,1 - bne CMP8x16_128K - addi BO,BO,-32 - addi AO,AO,-64 - LOAD8x16 64,32 - END8x16_WITHOUT_ADD - LOAD8x16_2O AO,BO, 128, 64 - mtctr T10 - bl LSGEMM_L8x16_K128 - b LSGEMM_L8x16_SAVE -CMP8x16_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T11,128 -#else - cmpwi K,128 -#endif - bne LSGEMM_L8x16_SUB2 - MY_ALIGN - mtctr T10 - addi BO,BO,-64 - addi AO,AO,-128 - LOAD8x16_2O AO,BO, 128,64 - bl LSGEMM_L8x16_K128 - b LSGEMM_L8x16_SAVE - MY_ALIGN -LSGEMM_L8x16_SUB2: - andi. T10,L,64 - ble LSGEMM_L8x16_SUB2_32 - bl LSGEMM_L8x16_L64_SUB - MY_ALIGN -LSGEMM_L8x16_SUB2_32: - andi. T10,L, 32 - ble LSGEMM_L8x16_SUB2_16 - bl LSGEMM_L8x16_L32_SUB - MY_ALIGN -LSGEMM_L8x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L8x16_SUB2_8 - bl LSGEMM_L8x16_L16_SUB - MY_ALIGN -LSGEMM_L8x16_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L8x16_SUB2_4 - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64, 0,0 - KERNEL8x16_I1_L4_3 128,64, 1,1 - MY_ALIGN -LSGEMM_L8x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L8x16_SUB2_2 - LOAD8x16_2 - KERNEL8x16_I1_L4_3 128,64, 0,1 - MY_ALIGN -LSGEMM_L8x16_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L8x16_SUB2_1 - LOAD8x16_2 - KERNEL8x16_E2 128,64, 0,1 - MY_ALIGN -LSGEMM_L8x16_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L8x16_SAVE - KERNEL8x16 0 - - - MY_ALIGN -LSGEMM_L8x16_SAVE: - SAVE8x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8 -#endif - addic. I, I, -1 - bgt+ LSGEMM_L8x16_BEGIN - MY_ALIGN -LSGEMM_L8x16_END: -LSGEMM_L8x8_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L8x1_END - - andi. T1, M, 8 - ble LSGEMM_L8x8_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,8 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. 
L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO8x8 - ble LSGEMM_L8x8_SUB0 - - MY_ALIGN -LSGEMM_L8x8_LOOP_START: - - LOAD8x8_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L8x8_LOOP: - - KERNEL8x8_I1_L4_2 32,32, 0,0 - KERNEL8x8_I1_L4_2 32,32, 1,0 - KERNEL8x8_I1_L4_2 32,32, 2,0 - KERNEL8x8_I1_L4_2 32,32, 3,1 - - bdnz LSGEMM_L8x8_LOOP - - MY_ALIGN -LSGEMM_L8x8_LOOP_END: - - END8x8 0, AO, BO, 32, 32 - - b LSGEMM_L8x8_SUB1 - MY_ALIGN -LSGEMM_L8x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. L, K, 31 -#endif - b LSGEMM_L8x8_SUB2 - MY_ALIGN -LSGEMM_L8x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L8x8_SAVE - MY_ALIGN -LSGEMM_L8x8_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L8x8_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L8x8_SUB2_LOOP: - LOAD8x8_0 - KERNEL8x8_I1_L4_2 32,32, 0,0 - KERNEL8x8_I1_L4_3 32,32, 1,1 - bdnz LSGEMM_L8x8_SUB2_LOOP - MY_ALIGN -LSGEMM_L8x8_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L8x8_SUB2_2 - LOAD8x8_0 - KERNEL8x8_I1_L4_3 32,32, 0,1 - MY_ALIGN -LSGEMM_L8x8_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L8x8_SUB2_1 - LOAD8x8_0 - KERNEL8x8_I1_L2_3 32,32, 0,1 - MY_ALIGN -LSGEMM_L8x8_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x8_SAVE - KERNEL8x8 0 - - - MY_ALIGN -LSGEMM_L8x8_SAVE: - SAVE8x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8 -#endif - MY_ALIGN -LSGEMM_L8x8_END: -LSGEMM_L8x4_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L8x1_END - - andi. T1, M, 4 - ble LSGEMM_L8x4_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,8 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. 
L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO8x4 - ble LSGEMM_L8x4_SUB0 - - MY_ALIGN -LSGEMM_L8x4_LOOP_START: - - LOAD8x4_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L8x4_LOOP: - - KERNEL8x4_I1_L4_2 16,32, 0,0 - KERNEL8x4_I1_L4_2 16,32, 1,0 - KERNEL8x4_I1_L4_2 16,32, 2,0 - KERNEL8x4_I1_L4_2 16,32, 3,1 - - bdnz LSGEMM_L8x4_LOOP - - MY_ALIGN -LSGEMM_L8x4_LOOP_END: - - END8x4 0, AO, BO, 16, 32 - - b LSGEMM_L8x4_SUB1 - MY_ALIGN -LSGEMM_L8x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. L, K, 31 -#endif - b LSGEMM_L8x4_SUB2 - MY_ALIGN -LSGEMM_L8x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L8x4_SAVE - MY_ALIGN -LSGEMM_L8x4_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L8x4_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L8x4_SUB2_LOOP: - LOAD8x4_0 - KERNEL8x4_I1_L4_2 16,32, 0,0 - KERNEL8x4_I1_L4_3 16,32, 1,1 - bdnz LSGEMM_L8x4_SUB2_LOOP - MY_ALIGN -LSGEMM_L8x4_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L8x4_SUB2_2 - LOAD8x4_0 - KERNEL8x4_I1_L4_3 16,32, 0,1 - MY_ALIGN -LSGEMM_L8x4_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L8x4_SUB2_1 - LOAD8x4_0 - KERNEL8x4_I1_L2_3 16,32, 0,1 - MY_ALIGN -LSGEMM_L8x4_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x4_SAVE - KERNEL8x4 0 - - - MY_ALIGN -LSGEMM_L8x4_SAVE: - SAVE8x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8 -#endif - MY_ALIGN -LSGEMM_L8x4_END: -LSGEMM_L8x2_BEGIN: - andi. T1, M, 2 - ble LSGEMM_L8x2_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,8 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. 
L, K, 3 /**(K) % 8x */ -#endif - - ZERO8x2 - ble LSGEMM_L8x2_SUB0 - - MY_ALIGN -LSGEMM_L8x2_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L8x2_LOOP: - - KERNEL8x2_2 0,0, 0,0 - KERNEL8x2_2 0,0, 1,0 - KERNEL8x2_2 0,0, 2,0 - KERNEL8x2_2 0,0, 3,1 - - bdnz LSGEMM_L8x2_LOOP - - MY_ALIGN -LSGEMM_L8x2_LOOP_END: - -LSGEMM_L8x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L8x2_SAVE - MY_ALIGN -LSGEMM_L8x2_SUB2: - andi. T1,L, 4 - ble LSGEMM_L8x2_SUB2_2 - KERNEL8x2_2 0,0, 0,0 - KERNEL8x2_2 0,0, 1,1 - MY_ALIGN -LSGEMM_L8x2_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L8x2_SUB2_1 - KERNEL8x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L8x2_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x2_SAVE - KERNEL8x2 - - MY_ALIGN -LSGEMM_L8x2_SAVE: - SAVE8x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8 -#endif - MY_ALIGN -LSGEMM_L8x2_END: -LSGEMM_L8x1_BEGIN: - andi. T1, M, 1 - ble LSGEMM_L8x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,8 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO8x1 - ble LSGEMM_L8x1_SUB0 - - MY_ALIGN -LSGEMM_L8x1_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L8x1_LOOP: - - KERNEL8x1_4 0,0, 0,0 - KERNEL8x1_4 0,0, 1,1 - - bdnz LSGEMM_L8x1_LOOP - - MY_ALIGN -LSGEMM_L8x1_LOOP_END: - -LSGEMM_L8x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L8x1_SAVE - MY_ALIGN -LSGEMM_L8x1_SUB2: - andi. T1,L, 4 - ble LSGEMM_L8x1_SUB2_2 - KERNEL8x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L8x1_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L8x1_SUB2_1 - KERNEL8x1_2 - MY_ALIGN -LSGEMM_L8x1_SUB2_1: - andi. 
T1,L, 1 - ble LSGEMM_L8x1_SAVE - KERNEL8x1 - - MY_ALIGN -LSGEMM_L8x1_SAVE: - SAVE8x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8 -#endif - MY_ALIGN -LSGEMM_L8x1_END: - - slwi T1, K, 5 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 8 -#endif - addic. J, J, -1 - bgt LSGEMM_L8_BEGIN - - -LSGEMM_L8_END: - -/* b LSGEMM_L4_BEGIN*/ - andi. T1, N, 4 - ble LSGEMM_L4_END -LSGEMM_L4_BEGIN: - - - mr AO, A - mr CO, C - slwi T3, LDC , 2 - add C, C, T3 - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_L4x16_END - - MY_ALIGN -LSGEMM_L4x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,4 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 6 /**(T11-1) % 64x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. L, T12, 6 /**(K-1) % 64x */ -#endif - - ZERO4x16 - ble LSGEMM_L4x16_SUB0 - - MY_ALIGN -LSGEMM_L4x16_LOOP_START: - - LOAD4x16_0 /*we already zeroed */ - ##OffsetA=64 OffsetB=16 - addi AO,AO,2112 - addi BO,BO,16 - - mtctr L - - MY_ALIGN - -LSGEMM_L4x16_LOOP: - - KERNEL4x16_I1_L4_2 -2048,0, 0,0 - KERNEL4x16_I1_L4_2 -2048,0, 1,0 - KERNEL4x16_I1_L4_2 -2048,0, 2,0 - KERNEL4x16_I1_L4_2 -2048,0, 3,0 - KERNEL4x16_I1_L4_2 -2048,0, 4,0 - KERNEL4x16_I1_L4_2 -2048,0, 5,0 - KERNEL4x16_I1_L4_2 -2048,0, 6,0 - KERNEL4x16_I1_L4_2 -2048,0, 7,0 - KERNEL4x16_I1_L4_2 -2048,0, 8,0 - KERNEL4x16_I1_L4_2 -2048,0, 9,0 - KERNEL4x16_I1_L4_2 -2048,0, 10,0 - KERNEL4x16_I1_L4_2 -2048,0, 11,0 - KERNEL4x16_I1_L4_2 -2048,0, 12,0 - KERNEL4x16_I1_L4_2 -2048,0, 13,0 - KERNEL4x16_I1_L4_2 -2048,0, 14,0 - KERNEL4x16_I1_L4_2 -2048,0, 15,1 - - bdnz LSGEMM_L4x16_LOOP - - MY_ALIGN -LSGEMM_L4x16_LOOP_END: - - END4x16 0, AO, BO, -2048, 0 - - b LSGEMM_L4x16_SUB1 - MY_ALIGN -LSGEMM_L4x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 127 -#else - andi. 
L, K, 127 -#endif - b LSGEMM_L4x16_SUB2 - MY_ALIGN -LSGEMM_L4x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 63 -#else - andi. L, T12, 63 -#endif - ble LSGEMM_L4x16_SAVE - MY_ALIGN -LSGEMM_L4x16_SUB2: - - srawi. T10,L, 5 - ble LSGEMM_L4x16_SUB2_16 - mtctr T10 - MY_ALIGN -LSGEMM_L4x16_SUB2_LOOP: - LOAD4x16_0 - KERNEL4x16_I1_L4_2 64,16, 0,0 - KERNEL4x16_I1_L4_2 64,16, 1,0 - KERNEL4x16_I1_L4_2 64,16, 2,0 - KERNEL4x16_I1_L4_2 64,16, 3,0 - KERNEL4x16_I1_L4_2 64,16, 4,0 - KERNEL4x16_I1_L4_2 64,16, 5,0 - KERNEL4x16_I1_L4_2 64,16, 6,0 - KERNEL4x16_I1_L4_3 64,16, 7,1 - bdnz LSGEMM_L4x16_SUB2_LOOP - MY_ALIGN -LSGEMM_L4x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L4x16_SUB2_8 - LOAD4x16_0 - KERNEL4x16_I1_L4_2 64,16, 0,0 - KERNEL4x16_I1_L4_2 64,16, 1,0 - KERNEL4x16_I1_L4_2 64,16, 2,0 - KERNEL4x16_I1_L4_3 64,16, 3,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L4x16_SUB2_4 - LOAD4x16_0 - KERNEL4x16_I1_L4_2 64,16, 0,0 - KERNEL4x16_I1_L4_3 64,16, 1,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L4x16_SUB2_2 - LOAD4x16_0 - KERNEL4x16_I1_L4_3 64,16, 0,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L4x16_SUB2_1 - LOAD4x16_0 - KERNEL4x16_I1_L2_3 64,16, 0,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L4x16_SAVE - KERNEL4x16 0 -# addic. L, L, -1 -# bgt LSGEMM_L4x16_SUB2 - - MY_ALIGN -LSGEMM_L4x16_SAVE: - SAVE4x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4 -#endif - addic. I, I, -1 - bgt+ LSGEMM_L4x16_BEGIN - MY_ALIGN -LSGEMM_L4x16_END: -LSGEMM_L4x8_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L4x1_END - - andi. T1, M, 8 - ble LSGEMM_L4x8_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,4 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. 
L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO4x8 - ble LSGEMM_L4x8_SUB0 - - MY_ALIGN -LSGEMM_L4x8_LOOP_START: - - LOAD4x8_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L4x8_LOOP: - - KERNEL4x8_I1_L4_2 32,16, 0,0 - KERNEL4x8_I1_L4_2 32,16, 1,0 - KERNEL4x8_I1_L4_2 32,16, 2,0 - KERNEL4x8_I1_L4_2 32,16, 3,1 - - bdnz LSGEMM_L4x8_LOOP - - MY_ALIGN -LSGEMM_L4x8_LOOP_END: - - END4x8 0, AO, BO, 32, 16 - - b LSGEMM_L4x8_SUB1 - MY_ALIGN -LSGEMM_L4x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. L, K, 31 -#endif - b LSGEMM_L4x8_SUB2 - MY_ALIGN -LSGEMM_L4x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L4x8_SAVE - MY_ALIGN -LSGEMM_L4x8_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L4x8_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L4x8_SUB2_LOOP: - LOAD4x8_0 - KERNEL4x8_I1_L4_2 32,16, 0,0 - KERNEL4x8_I1_L4_3 32,16, 1,1 - bdnz LSGEMM_L4x8_SUB2_LOOP - MY_ALIGN -LSGEMM_L4x8_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L4x8_SUB2_2 - LOAD4x8_0 - KERNEL4x8_I1_L4_3 32,16, 0,1 - MY_ALIGN -LSGEMM_L4x8_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L4x8_SUB2_1 - LOAD4x8_0 - KERNEL4x8_I1_L2_3 32,16, 0,1 - MY_ALIGN -LSGEMM_L4x8_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x8_SAVE - KERNEL4x8 0 - - - MY_ALIGN -LSGEMM_L4x8_SAVE: - SAVE4x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4 -#endif - MY_ALIGN -LSGEMM_L4x8_END: -LSGEMM_L4x4_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L4x1_END - - andi. T1, M, 4 - ble LSGEMM_L4x4_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,4 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. 
L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO4x4 - ble LSGEMM_L4x4_SUB0 - - MY_ALIGN -LSGEMM_L4x4_LOOP_START: - - LOAD4x4_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L4x4_LOOP: - - KERNEL4x4_I1_L4_2 16,16, 0,0 - KERNEL4x4_I1_L4_2 16,16, 1,0 - KERNEL4x4_I1_L4_2 16,16, 2,0 - KERNEL4x4_I1_L4_2 16,16, 3,1 - - bdnz LSGEMM_L4x4_LOOP - - MY_ALIGN -LSGEMM_L4x4_LOOP_END: - - END4x4 0, AO, BO, 16, 16 - - b LSGEMM_L4x4_SUB1 - MY_ALIGN -LSGEMM_L4x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. L, K, 31 -#endif - b LSGEMM_L4x4_SUB2 - MY_ALIGN -LSGEMM_L4x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L4x4_SAVE - MY_ALIGN -LSGEMM_L4x4_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L4x4_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L4x4_SUB2_LOOP: - LOAD4x4_0 - KERNEL4x4_I1_L4_2 16,16, 0,0 - KERNEL4x4_I1_L4_3 16,16, 1,1 - bdnz LSGEMM_L4x4_SUB2_LOOP - MY_ALIGN -LSGEMM_L4x4_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L4x4_SUB2_2 - LOAD4x4_0 - KERNEL4x4_I1_L4_3 16,16, 0,1 - MY_ALIGN -LSGEMM_L4x4_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L4x4_SUB2_1 - LOAD4x4_0 - KERNEL4x4_I1_L2_3 16,16, 0,1 - MY_ALIGN -LSGEMM_L4x4_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x4_SAVE - KERNEL4x4 0 - - - MY_ALIGN -LSGEMM_L4x4_SAVE: - SAVE4x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4 -#endif - MY_ALIGN -LSGEMM_L4x4_END: -LSGEMM_L4x2_BEGIN: - andi. T1, M, 2 - ble LSGEMM_L4x2_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,4 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. 
L, K, 3 /**(K) % 8x */ -#endif - - ZERO4x2 - ble LSGEMM_L4x2_SUB0 - - MY_ALIGN -LSGEMM_L4x2_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L4x2_LOOP: - - KERNEL4x2_2 0,0, 0,0 - KERNEL4x2_2 0,0, 1,0 - KERNEL4x2_2 0,0, 2,0 - KERNEL4x2_2 0,0, 3,1 - - bdnz LSGEMM_L4x2_LOOP - - MY_ALIGN -LSGEMM_L4x2_LOOP_END: - -LSGEMM_L4x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L4x2_SAVE - MY_ALIGN -LSGEMM_L4x2_SUB2: - andi. T1,L, 4 - ble LSGEMM_L4x2_SUB2_2 - KERNEL4x2_2 0,0, 0,0 - KERNEL4x2_2 0,0, 1,1 - MY_ALIGN -LSGEMM_L4x2_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L4x2_SUB2_1 - KERNEL4x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L4x2_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x2_SAVE - KERNEL4x2 - - MY_ALIGN -LSGEMM_L4x2_SAVE: - SAVE4x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4 -#endif - MY_ALIGN -LSGEMM_L4x2_END: -LSGEMM_L4x1_BEGIN: - andi. T1, M, 1 - ble LSGEMM_L4x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,4 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO4x1 - ble LSGEMM_L4x1_SUB0 - - MY_ALIGN -LSGEMM_L4x1_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L4x1_LOOP: - - KERNEL4x1_4 0,0, 0,0 - KERNEL4x1_4 0,0, 1,1 - - bdnz LSGEMM_L4x1_LOOP - - MY_ALIGN -LSGEMM_L4x1_LOOP_END: - -LSGEMM_L4x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L4x1_SAVE - MY_ALIGN -LSGEMM_L4x1_SUB2: - andi. T1,L, 4 - ble LSGEMM_L4x1_SUB2_2 - KERNEL4x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L4x1_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L4x1_SUB2_1 - KERNEL4x1_2 - MY_ALIGN -LSGEMM_L4x1_SUB2_1: - andi. 
T1,L, 1 - ble LSGEMM_L4x1_SAVE - KERNEL4x1 - - MY_ALIGN -LSGEMM_L4x1_SAVE: - SAVE4x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4 -#endif - MY_ALIGN -LSGEMM_L4x1_END: - - slwi T1, K, 4 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 4 -#endif - - andi. T2, N, 3 - ble .L999 - -LSGEMM_L4_END: - andi. T1, N, 2 - ble LSGEMM_L2_END -LSGEMM_L2_BEGIN: - - - mr AO, A - mr CO, C - slwi T3, LDC , 1 - add C, C, T3 - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_L2x16_END - - MY_ALIGN -LSGEMM_L2x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x16 - ble LSGEMM_L2x16_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_L2x16_LOOP: - - KERNEL2x16_4 -2048,0, 0,0 - KERNEL2x16_4 -2048,0, 1,0 - KERNEL2x16_4 -2048,0, 2,0 - KERNEL2x16_4 -2048,0, 3,0 - KERNEL2x16_4 -2048,0, 4,0 - KERNEL2x16_4 -2048,0, 5,0 - KERNEL2x16_4 -2048,0, 6,0 - KERNEL2x16_4 -2048,0, 7,0 - KERNEL2x16_4 -2048,0, 8,0 - KERNEL2x16_4 -2048,0, 9,0 - KERNEL2x16_4 -2048,0, 10,0 - KERNEL2x16_4 -2048,0, 11,0 - KERNEL2x16_4 -2048,0, 12,0 - KERNEL2x16_4 -2048,0, 13,0 - KERNEL2x16_4 -2048,0, 14,0 - KERNEL2x16_4 -2048,0, 15,1 - - bdnz LSGEMM_L2x16_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_L2x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x16_SAVE - MY_ALIGN -LSGEMM_L2x16_SUB2: - andi. T10,L, 32 - ble LSGEMM_L2x16_SUB2_16 - KERNEL2x16_4 0,0, 0,0 - KERNEL2x16_4 0,0, 1,0 - KERNEL2x16_4 0,0, 2,0 - KERNEL2x16_4 0,0, 3,0 - KERNEL2x16_4 0,0, 4,0 - KERNEL2x16_4 0,0, 5,0 - KERNEL2x16_4 0,0, 6,0 - KERNEL2x16_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_16: - andi. 
T10,L, 16 - ble LSGEMM_L2x16_SUB2_8 - KERNEL2x16_4 0,0, 0,0 - KERNEL2x16_4 0,0, 1,0 - KERNEL2x16_4 0,0, 2,0 - KERNEL2x16_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x16_SUB2_4 - KERNEL2x16_4 0,0, 0,0 - KERNEL2x16_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x16_SUB2_2 - KERNEL2x16_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x16_SUB2_1 - KERNEL2x16_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x16_SAVE - KERNEL2x16 - - MY_ALIGN -LSGEMM_L2x16_SAVE: - SAVE2x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2 -#endif - addic. I, I, -1 - bgt+ LSGEMM_L2x16_BEGIN - MY_ALIGN -LSGEMM_L2x16_END: - andi. I, M, 8 - ble LSGEMM_L2x8_END - - MY_ALIGN -LSGEMM_L2x8_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x8 - ble LSGEMM_L2x8_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_L2x8_LOOP: - - KERNEL2x8_4 -2048,0, 0,0 - KERNEL2x8_4 -2048,0, 1,0 - KERNEL2x8_4 -2048,0, 2,0 - KERNEL2x8_4 -2048,0, 3,0 - KERNEL2x8_4 -2048,0, 4,0 - KERNEL2x8_4 -2048,0, 5,0 - KERNEL2x8_4 -2048,0, 6,0 - KERNEL2x8_4 -2048,0, 7,0 - KERNEL2x8_4 -2048,0, 8,0 - KERNEL2x8_4 -2048,0, 9,0 - KERNEL2x8_4 -2048,0, 10,0 - KERNEL2x8_4 -2048,0, 11,0 - KERNEL2x8_4 -2048,0, 12,0 - KERNEL2x8_4 -2048,0, 13,0 - KERNEL2x8_4 -2048,0, 14,0 - KERNEL2x8_4 -2048,0, 15,1 - - bdnz LSGEMM_L2x8_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_L2x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x8_SAVE - MY_ALIGN -LSGEMM_L2x8_SUB2: - andi. 
T10,L, 32 - ble LSGEMM_L2x8_SUB2_16 - KERNEL2x8_4 0,0, 0,0 - KERNEL2x8_4 0,0, 1,0 - KERNEL2x8_4 0,0, 2,0 - KERNEL2x8_4 0,0, 3,0 - KERNEL2x8_4 0,0, 4,0 - KERNEL2x8_4 0,0, 5,0 - KERNEL2x8_4 0,0, 6,0 - KERNEL2x8_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x8_SUB2_8 - KERNEL2x8_4 0,0, 0,0 - KERNEL2x8_4 0,0, 1,0 - KERNEL2x8_4 0,0, 2,0 - KERNEL2x8_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x8_SUB2_4 - KERNEL2x8_4 0,0, 0,0 - KERNEL2x8_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x8_SUB2_2 - KERNEL2x8_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x8_SUB2_1 - KERNEL2x8_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x8_SAVE - KERNEL2x8 - - MY_ALIGN -LSGEMM_L2x8_SAVE: - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2 -#endif - MY_ALIGN -LSGEMM_L2x8_END: - andi. I, M, 4 - ble LSGEMM_L2x4_END - - MY_ALIGN -LSGEMM_L2x4_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x4 - ble LSGEMM_L2x4_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_L2x4_LOOP: - - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,0 - KERNEL2x4_4 0,0, 2,0 - KERNEL2x4_4 0,0, 3,0 - KERNEL2x4_4 0,0, 4,0 - KERNEL2x4_4 0,0, 5,0 - KERNEL2x4_4 0,0, 6,0 - KERNEL2x4_4 0,0, 7,0 - KERNEL2x4_4 0,0, 8,0 - KERNEL2x4_4 0,0, 9,0 - KERNEL2x4_4 0,0, 10,0 - KERNEL2x4_4 0,0, 11,0 - KERNEL2x4_4 0,0, 12,0 - KERNEL2x4_4 0,0, 13,0 - KERNEL2x4_4 0,0, 14,0 - KERNEL2x4_4 0,0, 15,1 - - bdnz LSGEMM_L2x4_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_L2x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x4_SAVE - MY_ALIGN -LSGEMM_L2x4_SUB2: - andi. 
T10,L, 32 - ble LSGEMM_L2x4_SUB2_16 - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,0 - KERNEL2x4_4 0,0, 2,0 - KERNEL2x4_4 0,0, 3,0 - KERNEL2x4_4 0,0, 4,0 - KERNEL2x4_4 0,0, 5,0 - KERNEL2x4_4 0,0, 6,0 - KERNEL2x4_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x4_SUB2_8 - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,0 - KERNEL2x4_4 0,0, 2,0 - KERNEL2x4_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x4_SUB2_4 - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x4_SUB2_2 - KERNEL2x4_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x4_SUB2_1 - KERNEL2x4_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x4_SAVE - KERNEL2x4 - - MY_ALIGN -LSGEMM_L2x4_SAVE: - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2 -#endif - MY_ALIGN -LSGEMM_L2x4_END: - andi. I, M, 2 - ble LSGEMM_L2x2_END - - MY_ALIGN -LSGEMM_L2x2_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x2 - ble LSGEMM_L2x2_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_L2x2_LOOP: - - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,0 - KERNEL2x2_4 0,0, 2,0 - KERNEL2x2_4 0,0, 3,0 - KERNEL2x2_4 0,0, 4,0 - KERNEL2x2_4 0,0, 5,0 - KERNEL2x2_4 0,0, 6,0 - KERNEL2x2_4 0,0, 7,0 - KERNEL2x2_4 0,0, 8,0 - KERNEL2x2_4 0,0, 9,0 - KERNEL2x2_4 0,0, 10,0 - KERNEL2x2_4 0,0, 11,0 - KERNEL2x2_4 0,0, 12,0 - KERNEL2x2_4 0,0, 13,0 - KERNEL2x2_4 0,0, 14,0 - KERNEL2x2_4 0,0, 15,1 - - bdnz LSGEMM_L2x2_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_L2x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x2_SAVE - MY_ALIGN -LSGEMM_L2x2_SUB2: - andi. 
T10,L, 32 - ble LSGEMM_L2x2_SUB2_16 - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,0 - KERNEL2x2_4 0,0, 2,0 - KERNEL2x2_4 0,0, 3,0 - KERNEL2x2_4 0,0, 4,0 - KERNEL2x2_4 0,0, 5,0 - KERNEL2x2_4 0,0, 6,0 - KERNEL2x2_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x2_SUB2_8 - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,0 - KERNEL2x2_4 0,0, 2,0 - KERNEL2x2_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x2_SUB2_4 - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x2_SUB2_2 - KERNEL2x2_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x2_SUB2_1 - KERNEL2x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x2_SAVE - KERNEL2x2 - - MY_ALIGN -LSGEMM_L2x2_SAVE: - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2 -#endif - MY_ALIGN -LSGEMM_L2x2_END: - andi. I, M, 1 - ble LSGEMM_L2x1_END - - MY_ALIGN -LSGEMM_L2x1_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x1 - ble LSGEMM_L2x1_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_L2x1_LOOP: - - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,0 - KERNEL2x1_4 0,0, 2,0 - KERNEL2x1_4 0,0, 3,0 - KERNEL2x1_4 0,0, 4,0 - KERNEL2x1_4 0,0, 5,0 - KERNEL2x1_4 0,0, 6,0 - KERNEL2x1_4 0,0, 7,0 - KERNEL2x1_4 0,0, 8,0 - KERNEL2x1_4 0,0, 9,0 - KERNEL2x1_4 0,0, 10,0 - KERNEL2x1_4 0,0, 11,0 - KERNEL2x1_4 0,0, 12,0 - KERNEL2x1_4 0,0, 13,0 - KERNEL2x1_4 0,0, 14,0 - KERNEL2x1_4 0,0, 15,1 - - bdnz LSGEMM_L2x1_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_L2x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x1_SAVE - MY_ALIGN -LSGEMM_L2x1_SUB2: - andi. 
T10,L, 32 - ble LSGEMM_L2x1_SUB2_16 - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,0 - KERNEL2x1_4 0,0, 2,0 - KERNEL2x1_4 0,0, 3,0 - KERNEL2x1_4 0,0, 4,0 - KERNEL2x1_4 0,0, 5,0 - KERNEL2x1_4 0,0, 6,0 - KERNEL2x1_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x1_SUB2_8 - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,0 - KERNEL2x1_4 0,0, 2,0 - KERNEL2x1_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x1_SUB2_4 - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x1_SUB2_2 - KERNEL2x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x1_SUB2_1 - KERNEL2x1_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x1_SAVE - KERNEL2x1 - - MY_ALIGN -LSGEMM_L2x1_SAVE: - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2 -#endif - MY_ALIGN -LSGEMM_L2x1_END: - slwi T1, K, 3 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif -LSGEMM_L2_END: - andi. T1, N, 1 - ble LSGEMM_END -LSGEMM_1_BEGIN: - - - mr AO, A - mr CO, C - add C, C, LDC - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_1x16_END - - MY_ALIGN -LSGEMM_1x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. 
L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x16 - ble LSGEMM_1x16_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_1x16_LOOP: - - KERNEL1x16_4 -2048,0, 0,0 - KERNEL1x16_4 -2048,0, 1,0 - KERNEL1x16_4 -2048,0, 2,0 - KERNEL1x16_4 -2048,0, 3,0 - KERNEL1x16_4 -2048,0, 4,0 - KERNEL1x16_4 -2048,0, 5,0 - KERNEL1x16_4 -2048,0, 6,0 - KERNEL1x16_4 -2048,0, 7,0 - KERNEL1x16_4 -2048,0, 8,0 - KERNEL1x16_4 -2048,0, 9,0 - KERNEL1x16_4 -2048,0, 10,0 - KERNEL1x16_4 -2048,0, 11,0 - KERNEL1x16_4 -2048,0, 12,0 - KERNEL1x16_4 -2048,0, 13,0 - KERNEL1x16_4 -2048,0, 14,0 - KERNEL1x16_4 -2048,0, 15,1 - - bdnz LSGEMM_1x16_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_1x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x16_SAVE - MY_ALIGN -LSGEMM_1x16_SUB2: - andi. T10,L, 32 - ble LSGEMM_1x16_SUB2_16 - KERNEL1x16_4 0,0, 0,0 - KERNEL1x16_4 0,0, 1,0 - KERNEL1x16_4 0,0, 2,0 - KERNEL1x16_4 0,0, 3,0 - KERNEL1x16_4 0,0, 4,0 - KERNEL1x16_4 0,0, 5,0 - KERNEL1x16_4 0,0, 6,0 - KERNEL1x16_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x16_SUB2_8 - KERNEL1x16_4 0,0, 0,0 - KERNEL1x16_4 0,0, 1,0 - KERNEL1x16_4 0,0, 2,0 - KERNEL1x16_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x16_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x16_SUB2_4 - KERNEL1x16_4 0,0, 0,0 - KERNEL1x16_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x16_SUB2_2 - KERNEL1x16_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x16_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x16_SUB2_1 - KERNEL1x16_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x16_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x16_SAVE - KERNEL1x16 - - MY_ALIGN -LSGEMM_1x16_SAVE: - SAVE1x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1 -#endif - addic. I, I, -1 - bgt+ LSGEMM_1x16_BEGIN - MY_ALIGN -LSGEMM_1x16_END: - andi. 
I, M, 8 - ble LSGEMM_1x8_END - - MY_ALIGN -LSGEMM_1x8_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x8 - ble LSGEMM_1x8_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_1x8_LOOP: - - KERNEL1x8_4 -2048,0, 0,0 - KERNEL1x8_4 -2048,0, 1,0 - KERNEL1x8_4 -2048,0, 2,0 - KERNEL1x8_4 -2048,0, 3,0 - KERNEL1x8_4 -2048,0, 4,0 - KERNEL1x8_4 -2048,0, 5,0 - KERNEL1x8_4 -2048,0, 6,0 - KERNEL1x8_4 -2048,0, 7,0 - KERNEL1x8_4 -2048,0, 8,0 - KERNEL1x8_4 -2048,0, 9,0 - KERNEL1x8_4 -2048,0, 10,0 - KERNEL1x8_4 -2048,0, 11,0 - KERNEL1x8_4 -2048,0, 12,0 - KERNEL1x8_4 -2048,0, 13,0 - KERNEL1x8_4 -2048,0, 14,0 - KERNEL1x8_4 -2048,0, 15,1 - - bdnz LSGEMM_1x8_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_1x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x8_SAVE - MY_ALIGN -LSGEMM_1x8_SUB2: - andi. T10,L, 32 - ble LSGEMM_1x8_SUB2_16 - KERNEL1x8_4 0,0, 0,0 - KERNEL1x8_4 0,0, 1,0 - KERNEL1x8_4 0,0, 2,0 - KERNEL1x8_4 0,0, 3,0 - KERNEL1x8_4 0,0, 4,0 - KERNEL1x8_4 0,0, 5,0 - KERNEL1x8_4 0,0, 6,0 - KERNEL1x8_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x8_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x8_SUB2_8 - KERNEL1x8_4 0,0, 0,0 - KERNEL1x8_4 0,0, 1,0 - KERNEL1x8_4 0,0, 2,0 - KERNEL1x8_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x8_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x8_SUB2_4 - KERNEL1x8_4 0,0, 0,0 - KERNEL1x8_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x8_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x8_SUB2_2 - KERNEL1x8_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x8_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x8_SUB2_1 - KERNEL1x8_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x8_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x8_SAVE - KERNEL1x8 - - MY_ALIGN -LSGEMM_1x8_SAVE: - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1 -#endif - MY_ALIGN -LSGEMM_1x8_END: - andi. 
I, M, 4 - ble LSGEMM_1x4_END - - MY_ALIGN -LSGEMM_1x4_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x4 - ble LSGEMM_1x4_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_1x4_LOOP: - - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,0 - KERNEL1x4_4 0,0, 2,0 - KERNEL1x4_4 0,0, 3,0 - KERNEL1x4_4 0,0, 4,0 - KERNEL1x4_4 0,0, 5,0 - KERNEL1x4_4 0,0, 6,0 - KERNEL1x4_4 0,0, 7,0 - KERNEL1x4_4 0,0, 8,0 - KERNEL1x4_4 0,0, 9,0 - KERNEL1x4_4 0,0, 10,0 - KERNEL1x4_4 0,0, 11,0 - KERNEL1x4_4 0,0, 12,0 - KERNEL1x4_4 0,0, 13,0 - KERNEL1x4_4 0,0, 14,0 - KERNEL1x4_4 0,0, 15,1 - - bdnz LSGEMM_1x4_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_1x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x4_SAVE - MY_ALIGN -LSGEMM_1x4_SUB2: - andi. T10,L, 32 - ble LSGEMM_1x4_SUB2_16 - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,0 - KERNEL1x4_4 0,0, 2,0 - KERNEL1x4_4 0,0, 3,0 - KERNEL1x4_4 0,0, 4,0 - KERNEL1x4_4 0,0, 5,0 - KERNEL1x4_4 0,0, 6,0 - KERNEL1x4_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x4_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x4_SUB2_8 - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,0 - KERNEL1x4_4 0,0, 2,0 - KERNEL1x4_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x4_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x4_SUB2_4 - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x4_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x4_SUB2_2 - KERNEL1x4_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x4_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x4_SUB2_1 - KERNEL1x4_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x4_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x4_SAVE - KERNEL1x4 - - MY_ALIGN -LSGEMM_1x4_SAVE: - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1 -#endif - MY_ALIGN -LSGEMM_1x4_END: - andi. 
I, M, 2 - ble LSGEMM_1x2_END - - MY_ALIGN -LSGEMM_1x2_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x2 - ble LSGEMM_1x2_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_1x2_LOOP: - - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,0 - KERNEL1x2_4 0,0, 2,0 - KERNEL1x2_4 0,0, 3,0 - KERNEL1x2_4 0,0, 4,0 - KERNEL1x2_4 0,0, 5,0 - KERNEL1x2_4 0,0, 6,0 - KERNEL1x2_4 0,0, 7,0 - KERNEL1x2_4 0,0, 8,0 - KERNEL1x2_4 0,0, 9,0 - KERNEL1x2_4 0,0, 10,0 - KERNEL1x2_4 0,0, 11,0 - KERNEL1x2_4 0,0, 12,0 - KERNEL1x2_4 0,0, 13,0 - KERNEL1x2_4 0,0, 14,0 - KERNEL1x2_4 0,0, 15,1 - - bdnz LSGEMM_1x2_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_1x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x2_SAVE - MY_ALIGN -LSGEMM_1x2_SUB2: - andi. T10,L, 32 - ble LSGEMM_1x2_SUB2_16 - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,0 - KERNEL1x2_4 0,0, 2,0 - KERNEL1x2_4 0,0, 3,0 - KERNEL1x2_4 0,0, 4,0 - KERNEL1x2_4 0,0, 5,0 - KERNEL1x2_4 0,0, 6,0 - KERNEL1x2_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x2_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x2_SUB2_8 - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,0 - KERNEL1x2_4 0,0, 2,0 - KERNEL1x2_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x2_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x2_SUB2_4 - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x2_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x2_SUB2_2 - KERNEL1x2_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x2_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x2_SUB2_1 - KERNEL1x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x2_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x2_SAVE - KERNEL1x2 - - MY_ALIGN -LSGEMM_1x2_SAVE: - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1 -#endif - MY_ALIGN -LSGEMM_1x2_END: - andi. 
I, M, 1 - ble LSGEMM_1x1_END - - MY_ALIGN -LSGEMM_1x1_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x1 - ble LSGEMM_1x1_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_1x1_LOOP: - - KERNEL1x1_16 0,0, 0,0 - KERNEL1x1_16 0,0, 1,0 - KERNEL1x1_16 0,0, 2,0 - KERNEL1x1_16 0,0, 3,1 - - bdnz LSGEMM_1x1_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_1x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x1_SAVE - MY_ALIGN -LSGEMM_1x1_SUB2: - andi. T10,L, 32 - ble LSGEMM_1x1_SUB2_16 - KERNEL1x1_16 0,0, 0,0 - KERNEL1x1_16 0,0, 1,1 - MY_ALIGN -LSGEMM_1x1_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x1_SUB2_8 - KERNEL1x1_16 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x1_SUB2_4 - KERNEL1x1_8 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x1_SUB2_2 - KERNEL1x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x1_SUB2_1 - KERNEL1x1_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_1: - andi. 
T10,L, 1 - ble LSGEMM_1x1_SAVE - KERNEL1x1 - - MY_ALIGN -LSGEMM_1x1_SAVE: - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1 -#endif - MY_ALIGN -LSGEMM_1x1_END: - slwi T1, K, 2 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif +#define MY_ALIGN .align 3 +b L8 + + MY_ALIGN +LSGEMM_L8x16_LMAIN_SUB: + LOAD8x16_2 + MY_ALIGN + +LSGEMM_L8x16_LOOP: + KERNEL8x16_L2 128,64,0,0 +LSGEMM_L8x16_K128: + KERNEL8x16_L2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64, 1,0 + KERNEL8x16_I1_L4_2 128,64, 2,0 + KERNEL8x16_I1_L4_2 128,64, 3,0 + KERNEL8x16_I1_L4_2 128,64, 4,0 + KERNEL8x16_I1_L4_2 128,64, 5,0 + KERNEL8x16_I1_L4_2 128,64, 6,0 + KERNEL8x16_I1_L4_2 128,64, 7,0 + KERNEL8x16_I1_L4_2 128,64, 8,0 + KERNEL8x16_I1_L4_2 128,64, 9,0 + KERNEL8x16_I1_L4_2 128,64, 10,0 + KERNEL8x16_I1_L4_2 128,64, 11,0 + KERNEL8x16_I1_L4_2 128,64, 12,0 + KERNEL8x16_I1_L4_2 128,64, 13,0 + KERNEL8x16_I1_L4_2 128,64, 14,0 + KERNEL8x16_I1_L4_2 128,64, 15,0 + KERNEL8x16_I1_L4_2 128,64, 16,0 + KERNEL8x16_I1_L4_2 128,64, 17,0 + KERNEL8x16_I1_L4_2 128,64, 18,0 + KERNEL8x16_I1_L4_2 128,64, 19,0 + KERNEL8x16_I1_L4_2 128,64, 20,0 + KERNEL8x16_I1_L4_2 128,64, 21,0 + KERNEL8x16_I1_L4_2 128,64, 22,0 + KERNEL8x16_I1_L4_2 128,64, 23,0 + KERNEL8x16_I1_L4_2 128,64, 24,0 + KERNEL8x16_I1_L4_2 128,64, 25,0 + KERNEL8x16_I1_L4_2 128,64, 26,0 + KERNEL8x16_I1_L4_2 128,64, 27,0 + KERNEL8x16_I1_L4_2 128,64, 28,0 + KERNEL8x16_I1_L4_2 128,64, 29,0 + KERNEL8x16_I1_L4_2 128,64, 30,0 + KERNEL8x16_I1_L4_2 128,64, 31,1 + bdnz LSGEMM_L8x16_LOOP + + MY_ALIGN +LSGEMM_L8x16_LOOP_END: + END8x16_2 + blr + + MY_ALIGN +LSGEMM_L8x16_L64_SUB: + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64, 0,0 + KERNEL8x16_I1_L4_2 128,64, 1,0 + KERNEL8x16_I1_L4_2 128,64, 2,0 + KERNEL8x16_I1_L4_2 128,64,3,0 + KERNEL8x16_I1_L4_2 128,64,4,0 + KERNEL8x16_I1_L4_2 128,64,5,0 + KERNEL8x16_I1_L4_2 128,64,6,0 + KERNEL8x16_I1_L4_2 128,64,7,0 + KERNEL8x16_I1_L4_2 128,64,8,0 + KERNEL8x16_I1_L4_2 128,64,9,0 + 
KERNEL8x16_I1_L4_2 128,64,10,0 + KERNEL8x16_I1_L4_2 128,64,11,0 + KERNEL8x16_I1_L4_2 128,64,12,0 + KERNEL8x16_I1_L4_2 128,64,13,0 + KERNEL8x16_I1_L4_2 128,64,14,0 + KERNEL8x16_I1_L4_3 128,64,15,1 + blr +LSGEMM_L8x16_L32_SUB: + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64,0,0 + KERNEL8x16_I1_L4_2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64,2,0 + KERNEL8x16_I1_L4_2 128,64,3,0 + KERNEL8x16_I1_L4_2 128,64,4,0 + KERNEL8x16_I1_L4_2 128,64,5,0 + KERNEL8x16_I1_L4_2 128,64,6,0 + KERNEL8x16_I1_L4_3 128,64,7,1 + blr + +LSGEMM_L8x16_L16_SUB: + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64,0,0 + KERNEL8x16_I1_L4_2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64,2,0 + KERNEL8x16_I1_L4_3 128,64,3,1 + blr + +L8: +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + + srawi. J, N, 3 + + ble LSGEMM_L8_END + +LSGEMM_L8_BEGIN: + + li T1, 128 + li T2, 256 + + mr AO, A + mr CO, C + slwi T3, LDC , 3 + add C, C, T3 + + dcbt A, T1 + dcbt A, T2 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L8x16_END + + MY_ALIGN +LSGEMM_L8x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 + mr T12, T11 + addi T12,T12, -2 + srawi. L, T12, 7 /**(T11-2) % 128x */ +#else + mr T12, K + addi T12,T12, -2 + srawi. L, T12, 7 /**(K-2) % 128x */ +#endif + + ZERO8x16 + ble LSGEMM_L8x16_SUB0 + mtctr L + bl LSGEMM_L8x16_LMAIN_SUB + andi. L, T12, 127 + ble LSGEMM_L8x16_SAVE + b LSGEMM_L8x16_SUB2 + MY_ALIGN +LSGEMM_L8x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 255 + cmpwi T11,129 +#else + andi. 
L, K, 255 + cmpwi K,129 +#endif + li T10,1 + bne CMP8x16_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD8x16 64,32 + END8x16_WITHOUT_ADD + LOAD8x16_2O AO,BO, 128, 64 + mtctr T10 + bl LSGEMM_L8x16_K128 + b LSGEMM_L8x16_SAVE +CMP8x16_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T11,128 +#else + cmpwi K,128 +#endif + bne LSGEMM_L8x16_SUB2 + MY_ALIGN + mtctr T10 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD8x16_2O AO,BO, 128,64 + bl LSGEMM_L8x16_K128 + b LSGEMM_L8x16_SAVE + MY_ALIGN +LSGEMM_L8x16_SUB2: + andi. T10,L,64 + ble LSGEMM_L8x16_SUB2_32 + bl LSGEMM_L8x16_L64_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_32: + andi. T10,L, 32 + ble LSGEMM_L8x16_SUB2_16 + bl LSGEMM_L8x16_L32_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L8x16_SUB2_8 + bl LSGEMM_L8x16_L16_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L8x16_SUB2_4 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64, 0,0 + KERNEL8x16_I1_L4_3 128,64, 1,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L8x16_SUB2_2 + LOAD8x16_2 + KERNEL8x16_I1_L4_3 128,64, 0,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L8x16_SUB2_1 + LOAD8x16_2 + KERNEL8x16_E2 128,64, 0,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L8x16_SAVE + KERNEL8x16 0 + + + MY_ALIGN +LSGEMM_L8x16_SAVE: + SAVE8x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L8x16_BEGIN + MY_ALIGN +LSGEMM_L8x16_END: +LSGEMM_L8x8_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L8x1_END + + andi. T1, M, 8 + ble LSGEMM_L8x8_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. 
L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO8x8 + ble LSGEMM_L8x8_SUB0 + + MY_ALIGN +LSGEMM_L8x8_LOOP_START: + + LOAD8x8_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L8x8_LOOP: + + KERNEL8x8_I1_L4_2 32,32, 0,0 + KERNEL8x8_I1_L4_2 32,32, 1,0 + KERNEL8x8_I1_L4_2 32,32, 2,0 + KERNEL8x8_I1_L4_2 32,32, 3,1 + + bdnz LSGEMM_L8x8_LOOP + + MY_ALIGN +LSGEMM_L8x8_LOOP_END: + + END8x8 0, AO, BO, 32, 32 + + b LSGEMM_L8x8_SUB1 + MY_ALIGN +LSGEMM_L8x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L8x8_SUB2 + MY_ALIGN +LSGEMM_L8x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L8x8_SAVE + MY_ALIGN +LSGEMM_L8x8_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L8x8_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L8x8_SUB2_LOOP: + LOAD8x8_0 + KERNEL8x8_I1_L4_2 32,32, 0,0 + KERNEL8x8_I1_L4_3 32,32, 1,1 + bdnz LSGEMM_L8x8_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x8_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L8x8_SUB2_2 + LOAD8x8_0 + KERNEL8x8_I1_L4_3 32,32, 0,1 + MY_ALIGN +LSGEMM_L8x8_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x8_SUB2_1 + LOAD8x8_0 + KERNEL8x8_I1_L2_3 32,32, 0,1 + MY_ALIGN +LSGEMM_L8x8_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x8_SAVE + KERNEL8x8 0 + + + MY_ALIGN +LSGEMM_L8x8_SAVE: + SAVE8x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8 +#endif + MY_ALIGN +LSGEMM_L8x8_END: +LSGEMM_L8x4_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L8x1_END + + andi. T1, M, 4 + ble LSGEMM_L8x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. 
L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO8x4 + ble LSGEMM_L8x4_SUB0 + + MY_ALIGN +LSGEMM_L8x4_LOOP_START: + + LOAD8x4_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L8x4_LOOP: + + KERNEL8x4_I1_L4_2 16,32, 0,0 + KERNEL8x4_I1_L4_2 16,32, 1,0 + KERNEL8x4_I1_L4_2 16,32, 2,0 + KERNEL8x4_I1_L4_2 16,32, 3,1 + + bdnz LSGEMM_L8x4_LOOP + + MY_ALIGN +LSGEMM_L8x4_LOOP_END: + + END8x4 0, AO, BO, 16, 32 + + b LSGEMM_L8x4_SUB1 + MY_ALIGN +LSGEMM_L8x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L8x4_SUB2 + MY_ALIGN +LSGEMM_L8x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L8x4_SAVE + MY_ALIGN +LSGEMM_L8x4_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L8x4_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L8x4_SUB2_LOOP: + LOAD8x4_0 + KERNEL8x4_I1_L4_2 16,32, 0,0 + KERNEL8x4_I1_L4_3 16,32, 1,1 + bdnz LSGEMM_L8x4_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x4_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L8x4_SUB2_2 + LOAD8x4_0 + KERNEL8x4_I1_L4_3 16,32, 0,1 + MY_ALIGN +LSGEMM_L8x4_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x4_SUB2_1 + LOAD8x4_0 + KERNEL8x4_I1_L2_3 16,32, 0,1 + MY_ALIGN +LSGEMM_L8x4_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x4_SAVE + KERNEL8x4 0 + + + MY_ALIGN +LSGEMM_L8x4_SAVE: + SAVE8x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8 +#endif + MY_ALIGN +LSGEMM_L8x4_END: +LSGEMM_L8x2_BEGIN: + andi. T1, M, 2 + ble LSGEMM_L8x2_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,8 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. 
L, K, 3 /**(K) % 8x */ +#endif + + ZERO8x2 + ble LSGEMM_L8x2_SUB0 + + MY_ALIGN +LSGEMM_L8x2_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L8x2_LOOP: + + KERNEL8x2_2 0,0, 0,0 + KERNEL8x2_2 0,0, 1,0 + KERNEL8x2_2 0,0, 2,0 + KERNEL8x2_2 0,0, 3,1 + + bdnz LSGEMM_L8x2_LOOP + + MY_ALIGN +LSGEMM_L8x2_LOOP_END: + +LSGEMM_L8x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L8x2_SAVE + MY_ALIGN +LSGEMM_L8x2_SUB2: + andi. T1,L, 4 + ble LSGEMM_L8x2_SUB2_2 + KERNEL8x2_2 0,0, 0,0 + KERNEL8x2_2 0,0, 1,1 + MY_ALIGN +LSGEMM_L8x2_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x2_SUB2_1 + KERNEL8x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L8x2_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x2_SAVE + KERNEL8x2 + + MY_ALIGN +LSGEMM_L8x2_SAVE: + SAVE8x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8 +#endif + MY_ALIGN +LSGEMM_L8x2_END: +LSGEMM_L8x1_BEGIN: + andi. T1, M, 1 + ble LSGEMM_L8x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,8 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO8x1 + ble LSGEMM_L8x1_SUB0 + + MY_ALIGN +LSGEMM_L8x1_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L8x1_LOOP: + + KERNEL8x1_4 0,0, 0,0 + KERNEL8x1_4 0,0, 1,1 + + bdnz LSGEMM_L8x1_LOOP + + MY_ALIGN +LSGEMM_L8x1_LOOP_END: + +LSGEMM_L8x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L8x1_SAVE + MY_ALIGN +LSGEMM_L8x1_SUB2: + andi. T1,L, 4 + ble LSGEMM_L8x1_SUB2_2 + KERNEL8x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L8x1_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x1_SUB2_1 + KERNEL8x1_2 + MY_ALIGN +LSGEMM_L8x1_SUB2_1: + andi. 
T1,L, 1 + ble LSGEMM_L8x1_SAVE + KERNEL8x1 + + MY_ALIGN +LSGEMM_L8x1_SAVE: + SAVE8x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8 +#endif + MY_ALIGN +LSGEMM_L8x1_END: + + slwi T1, K, 5 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 8 +#endif + addic. J, J, -1 + bgt LSGEMM_L8_BEGIN + + +LSGEMM_L8_END: + +/* b LSGEMM_L4_BEGIN*/ + andi. T1, N, 4 + ble LSGEMM_L4_END +LSGEMM_L4_BEGIN: + + + mr AO, A + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L4x16_END + + MY_ALIGN +LSGEMM_L4x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 6 /**(T11-1) % 64x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 6 /**(K-1) % 64x */ +#endif + + ZERO4x16 + ble LSGEMM_L4x16_SUB0 + + MY_ALIGN +LSGEMM_L4x16_LOOP_START: + + LOAD4x16_0 /*we already zeroed */ + ##OffsetA=64 OffsetB=16 + addi AO,AO,2112 + addi BO,BO,16 + + mtctr L + + MY_ALIGN + +LSGEMM_L4x16_LOOP: + + KERNEL4x16_I1_L4_2 -2048,0, 0,0 + KERNEL4x16_I1_L4_2 -2048,0, 1,0 + KERNEL4x16_I1_L4_2 -2048,0, 2,0 + KERNEL4x16_I1_L4_2 -2048,0, 3,0 + KERNEL4x16_I1_L4_2 -2048,0, 4,0 + KERNEL4x16_I1_L4_2 -2048,0, 5,0 + KERNEL4x16_I1_L4_2 -2048,0, 6,0 + KERNEL4x16_I1_L4_2 -2048,0, 7,0 + KERNEL4x16_I1_L4_2 -2048,0, 8,0 + KERNEL4x16_I1_L4_2 -2048,0, 9,0 + KERNEL4x16_I1_L4_2 -2048,0, 10,0 + KERNEL4x16_I1_L4_2 -2048,0, 11,0 + KERNEL4x16_I1_L4_2 -2048,0, 12,0 + KERNEL4x16_I1_L4_2 -2048,0, 13,0 + KERNEL4x16_I1_L4_2 -2048,0, 14,0 + KERNEL4x16_I1_L4_2 -2048,0, 15,1 + + bdnz LSGEMM_L4x16_LOOP + + MY_ALIGN +LSGEMM_L4x16_LOOP_END: + + END4x16 0, AO, BO, -2048, 0 + + b LSGEMM_L4x16_SUB1 + MY_ALIGN +LSGEMM_L4x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 127 +#else + andi. 
L, K, 127 +#endif + b LSGEMM_L4x16_SUB2 + MY_ALIGN +LSGEMM_L4x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 63 +#else + andi. L, T12, 63 +#endif + ble LSGEMM_L4x16_SAVE + MY_ALIGN +LSGEMM_L4x16_SUB2: + + srawi. T10,L, 5 + ble LSGEMM_L4x16_SUB2_16 + mtctr T10 + MY_ALIGN +LSGEMM_L4x16_SUB2_LOOP: + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_2 64,16, 1,0 + KERNEL4x16_I1_L4_2 64,16, 2,0 + KERNEL4x16_I1_L4_2 64,16, 3,0 + KERNEL4x16_I1_L4_2 64,16, 4,0 + KERNEL4x16_I1_L4_2 64,16, 5,0 + KERNEL4x16_I1_L4_2 64,16, 6,0 + KERNEL4x16_I1_L4_3 64,16, 7,1 + bdnz LSGEMM_L4x16_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L4x16_SUB2_8 + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_2 64,16, 1,0 + KERNEL4x16_I1_L4_2 64,16, 2,0 + KERNEL4x16_I1_L4_3 64,16, 3,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L4x16_SUB2_4 + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_3 64,16, 1,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L4x16_SUB2_2 + LOAD4x16_0 + KERNEL4x16_I1_L4_3 64,16, 0,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L4x16_SUB2_1 + LOAD4x16_0 + KERNEL4x16_I1_L2_3 64,16, 0,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L4x16_SAVE + KERNEL4x16 0 +# addic. L, L, -1 +# bgt LSGEMM_L4x16_SUB2 + + MY_ALIGN +LSGEMM_L4x16_SAVE: + SAVE4x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L4x16_BEGIN + MY_ALIGN +LSGEMM_L4x16_END: +LSGEMM_L4x8_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L4x1_END + + andi. T1, M, 8 + ble LSGEMM_L4x8_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. 
L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO4x8 + ble LSGEMM_L4x8_SUB0 + + MY_ALIGN +LSGEMM_L4x8_LOOP_START: + + LOAD4x8_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L4x8_LOOP: + + KERNEL4x8_I1_L4_2 32,16, 0,0 + KERNEL4x8_I1_L4_2 32,16, 1,0 + KERNEL4x8_I1_L4_2 32,16, 2,0 + KERNEL4x8_I1_L4_2 32,16, 3,1 + + bdnz LSGEMM_L4x8_LOOP + + MY_ALIGN +LSGEMM_L4x8_LOOP_END: + + END4x8 0, AO, BO, 32, 16 + + b LSGEMM_L4x8_SUB1 + MY_ALIGN +LSGEMM_L4x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L4x8_SUB2 + MY_ALIGN +LSGEMM_L4x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L4x8_SAVE + MY_ALIGN +LSGEMM_L4x8_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L4x8_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L4x8_SUB2_LOOP: + LOAD4x8_0 + KERNEL4x8_I1_L4_2 32,16, 0,0 + KERNEL4x8_I1_L4_3 32,16, 1,1 + bdnz LSGEMM_L4x8_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x8_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L4x8_SUB2_2 + LOAD4x8_0 + KERNEL4x8_I1_L4_3 32,16, 0,1 + MY_ALIGN +LSGEMM_L4x8_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x8_SUB2_1 + LOAD4x8_0 + KERNEL4x8_I1_L2_3 32,16, 0,1 + MY_ALIGN +LSGEMM_L4x8_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x8_SAVE + KERNEL4x8 0 + + + MY_ALIGN +LSGEMM_L4x8_SAVE: + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4 +#endif + MY_ALIGN +LSGEMM_L4x8_END: +LSGEMM_L4x4_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L4x1_END + + andi. T1, M, 4 + ble LSGEMM_L4x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. 
L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO4x4 + ble LSGEMM_L4x4_SUB0 + + MY_ALIGN +LSGEMM_L4x4_LOOP_START: + + LOAD4x4_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L4x4_LOOP: + + KERNEL4x4_I1_L4_2 16,16, 0,0 + KERNEL4x4_I1_L4_2 16,16, 1,0 + KERNEL4x4_I1_L4_2 16,16, 2,0 + KERNEL4x4_I1_L4_2 16,16, 3,1 + + bdnz LSGEMM_L4x4_LOOP + + MY_ALIGN +LSGEMM_L4x4_LOOP_END: + + END4x4 0, AO, BO, 16, 16 + + b LSGEMM_L4x4_SUB1 + MY_ALIGN +LSGEMM_L4x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L4x4_SUB2 + MY_ALIGN +LSGEMM_L4x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L4x4_SAVE + MY_ALIGN +LSGEMM_L4x4_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L4x4_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L4x4_SUB2_LOOP: + LOAD4x4_0 + KERNEL4x4_I1_L4_2 16,16, 0,0 + KERNEL4x4_I1_L4_3 16,16, 1,1 + bdnz LSGEMM_L4x4_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x4_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L4x4_SUB2_2 + LOAD4x4_0 + KERNEL4x4_I1_L4_3 16,16, 0,1 + MY_ALIGN +LSGEMM_L4x4_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x4_SUB2_1 + LOAD4x4_0 + KERNEL4x4_I1_L2_3 16,16, 0,1 + MY_ALIGN +LSGEMM_L4x4_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x4_SAVE + KERNEL4x4 0 + + + MY_ALIGN +LSGEMM_L4x4_SAVE: + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4 +#endif + MY_ALIGN +LSGEMM_L4x4_END: +LSGEMM_L4x2_BEGIN: + andi. T1, M, 2 + ble LSGEMM_L4x2_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,4 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. 
L, K, 3 /**(K) % 8x */ +#endif + + ZERO4x2 + ble LSGEMM_L4x2_SUB0 + + MY_ALIGN +LSGEMM_L4x2_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L4x2_LOOP: + + KERNEL4x2_2 0,0, 0,0 + KERNEL4x2_2 0,0, 1,0 + KERNEL4x2_2 0,0, 2,0 + KERNEL4x2_2 0,0, 3,1 + + bdnz LSGEMM_L4x2_LOOP + + MY_ALIGN +LSGEMM_L4x2_LOOP_END: + +LSGEMM_L4x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L4x2_SAVE + MY_ALIGN +LSGEMM_L4x2_SUB2: + andi. T1,L, 4 + ble LSGEMM_L4x2_SUB2_2 + KERNEL4x2_2 0,0, 0,0 + KERNEL4x2_2 0,0, 1,1 + MY_ALIGN +LSGEMM_L4x2_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x2_SUB2_1 + KERNEL4x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L4x2_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +LSGEMM_L4x2_SAVE: + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4 +#endif + MY_ALIGN +LSGEMM_L4x2_END: +LSGEMM_L4x1_BEGIN: + andi. T1, M, 1 + ble LSGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,4 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO4x1 + ble LSGEMM_L4x1_SUB0 + + MY_ALIGN +LSGEMM_L4x1_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L4x1_LOOP: + + KERNEL4x1_4 0,0, 0,0 + KERNEL4x1_4 0,0, 1,1 + + bdnz LSGEMM_L4x1_LOOP + + MY_ALIGN +LSGEMM_L4x1_LOOP_END: + +LSGEMM_L4x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L4x1_SAVE + MY_ALIGN +LSGEMM_L4x1_SUB2: + andi. T1,L, 4 + ble LSGEMM_L4x1_SUB2_2 + KERNEL4x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L4x1_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x1_SUB2_1 + KERNEL4x1_2 + MY_ALIGN +LSGEMM_L4x1_SUB2_1: + andi. 
T1,L, 1 + ble LSGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +LSGEMM_L4x1_SAVE: + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4 +#endif + MY_ALIGN +LSGEMM_L4x1_END: + + slwi T1, K, 4 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + + andi. T2, N, 3 + ble .L999 + +LSGEMM_L4_END: + andi. T1, N, 2 + ble LSGEMM_L2_END +LSGEMM_L2_BEGIN: + + + mr AO, A + mr CO, C + slwi T3, LDC , 1 + add C, C, T3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L2x16_END + + MY_ALIGN +LSGEMM_L2x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x16 + ble LSGEMM_L2x16_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_L2x16_LOOP: + + KERNEL2x16_4 -2048,0, 0,0 + KERNEL2x16_4 -2048,0, 1,0 + KERNEL2x16_4 -2048,0, 2,0 + KERNEL2x16_4 -2048,0, 3,0 + KERNEL2x16_4 -2048,0, 4,0 + KERNEL2x16_4 -2048,0, 5,0 + KERNEL2x16_4 -2048,0, 6,0 + KERNEL2x16_4 -2048,0, 7,0 + KERNEL2x16_4 -2048,0, 8,0 + KERNEL2x16_4 -2048,0, 9,0 + KERNEL2x16_4 -2048,0, 10,0 + KERNEL2x16_4 -2048,0, 11,0 + KERNEL2x16_4 -2048,0, 12,0 + KERNEL2x16_4 -2048,0, 13,0 + KERNEL2x16_4 -2048,0, 14,0 + KERNEL2x16_4 -2048,0, 15,1 + + bdnz LSGEMM_L2x16_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_L2x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x16_SAVE + MY_ALIGN +LSGEMM_L2x16_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x16_SUB2_16 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,0 + KERNEL2x16_4 0,0, 2,0 + KERNEL2x16_4 0,0, 3,0 + KERNEL2x16_4 0,0, 4,0 + KERNEL2x16_4 0,0, 5,0 + KERNEL2x16_4 0,0, 6,0 + KERNEL2x16_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_16: + andi. 
T10,L, 16 + ble LSGEMM_L2x16_SUB2_8 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,0 + KERNEL2x16_4 0,0, 2,0 + KERNEL2x16_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x16_SUB2_4 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x16_SUB2_2 + KERNEL2x16_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x16_SUB2_1 + KERNEL2x16_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x16_SAVE + KERNEL2x16 + + MY_ALIGN +LSGEMM_L2x16_SAVE: + SAVE2x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L2x16_BEGIN + MY_ALIGN +LSGEMM_L2x16_END: + andi. I, M, 8 + ble LSGEMM_L2x8_END + + MY_ALIGN +LSGEMM_L2x8_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x8 + ble LSGEMM_L2x8_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_L2x8_LOOP: + + KERNEL2x8_4 -2048,0, 0,0 + KERNEL2x8_4 -2048,0, 1,0 + KERNEL2x8_4 -2048,0, 2,0 + KERNEL2x8_4 -2048,0, 3,0 + KERNEL2x8_4 -2048,0, 4,0 + KERNEL2x8_4 -2048,0, 5,0 + KERNEL2x8_4 -2048,0, 6,0 + KERNEL2x8_4 -2048,0, 7,0 + KERNEL2x8_4 -2048,0, 8,0 + KERNEL2x8_4 -2048,0, 9,0 + KERNEL2x8_4 -2048,0, 10,0 + KERNEL2x8_4 -2048,0, 11,0 + KERNEL2x8_4 -2048,0, 12,0 + KERNEL2x8_4 -2048,0, 13,0 + KERNEL2x8_4 -2048,0, 14,0 + KERNEL2x8_4 -2048,0, 15,1 + + bdnz LSGEMM_L2x8_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_L2x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x8_SAVE + MY_ALIGN +LSGEMM_L2x8_SUB2: + andi. 
T10,L, 32 + ble LSGEMM_L2x8_SUB2_16 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,0 + KERNEL2x8_4 0,0, 2,0 + KERNEL2x8_4 0,0, 3,0 + KERNEL2x8_4 0,0, 4,0 + KERNEL2x8_4 0,0, 5,0 + KERNEL2x8_4 0,0, 6,0 + KERNEL2x8_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x8_SUB2_8 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,0 + KERNEL2x8_4 0,0, 2,0 + KERNEL2x8_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x8_SUB2_4 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x8_SUB2_2 + KERNEL2x8_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x8_SUB2_1 + KERNEL2x8_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +LSGEMM_L2x8_SAVE: + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2 +#endif + MY_ALIGN +LSGEMM_L2x8_END: + andi. I, M, 4 + ble LSGEMM_L2x4_END + + MY_ALIGN +LSGEMM_L2x4_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x4 + ble LSGEMM_L2x4_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x4_LOOP: + + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,0 + KERNEL2x4_4 0,0, 4,0 + KERNEL2x4_4 0,0, 5,0 + KERNEL2x4_4 0,0, 6,0 + KERNEL2x4_4 0,0, 7,0 + KERNEL2x4_4 0,0, 8,0 + KERNEL2x4_4 0,0, 9,0 + KERNEL2x4_4 0,0, 10,0 + KERNEL2x4_4 0,0, 11,0 + KERNEL2x4_4 0,0, 12,0 + KERNEL2x4_4 0,0, 13,0 + KERNEL2x4_4 0,0, 14,0 + KERNEL2x4_4 0,0, 15,1 + + bdnz LSGEMM_L2x4_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x4_SAVE + MY_ALIGN +LSGEMM_L2x4_SUB2: + andi. 
T10,L, 32 + ble LSGEMM_L2x4_SUB2_16 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,0 + KERNEL2x4_4 0,0, 4,0 + KERNEL2x4_4 0,0, 5,0 + KERNEL2x4_4 0,0, 6,0 + KERNEL2x4_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x4_SUB2_8 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x4_SUB2_4 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x4_SUB2_2 + KERNEL2x4_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x4_SUB2_1 + KERNEL2x4_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x4_SAVE + KERNEL2x4 + + MY_ALIGN +LSGEMM_L2x4_SAVE: + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2 +#endif + MY_ALIGN +LSGEMM_L2x4_END: + andi. I, M, 2 + ble LSGEMM_L2x2_END + + MY_ALIGN +LSGEMM_L2x2_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x2 + ble LSGEMM_L2x2_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x2_LOOP: + + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,0 + KERNEL2x2_4 0,0, 4,0 + KERNEL2x2_4 0,0, 5,0 + KERNEL2x2_4 0,0, 6,0 + KERNEL2x2_4 0,0, 7,0 + KERNEL2x2_4 0,0, 8,0 + KERNEL2x2_4 0,0, 9,0 + KERNEL2x2_4 0,0, 10,0 + KERNEL2x2_4 0,0, 11,0 + KERNEL2x2_4 0,0, 12,0 + KERNEL2x2_4 0,0, 13,0 + KERNEL2x2_4 0,0, 14,0 + KERNEL2x2_4 0,0, 15,1 + + bdnz LSGEMM_L2x2_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x2_SAVE + MY_ALIGN +LSGEMM_L2x2_SUB2: + andi. 
T10,L, 32 + ble LSGEMM_L2x2_SUB2_16 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,0 + KERNEL2x2_4 0,0, 4,0 + KERNEL2x2_4 0,0, 5,0 + KERNEL2x2_4 0,0, 6,0 + KERNEL2x2_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x2_SUB2_8 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x2_SUB2_4 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x2_SUB2_2 + KERNEL2x2_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x2_SUB2_1 + KERNEL2x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +LSGEMM_L2x2_SAVE: + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2 +#endif + MY_ALIGN +LSGEMM_L2x2_END: + andi. I, M, 1 + ble LSGEMM_L2x1_END + + MY_ALIGN +LSGEMM_L2x1_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x1 + ble LSGEMM_L2x1_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x1_LOOP: + + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,0 + KERNEL2x1_4 0,0, 4,0 + KERNEL2x1_4 0,0, 5,0 + KERNEL2x1_4 0,0, 6,0 + KERNEL2x1_4 0,0, 7,0 + KERNEL2x1_4 0,0, 8,0 + KERNEL2x1_4 0,0, 9,0 + KERNEL2x1_4 0,0, 10,0 + KERNEL2x1_4 0,0, 11,0 + KERNEL2x1_4 0,0, 12,0 + KERNEL2x1_4 0,0, 13,0 + KERNEL2x1_4 0,0, 14,0 + KERNEL2x1_4 0,0, 15,1 + + bdnz LSGEMM_L2x1_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x1_SAVE + MY_ALIGN +LSGEMM_L2x1_SUB2: + andi. 
T10,L, 32 + ble LSGEMM_L2x1_SUB2_16 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,0 + KERNEL2x1_4 0,0, 4,0 + KERNEL2x1_4 0,0, 5,0 + KERNEL2x1_4 0,0, 6,0 + KERNEL2x1_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x1_SUB2_8 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x1_SUB2_4 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x1_SUB2_2 + KERNEL2x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x1_SUB2_1 + KERNEL2x1_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +LSGEMM_L2x1_SAVE: + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2 +#endif + MY_ALIGN +LSGEMM_L2x1_END: + slwi T1, K, 3 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif +LSGEMM_L2_END: + andi. T1, N, 1 + ble LSGEMM_END +LSGEMM_1_BEGIN: + + + mr AO, A + mr CO, C + add C, C, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_1x16_END + + MY_ALIGN +LSGEMM_1x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. 
L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x16 + ble LSGEMM_1x16_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_1x16_LOOP: + + KERNEL1x16_4 -2048,0, 0,0 + KERNEL1x16_4 -2048,0, 1,0 + KERNEL1x16_4 -2048,0, 2,0 + KERNEL1x16_4 -2048,0, 3,0 + KERNEL1x16_4 -2048,0, 4,0 + KERNEL1x16_4 -2048,0, 5,0 + KERNEL1x16_4 -2048,0, 6,0 + KERNEL1x16_4 -2048,0, 7,0 + KERNEL1x16_4 -2048,0, 8,0 + KERNEL1x16_4 -2048,0, 9,0 + KERNEL1x16_4 -2048,0, 10,0 + KERNEL1x16_4 -2048,0, 11,0 + KERNEL1x16_4 -2048,0, 12,0 + KERNEL1x16_4 -2048,0, 13,0 + KERNEL1x16_4 -2048,0, 14,0 + KERNEL1x16_4 -2048,0, 15,1 + + bdnz LSGEMM_1x16_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_1x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x16_SAVE + MY_ALIGN +LSGEMM_1x16_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x16_SUB2_16 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,0 + KERNEL1x16_4 0,0, 2,0 + KERNEL1x16_4 0,0, 3,0 + KERNEL1x16_4 0,0, 4,0 + KERNEL1x16_4 0,0, 5,0 + KERNEL1x16_4 0,0, 6,0 + KERNEL1x16_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x16_SUB2_8 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,0 + KERNEL1x16_4 0,0, 2,0 + KERNEL1x16_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x16_SUB2_4 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x16_SUB2_2 + KERNEL1x16_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x16_SUB2_1 + KERNEL1x16_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x16_SAVE + KERNEL1x16 + + MY_ALIGN +LSGEMM_1x16_SAVE: + SAVE1x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1 +#endif + addic. I, I, -1 + bgt+ LSGEMM_1x16_BEGIN + MY_ALIGN +LSGEMM_1x16_END: + andi. 
I, M, 8 + ble LSGEMM_1x8_END + + MY_ALIGN +LSGEMM_1x8_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x8 + ble LSGEMM_1x8_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_1x8_LOOP: + + KERNEL1x8_4 -2048,0, 0,0 + KERNEL1x8_4 -2048,0, 1,0 + KERNEL1x8_4 -2048,0, 2,0 + KERNEL1x8_4 -2048,0, 3,0 + KERNEL1x8_4 -2048,0, 4,0 + KERNEL1x8_4 -2048,0, 5,0 + KERNEL1x8_4 -2048,0, 6,0 + KERNEL1x8_4 -2048,0, 7,0 + KERNEL1x8_4 -2048,0, 8,0 + KERNEL1x8_4 -2048,0, 9,0 + KERNEL1x8_4 -2048,0, 10,0 + KERNEL1x8_4 -2048,0, 11,0 + KERNEL1x8_4 -2048,0, 12,0 + KERNEL1x8_4 -2048,0, 13,0 + KERNEL1x8_4 -2048,0, 14,0 + KERNEL1x8_4 -2048,0, 15,1 + + bdnz LSGEMM_1x8_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_1x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x8_SAVE + MY_ALIGN +LSGEMM_1x8_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x8_SUB2_16 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,0 + KERNEL1x8_4 0,0, 2,0 + KERNEL1x8_4 0,0, 3,0 + KERNEL1x8_4 0,0, 4,0 + KERNEL1x8_4 0,0, 5,0 + KERNEL1x8_4 0,0, 6,0 + KERNEL1x8_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x8_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x8_SUB2_8 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,0 + KERNEL1x8_4 0,0, 2,0 + KERNEL1x8_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x8_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x8_SUB2_4 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x8_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x8_SUB2_2 + KERNEL1x8_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x8_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x8_SUB2_1 + KERNEL1x8_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x8_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x8_SAVE + KERNEL1x8 + + MY_ALIGN +LSGEMM_1x8_SAVE: + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1 +#endif + MY_ALIGN +LSGEMM_1x8_END: + andi. 
I, M, 4 + ble LSGEMM_1x4_END + + MY_ALIGN +LSGEMM_1x4_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x4 + ble LSGEMM_1x4_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x4_LOOP: + + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,0 + KERNEL1x4_4 0,0, 4,0 + KERNEL1x4_4 0,0, 5,0 + KERNEL1x4_4 0,0, 6,0 + KERNEL1x4_4 0,0, 7,0 + KERNEL1x4_4 0,0, 8,0 + KERNEL1x4_4 0,0, 9,0 + KERNEL1x4_4 0,0, 10,0 + KERNEL1x4_4 0,0, 11,0 + KERNEL1x4_4 0,0, 12,0 + KERNEL1x4_4 0,0, 13,0 + KERNEL1x4_4 0,0, 14,0 + KERNEL1x4_4 0,0, 15,1 + + bdnz LSGEMM_1x4_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x4_SAVE + MY_ALIGN +LSGEMM_1x4_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x4_SUB2_16 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,0 + KERNEL1x4_4 0,0, 4,0 + KERNEL1x4_4 0,0, 5,0 + KERNEL1x4_4 0,0, 6,0 + KERNEL1x4_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x4_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x4_SUB2_8 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x4_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x4_SUB2_4 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x4_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x4_SUB2_2 + KERNEL1x4_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x4_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x4_SUB2_1 + KERNEL1x4_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x4_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x4_SAVE + KERNEL1x4 + + MY_ALIGN +LSGEMM_1x4_SAVE: + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1 +#endif + MY_ALIGN +LSGEMM_1x4_END: + andi. 
I, M, 2 + ble LSGEMM_1x2_END + + MY_ALIGN +LSGEMM_1x2_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x2 + ble LSGEMM_1x2_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x2_LOOP: + + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,0 + KERNEL1x2_4 0,0, 4,0 + KERNEL1x2_4 0,0, 5,0 + KERNEL1x2_4 0,0, 6,0 + KERNEL1x2_4 0,0, 7,0 + KERNEL1x2_4 0,0, 8,0 + KERNEL1x2_4 0,0, 9,0 + KERNEL1x2_4 0,0, 10,0 + KERNEL1x2_4 0,0, 11,0 + KERNEL1x2_4 0,0, 12,0 + KERNEL1x2_4 0,0, 13,0 + KERNEL1x2_4 0,0, 14,0 + KERNEL1x2_4 0,0, 15,1 + + bdnz LSGEMM_1x2_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x2_SAVE + MY_ALIGN +LSGEMM_1x2_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x2_SUB2_16 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,0 + KERNEL1x2_4 0,0, 4,0 + KERNEL1x2_4 0,0, 5,0 + KERNEL1x2_4 0,0, 6,0 + KERNEL1x2_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x2_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x2_SUB2_8 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x2_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x2_SUB2_4 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x2_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x2_SUB2_2 + KERNEL1x2_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x2_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x2_SUB2_1 + KERNEL1x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x2_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x2_SAVE + KERNEL1x2 + + MY_ALIGN +LSGEMM_1x2_SAVE: + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1 +#endif + MY_ALIGN +LSGEMM_1x2_END: + andi. 
I, M, 1 + ble LSGEMM_1x1_END + + MY_ALIGN +LSGEMM_1x1_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x1 + ble LSGEMM_1x1_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x1_LOOP: + + KERNEL1x1_16 0,0, 0,0 + KERNEL1x1_16 0,0, 1,0 + KERNEL1x1_16 0,0, 2,0 + KERNEL1x1_16 0,0, 3,1 + + bdnz LSGEMM_1x1_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x1_SAVE + MY_ALIGN +LSGEMM_1x1_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x1_SUB2_16 + KERNEL1x1_16 0,0, 0,0 + KERNEL1x1_16 0,0, 1,1 + MY_ALIGN +LSGEMM_1x1_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x1_SUB2_8 + KERNEL1x1_16 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x1_SUB2_4 + KERNEL1x1_8 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x1_SUB2_2 + KERNEL1x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x1_SUB2_1 + KERNEL1x1_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x1_SAVE + KERNEL1x1 + + MY_ALIGN +LSGEMM_1x1_SAVE: + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1 +#endif + MY_ALIGN +LSGEMM_1x1_END: + slwi T1, K, 2 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif LSGEMM_END: \ No newline at end of file diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S index 2c9e537c7e..3750d338d5 100644 --- a/kernel/power/sgemm_macros_power9.S +++ b/kernel/power/sgemm_macros_power9.S @@ -1,5575 +1,5575 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#define unit_size 4 -#define DISP64(ind,disp) (ind*unit_size*64+disp) -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) - -/********************************************************************************************** -* Macros for N=8 and M=16 -**********************************************************************************************/ - - - -.macro KERNEL8x16_L1_L4 Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero8X16 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, 
vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endm - -.macro LOAD8x16 OffsetA,OffsetB - - lxv vs24, (\OffsetB+0)(BO) - lxv vs28, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endm - -.macro END8x16_NORMAL - END8x16 0, AO, BO, 64,32 -.endm - -.macro END8x16_WITHOUT_ADD - END8x16 0, AO,BO,0,0 -.endm - -.macro END8x16 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 - xvmulsp vs43, vs3,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - xvmulsp vs50, vs2,vs28 - xvmulsp vs51, vs3,vs28 - - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - xvmulsp vs54, vs2,vs29 - xvmulsp vs55, vs3,vs29 - - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - xvmulsp vs58, vs2,vs30 - xvmulsp vs59, vs3,vs30 - - xvmulsp vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - xvmulsp vs62, vs2,vs31 - xvmulsp vs63, vs3,vs31 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, 
vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 - -.endif -.endm - -.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - -KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 -KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete - -.endm - -.macro KERNEL8x16 First - - LOAD8x16 0,0 - END8x16 \First, AO, BO, 64,32 -.endm - -.macro LOAD8x16_2 - LOAD8x16_2O AO,BO, 0,0 -.endm - -.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB - lxv vs8, (\OffsetB)(\BREG) - lxv vs12, (16+\OffsetB)(\BREG) - lxv vs24, (32+\OffsetB)(\BREG) - lxv vs28, (32+16+\OffsetB)(\BREG) - lxv vs4, (0+\OffsetA)(\AREG) - lxv vs5, (16+\OffsetA)(\AREG) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - lxv vs6, (32+\OffsetA)(\AREG) - lxv vs7, (48+\OffsetA)(\AREG) - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - lxv vs0, (64+\OffsetA)(\AREG) - lxv vs1, (64+16+\OffsetA)(\AREG) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - lxv vs2, (64+32+\OffsetA)(\AREG) - lxv vs3, (64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - -.macro END8x16_2 - /*for load2 offset will be 128 and 64*/ - 
KERNEL8x16_2 AO,BO, 128,64,0 ,1,1 -.endm - - - -.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - -.if \Complete==0 - lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 -.if \Complete==0 - lxv vs8, DISP16(\Index,\OffsetB)(\BREG) - lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask -.endif - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 -.if \Complete==0 - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.endif - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 -.endif - -.if \Complete==0 - lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, 
vs1,vs26 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.if \Complete==0 - lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 -.if \Complete==0 - lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endif -.if \Complete==0 - lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) -.endif - - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP16(\Index,\OffsetB) - addi \AREG, \AREG, DISP32(\Index,\OffsetA) - -.else - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP32(\Index,128) - -.endif -.endif - - -.endm - - -.macro SAVE8x16 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - add T4, T2, T10 - add T5, T3, T10 - - add T6, T4, T10 - add T7, T5, T10 - - - - /* permute to restore butterfly rank 1 updateto normal promoted one */ - /* permute 16 vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */ - /* permute 16 vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) 
vs15 MEM(16+CO+3*LDC) */ - /* permute 16 vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */ - /* permute 16 vs24 MEM(32+CO) vs25 MEM(32+CO+LDC) vs26 MEM(32+CO+2*LDC) vs27 MEM(32+CO+3*LDC) */ - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 -#ifndef TRMMKERNEL - lxv vs32, 0(CO) - lxv vs33, 16(CO) -#endif - xxmrglw vs16, vs34, vs46 - xxmrglw vs18, vs38, vs42 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxmrghw vs4, vs38, vs42 - xxmrghw vs5, vs34, vs46 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxmrglw vs24, vs35, vs47 - xxmrglw vs26, vs39, vs43 - - xxlor vs17, vs16, vs16 - xxlor vs19, vs18, vs18 - - xxmrghw vs30, vs39, vs43 - xxmrghw vs31, vs35, vs47 -#ifndef TRMMKERNEL - lxv vs34, 32(CO) - lxv vs35, 48(CO) -#endif - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 -#ifndef TRMMKERNEL - lxv vs36, 0(T1) - lxv vs37, 16(T1) -#endif - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - -#ifndef TRMMKERNEL - lxv vs38, 32(T1) - lxv vs39, 48(T1) -#endif - - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, vs26 - - - -#ifndef TRMMKERNEL - lxv vs40, 0(T2) - lxv vs41, 16(T2) -#endif - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 -#ifndef TRMMKERNEL - lxv vs42, 32(T2) - lxv vs43, 48(T2) -#endif - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 -#ifndef TRMMKERNEL - lxv vs44, 0(T3) - lxv vs45, 16(T3) -#endif - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 -#ifndef TRMMKERNEL - lxv vs46, 32(T3) - lxv vs47, 48(T3) -#endif - - - - - - xxperm vs17, vs4, save_permute_2 - xxperm vs19, vs5, save_permute_2 -#ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r -#endif - xxperm 
vs24, vs30, save_permute_1 - xxperm vs26, vs31, save_permute_1 - - - stxv vs32, 0(CO) - stxv vs33, 16(CO) -#ifdef TRMMKERNEL - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r -#else - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r -#endif - - xxperm vs25, vs30, save_permute_2 - xxperm vs27, vs31, save_permute_2 - - - stxv vs34, 32(CO) - stxv vs35, 48(CO) -#ifdef TRMMKERNEL - xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r -#else - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r -#endif - stxv vs36, 0(T1) - stxv vs37, 16(T1) -#ifdef TRMMKERNEL - xvmulsp vs38, vs17, alpha_r - xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs38, vs17, alpha_r - xvmaddasp vs39, vs25, alpha_r -#endif - stxv vs38, 32(T1) - stxv vs39, 48(T1) - -#ifdef TRMMKERNEL - xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r -#else - xvmaddasp vs40, vs10, alpha_r - xvmaddasp vs41, vs14, alpha_r -#endif - - stxv vs40, 0(T2) - stxv vs41, 16(T2) -#ifdef TRMMKERNEL - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r -#else - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r -#endif - stxv vs42, 32(T2) - stxv vs43, 48(T2) -#ifdef TRMMKERNEL - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r -#else - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r -#endif - stxv vs44, 0(T3) - stxv vs45, 16(T3) -#ifdef TRMMKERNEL - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r -#endif - stxv vs46, 32(T3) - stxv vs47, 48(T3) - - /*****the same with the second 8X8 ****/ - #ifndef TRMMKERNEL - lxv vs32, 0(T4) - lxv vs33, 16(T4) -#endif - xxmrglw vs8, vs48, vs60 - xxmrglw vs10, vs52, vs56 -#ifndef TRMMKERNEL - lxv vs34, 32(T4) - lxv vs35, 48(T4) -#endif - xxmrghw vs1, vs48, vs60 - xxmrghw vs0, vs52, vs56 -#ifndef TRMMKERNEL - lxv vs36, 0(T5) - lxv vs37, 16(T5) -#endif - xxmrglw vs12, vs49, vs61 - xxmrglw vs14, vs53, vs57 -#ifndef TRMMKERNEL - lxv 
vs38,32(T5) - lxv vs39, 48(T5) -#endif - - xxmrghw vs2, vs53, vs57 - xxmrghw vs3, vs49, vs61 -#ifndef TRMMKERNEL - lxv vs40, 0(T6) - lxv vs41, 16(T6) -#endif - xxmrglw vs16, vs50, vs62 - xxmrglw vs18, vs54, vs58 -#ifndef TRMMKERNEL - lxv vs42, 32(T6) - lxv vs43, 48(T6) -#endif - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - xxmrghw vs4, vs54, vs58 - xxmrghw vs5, vs50, vs62 -#ifndef TRMMKERNEL - lxv vs44, 0(T7) - lxv vs45, 16(T7) -#endif - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxmrglw vs24, vs51, vs63 - xxmrglw vs26, vs55, vs59 -#ifndef TRMMKERNEL - lxv vs46, 32(T7) - lxv vs47, 48(T7) -#endif - xxlor vs17, vs16, vs16 - xxlor vs19, vs18, vs18 - xxmrghw vs30, vs55, vs59 - xxmrghw vs31, vs51, vs63 - - - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, vs26 - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - #ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r -#endif - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 - stxv vs32, 0(T4) - stxv vs33, 16(T4) - xxperm vs17, vs4, save_permute_2 - xxperm vs19, vs5, save_permute_2 - xxperm vs24, vs30, save_permute_1 - xxperm vs26, vs31, save_permute_1 - xxperm vs25, vs30, save_permute_2 - xxperm vs27, vs31, save_permute_2 - -#ifdef TRMMKERNEL - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r -#else - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r -#endif - stxv vs34, 32(T4) - stxv vs35, 48(T4) - -#ifdef TRMMKERNEL - xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r -#else - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r -#endif - stxv vs36, 0(T5) - stxv vs37, 16(T5) - -#ifdef TRMMKERNEL - xvmulsp vs38, vs17, alpha_r - xvmulsp 
vs39, vs25, alpha_r -#else - xvmaddasp vs38, vs17, alpha_r - xvmaddasp vs39, vs25, alpha_r -#endif - - - - - stxv vs38, 32(T5) - stxv vs39, 48(T5) - - -#ifdef TRMMKERNEL - xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r -#else - xvmaddasp vs40, vs10, alpha_r - xvmaddasp vs41, vs14, alpha_r -#endif - stxv vs40, 0(T6) - stxv vs41, 16(T6) -#ifdef TRMMKERNEL - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r -#else - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r -#endif - stxv vs42, 32(T6) - stxv vs43, 48(T6) -#ifdef TRMMKERNEL - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r -#else - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r -#endif - - stxv vs44, 0(T7) - stxv vs45, 16(T7) -#ifdef TRMMKERNEL - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r -#endif - - stxv vs46, 32(T7) - stxv vs47, 48(T7) - - - addi CO,CO,64 - - -.endm - - - -/********************************************************************************************** -* Macros for N=8 and M=8 -**********************************************************************************************/ - -.macro LOAD8x8_1 - LOAD8x8 1 -.endm - -.macro LOAD8x8_0 - LOAD8x8 0 -.endm - -.macro KERNEL8x8_L1_L4 Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I \AREG,\BREG, 
\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro END8x8_NORMAL - END8x8 0, AO, BO, 32,32 -.endm - -.macro Zero8X8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - -.endm - -.macro LOAD8x8 Zero - - lxv vs24, 0(BO) - lxv vs28, 16(BO) - lxv vs0, 0(AO) - lxv vs1, 16(AO) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 -.endif -.endm - - -.macro END8x8 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - - xvmulsp 
vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - -.endif -.endm - -.macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) - - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - - lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) - - - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - 
xvmaddasp vs45, vs5,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - - - lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) - - - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 -.if \Complete==0 - lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - - -.if \Complete==0 - lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP32(\Index,128) - addi \AREG, \AREG, DISP32(\Index,128) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 
- - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - -.endm - -.macro KERNEL8x8 First - - LOAD8x8 0 - END8x8 \First, AO, BO, 32,32 -.endm - -.macro KERNEL8x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) - - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - -.endif - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - - xvmulsp vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - -.else - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - -.endif -.if \Complete==0 - lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) - - lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - 
xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) - addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) - -.else - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP16(\Index,64) -.endif -.endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - - xvmulsp vs48, vs4,vs12 - xvmulsp vs49, vs5,vs12 - - xvmulsp vs52, vs4,vs13 - xvmulsp vs53, vs5,vs13 - - xvmulsp vs56, vs4,vs14 - xvmulsp vs57, vs5,vs14 - - xvmulsp vs60, vs4,vs15 - xvmulsp vs61, vs5,vs15 - -.else - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - -.endif - -.endm - - -.macro SAVE8x8 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - add T4, T2, T10 - add T5, T3, T10 - - add T6, T4, T10 - add T7, T5, T10 - -#ifndef TRMMKERNEL - lxv vs34, 0(CO) - lxv vs35, 16(CO) - lxv vs38, 0(T1) - lxv vs39, 16(T1) - lxv vs42, 0(T2) - lxv vs43, 16(T2) - lxv vs46, 0(T3) - lxv vs47, 16(T3) - - lxv vs50, 0(T4) - lxv vs51, 16(T4) - lxv vs54, 0(T5) - lxv vs55, 16(T5) - lxv vs58, 0(T6) - lxv vs59, 16(T6) - lxv vs62, 0(T7) - lxv vs63, 16(T7) -#endif - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw 
vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs34, vs8, alpha_r - xvmulsp vs35, vs12, alpha_r - xvmulsp vs38, vs9, alpha_r - xvmulsp vs39, vs13, alpha_r - xvmulsp vs42, vs10, alpha_r - xvmulsp vs43, vs14, alpha_r - xvmulsp vs46, vs11, alpha_r - xvmulsp vs47, vs15, alpha_r -#else - xvmaddasp vs34, vs8, alpha_r - xvmaddasp vs35, vs12, alpha_r - xvmaddasp vs38, vs9, alpha_r - xvmaddasp vs39, vs13, alpha_r - xvmaddasp vs42, vs10, alpha_r - xvmaddasp vs43, vs14, alpha_r - xvmaddasp vs46, vs11, alpha_r - xvmaddasp vs47, vs15, alpha_r -#endif - - - xxmrglw vs8, vs48, vs60 - xxmrglw vs10, vs52, vs56 - - xxmrghw vs1, vs48, vs60 - xxmrghw vs0, vs52, vs56 - stxv vs34, 0(CO) - stxv vs35, 16(CO) - xxmrglw vs12, vs49, vs61 - xxmrglw vs14, vs53, vs57 - stxv vs38, 0(T1) - stxv vs39, 16(T1) - xxmrghw vs2, vs53, vs57 - xxmrghw vs3, vs49, vs61 - stxv vs42, 0(T2) - stxv vs43, 16(T2) - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - stxv vs46, 0(T3) - stxv vs47, 16(T3) - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - - - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - #ifdef TRMMKERNEL - xvmulsp vs50, vs8, alpha_r - xvmulsp vs51, vs12, alpha_r - xvmulsp vs54, vs9, alpha_r - xvmulsp vs55, vs13, alpha_r - xvmulsp vs58, vs10, alpha_r - xvmulsp vs59, vs14, alpha_r - xvmulsp vs62, vs11, 
alpha_r - xvmulsp vs63, vs15, alpha_r -#else - xvmaddasp vs50, vs8, alpha_r - xvmaddasp vs51, vs12, alpha_r - xvmaddasp vs54, vs9, alpha_r - xvmaddasp vs55, vs13, alpha_r - xvmaddasp vs58, vs10, alpha_r - xvmaddasp vs59, vs14, alpha_r - xvmaddasp vs62, vs11, alpha_r - xvmaddasp vs63, vs15, alpha_r -#endif - - stxv vs50, 0(T4) - stxv vs51, 16(T4) - stxv vs54, 0(T5) - stxv vs55, 16(T5) - stxv vs58, 0(T6) - stxv vs59, 16(T6) - stxv vs62, 0(T7) - stxv vs63, 16(T7) - - addi CO,CO,32 - -.endm - - -/********************************************************************************************** -* Macros for N=8 and M=4 -**********************************************************************************************/ - -.macro LOAD8x4_1 - LOAD8x4 1 -.endm - -.macro LOAD8x4_0 - LOAD8x4 0 -.endm - -.macro KERNEL8x4_L1_L4 Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero8X4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - -.endm - -.macro LOAD8x4 Zero - - lxv vs0, 0(AO) - lxv vs24, 0(BO) - lxv vs25, 16(BO) - 
- - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 -.endif -.endm - -.macro END8x4_NORMAL - END8x4 0, AO, BO, 16,32 -.endm - -.macro END8x4 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 - - xvmulsp vs48, vs25, vs0 - xvmulsp vs49, vs25, vs1 - xvmulsp vs50, vs25, vs2 - xvmulsp vs51, vs25, vs3 -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 - -.endif -.endm - -.macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 - - lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG) - lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - xvmaddasp vs48, vs27, vs4 - xvmaddasp vs49, vs27, vs5 - xvmaddasp vs50, vs27, vs6 - 
xvmaddasp vs51, vs27, vs7 - - - lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG) - lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 - -.if \Complete==0 - - lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) - lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG) - lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - xvmaddasp vs48, vs27, vs4 - xvmaddasp vs49, vs27, vs5 - xvmaddasp vs50, vs27, vs6 - xvmaddasp vs51, vs27, vs7 - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) - addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) - -.else - addi \AREG, \AREG, DISP16(\Index,64) - addi \BREG, \BREG, DISP32(\Index,128) - -.endif -.endif - - -.endm - -.macro KERNEL8x4 First - LOAD8x4 0 - END8x4 \First, AO, BO, 16,32 -.endm - -.macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 - - xvmulsp vs48, vs25, vs0 - xvmulsp vs49, vs25, vs1 - xvmulsp vs50, vs25, vs2 - xvmulsp vs51, vs25, vs3 -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - 
xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 -.endif - -.if \Complete==0 - - lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG) - lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - -.if \First==1 - xvmulsp vs32, vs26, vs4 - xvmulsp vs33, vs26, vs5 - xvmulsp vs34, vs26, vs6 - xvmulsp vs35, vs26, vs7 - - xvmulsp vs48, vs27, vs4 - xvmulsp vs49, vs27, vs5 - xvmulsp vs50, vs27, vs6 - xvmulsp vs51, vs27, vs7 - - -.else - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - xvmaddasp vs48, vs27, vs4 - xvmaddasp vs49, vs27, vs5 - xvmaddasp vs50, vs27, vs6 - xvmaddasp vs51, vs27, vs7 -.endif - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) - -.else - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP16(\Index,64) - -.endif -.endif - - -.endm - - -.macro SAVE8x4 - slwi T10, LDC , 1 - add T1, CO, LDC -#if !defined(TRMMKERNEL) - lxv vs36, 0(CO) - lxv vs37, 0(T1) -#endif - add T2, CO, T10 - add T3, T1, T10 -#if !defined(TRMMKERNEL) - lxv vs38, 0(T2) - lxv vs39, 0(T3) -#endif - add T4, T2, T10 - add T5, T3, T10 -#if !defined(TRMMKERNEL) - lxv vs40, 0(T4) - lxv vs41, 0(T5) -#endif - add T6, T4, T10 - add T7, T5, T10 -#if !defined(TRMMKERNEL) - lxv vs42, 0(T6) - lxv vs43, 0(T7) -#endif - xxmrglw vs0, vs35,vs32 - xxmrglw vs1, vs34,vs33 - xxmrglw vs4, vs32,vs35 - xxmrglw vs5, vs33,vs34 - - - xxmrghw vs2, vs35,vs32 - xxmrghw vs3, vs34,vs33 - xxmrghw vs6, vs32,vs35 - xxmrghw vs7, vs33,vs34 - - xxmrgld vs24, vs1, vs0 - xxmrghd vs25,vs5,vs4 - - xxmrgld vs26, vs2, vs3 - xxmrghd vs27,vs6,vs7 - - - xxmrglw vs0, vs51,vs48 - xxmrglw vs1, vs50,vs49 - xxmrglw vs4, vs48,vs51 - xxmrglw vs5, vs49,vs50 - - xxmrghw vs2, vs51,vs48 - xxmrghw vs3, vs50,vs49 - xxmrghw vs6, vs48,vs51 - 
xxmrghw vs7, vs49,vs50 - - xxmrgld vs28, vs1, vs0 - xxmrghd vs29,vs5,vs4 - - xxmrgld vs30, vs2, vs3 - xxmrghd vs31,vs6,vs7 -#if defined(TRMMKERNEL) - - xvmulsp vs36, vs24, alpha_r - xvmulsp vs37, vs25, alpha_r - xvmulsp vs38, vs26, alpha_r - xvmulsp vs39, vs27, alpha_r - xvmulsp vs40, vs28, alpha_r - xvmulsp vs41, vs29, alpha_r - xvmulsp vs42, vs30, alpha_r - xvmulsp vs43, vs31, alpha_r -#else - xvmaddasp vs36, vs24, alpha_r - xvmaddasp vs37, vs25, alpha_r - xvmaddasp vs38, vs26, alpha_r - xvmaddasp vs39, vs27, alpha_r - xvmaddasp vs40, vs28, alpha_r - xvmaddasp vs41, vs29, alpha_r - xvmaddasp vs42, vs30, alpha_r - xvmaddasp vs43, vs31, alpha_r -#endif - - stxv vs36, 0(CO) - stxv vs37, 0(T1) - stxv vs38, 0(T2) - stxv vs39, 0(T3) - stxv vs40, 0(T4) - stxv vs41, 0(T5) - stxv vs42, 0(T6) - stxv vs43, 0(T7) - - - addi CO,CO,16 -.endm - - -/********************************************************************************************** -* Macros for N=8 and M=2 -**********************************************************************************************/ - - -.macro KERNEL8x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - - -.macro Zero8x2 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 - -.endm - -.macro KERNEL8x2 - KERNEL8x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG) - xxspltw vs8, vs36, 0 - xxspltw vs9, vs36, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs26, vs9 - xvmulsp vs3, vs27, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs26, vs9 - xvmaddasp vs3, vs27, vs9 - - .endif - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP8(\Index,32) - -.endm - -.macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, 
Index,IsLast - - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG) - xxspltw vs8, vs4, 2 - xxspltw vs9, vs4, 3 - xxspltw vs10, vs4, 0 - xxspltw vs11, vs4, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs26, vs9 - xvmulsp vs3, vs27, vs9 - - xvmulsp vs0, vs28, vs10 - xvmulsp vs1, vs29, vs10 - xvmulsp vs2, vs28, vs11 - xvmulsp vs3, vs29, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs26, vs9 - xvmaddasp vs3, vs27, vs9 - - xvmaddasp vs0, vs28, vs10 - xvmaddasp vs1, vs29, vs10 - xvmaddasp vs2, vs28, vs11 - xvmaddasp vs3, vs29, vs11 - .endif - - -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP16(\Index,64) -.endif - -.endm - - -.macro SAVE8x2 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - add T4, T2, T10 - add T5, T3, T10 - add T6, T4, T10 - add T7, T5, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v1,4(CO) - - lxssp v2,0(T1) - lxssp v3,4(T1) - - lxssp v4,0(T2) - lxssp v5,4(T2) - - lxssp v6,0(T3) - lxssp v7,4(T3) - - lxssp v8,0(T4) - lxssp v9,4(T4) - - lxssp v10,0(T5) - lxssp v11,4(T5) - - lxssp v12,0(T6) - lxssp v13,4(T6) - - lxssp v14,0(T7) - lxssp v15,4(T7) -#endif - xscvspdp vs5, vs2 - xxspltw vs6, vs2, 1 - xxspltw vs7, vs2, 2 - xxspltw vs8, vs2, 3 - xscvspdp vs6,vs6 - xscvspdp vs7,vs7 - xscvspdp vs8,vs8 - - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - - xscvspdp vs9, vs3 - xxspltw vs10, vs3, 1 - xxspltw vs11, vs3, 2 - xxspltw vs12, vs3, 3 - xscvspdp vs10,vs10 - xscvspdp vs11,vs11 - xscvspdp vs12,vs12 - - xscvspdp vs28, vs1 - xxspltw vs29, vs1, 1 
- xxspltw vs30, vs1, 2 - xxspltw vs31, vs1, 3 - xscvspdp vs29,vs29 - xscvspdp vs30,vs30 - xscvspdp vs31,vs31 - - - - -#if defined(TRMMKERNEL) - xsmuldp vs32,vs8, vs4 - xsmuldp vs33,vs27, vs4 - - xsmuldp vs34,vs7, vs4 - xsmuldp vs35,vs26, vs4 - - xsmuldp vs36,vs6, vs4 - xsmuldp vs37,vs25, vs4 - - xsmuldp vs38,vs5, vs4 - xsmuldp vs39,vs24, vs4 - - xsmuldp vs40,vs12, vs4 - xsmuldp vs41,vs31, vs4 - - xsmuldp vs42,vs11, vs4 - xsmuldp vs43,vs30, vs4 - - xsmuldp vs44,vs10, vs4 - xsmuldp vs45,vs29, vs4 - - xsmuldp vs46,vs9, vs4 - xsmuldp vs47,vs28, vs4 -#else - xsmaddadp vs32,vs8, vs4 - xsmaddadp vs33,vs27, vs4 - - xsmaddadp vs34,vs7, vs4 - xsmaddadp vs35,vs26, vs4 - - xsmaddadp vs36,vs6, vs4 - xsmaddadp vs37,vs25, vs4 - - xsmaddadp vs38,vs5, vs4 - xsmaddadp vs39,vs24, vs4 - - xsmaddadp vs40,vs12, vs4 - xsmaddadp vs41,vs31, vs4 - - xsmaddadp vs42,vs11, vs4 - xsmaddadp vs43,vs30, vs4 - - xsmaddadp vs44,vs10, vs4 - xsmaddadp vs45,vs29, vs4 - - xsmaddadp vs46,vs9, vs4 - xsmaddadp vs47,vs28, vs4 -#endif - - stxssp v0,0(CO) - stxssp v1,4(CO) - - stxssp v2,0(T1) - stxssp v3,4(T1) - - stxssp v4,0(T2) - stxssp v5,4(T2) - - stxssp v6,0(T3) - stxssp v7,4(T3) - - stxssp v8,0(T4) - stxssp v9,4(T4) - - stxssp v10,0(T5) - stxssp v11,4(T5) - - stxssp v12,0(T6) - stxssp v13,4(T6) - - stxssp v14,0(T7) - stxssp v15,4(T7) - - - addi CO,CO,8 -.endm - - -/********************************************************************************************** -* Macros for N=8 and M=1 -**********************************************************************************************/ -.macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro Zero8x1 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 -.endm - -.macro KERNEL8x1 - KERNEL8x1_1 AO,BO, 0 -.endm - -.macro KERNEL8x1_2 - KERNEL8x1_2_1 AO,BO, 0 -.endm - -.macro KERNEL8x1_1 AREG,BREG,First - lxvwsx vs8, 0, \AREG - lxv vs26, 0(\BREG) - lxv vs27, 16(\BREG) -.if \First==1 - xvmulsp vs0, vs26, 
vs8 - xvmulsp vs1, vs27, vs8 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - .endif - addi \AREG, \AREG, 4 - addi \BREG, \BREG, 32 -.endm - -.macro KERNEL8x1_2_1 AREG,BREG,First - lxsd v4, 0(\AREG) - lxv vs26, 0(\BREG) - lxv vs27, 16(\BREG) - lxv vs28, 32(\BREG) - lxv vs29, 48(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs0, vs28, vs9 - xvmulsp vs1, vs29, vs9 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs0, vs28, vs9 - xvmaddasp vs1, vs29, vs9 - .endif - addi \AREG, \AREG, 8 - addi \BREG, \BREG, 64 -.endm - -.macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - xxspltw vs8, vs4, 3 - xxspltw vs9, vs4, 2 - xxspltw vs10, vs4, 1 - xxspltw vs11, vs4, 0 - lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG) - lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG) - lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG) - lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG) - lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG) - lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs0, vs28, vs9 - xvmulsp vs1, vs29, vs9 - xvmulsp vs0, vs30, vs10 - xvmulsp vs1, vs31, vs10 - xvmulsp vs0, vs32, vs11 - xvmulsp vs1, vs33, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs0, vs28, vs9 - xvmaddasp vs1, vs29, vs9 - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs1, vs31, vs10 - xvmaddasp vs0, vs32, vs11 - xvmaddasp vs1, vs33, vs11 - .endif -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP32(\Index,128) -.endif -.endm - -.macro SAVE8x1 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - add T4, T2, T10 - add T5, T3, T10 - add T6, T4, T10 - add T7, T5, T10 - /*convert alpha_r for multiply*/ - xscvspdp 
vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v2,0(T1) - lxssp v4,0(T2) - lxssp v6,0(T3) - lxssp v8,0(T4) - lxssp v10,0(T5) - lxssp v12,0(T6) - lxssp v14,0(T7) -#endif - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - xscvspdp vs28, vs1 - xxspltw vs29, vs1, 1 - xxspltw vs30, vs1, 2 - xxspltw vs31, vs1, 3 - xscvspdp vs29,vs29 - xscvspdp vs30,vs30 - xscvspdp vs31,vs31 -#if defined(TRMMKERNEL) - xsmuldp vs32,vs27, vs4 - xsmuldp vs34,vs26, vs4 - xsmuldp vs36,vs25, vs4 - xsmuldp vs38,vs24, vs4 - xsmuldp vs40,vs31, vs4 - xsmuldp vs42,vs30, vs4 - xsmuldp vs44,vs29, vs4 - xsmuldp vs46,vs28, vs4 -#else - xsmaddadp vs32,vs27, vs4 - xsmaddadp vs34,vs26, vs4 - xsmaddadp vs36,vs25, vs4 - xsmaddadp vs38,vs24, vs4 - xsmaddadp vs40,vs31, vs4 - xsmaddadp vs42,vs30, vs4 - xsmaddadp vs44,vs29, vs4 - xsmaddadp vs46,vs28, vs4 -#endif - stxssp v0,0(CO) - stxssp v2,0(T1) - stxssp v4,0(T2) - stxssp v6,0(T3) - stxssp v8,0(T4) - stxssp v10,0(T5) - stxssp v12,0(T6) - stxssp v14,0(T7) - addi CO,CO,4 -.endm - - - -/********************************************************************************************** -* Macros for N=4 and M=16 -**********************************************************************************************/ - -.macro LOAD4x16_1 - LOAD4x16 1 -.endm - -.macro LOAD4x16_0 - LOAD4x16 0 -.endm - -.macro KERNEL4x16_L1_L4 Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, 
Index,IsLast - KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero4X16 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 -.endm - -.macro LOAD4x16 Zero - - lxv vs24, 0(BO) - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - -.endif -.endm - -.macro END4x16_NORMAL - END4x16 0, AO, BO, 64,16 -.endm - -.macro END4x16 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 
- xvmulsp vs43, vs3,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - -.endif -.endm - -.macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) - - lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - - - lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) - - lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) - lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - - xxpermdi vs27, vs26, vs26,2 - - 
xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - - lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) - - lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) - lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) - lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) - lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - - -.if \Complete==0 - lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) - - lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) - lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) - lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) - lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) - addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP64(\Index,256) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp 
vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - - -.endm - -.macro KERNEL4x16 First - - LOAD4x16 0 - END4x16 \First, AO, BO, 64,16 -.endm - -.macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 -.endif - - xxpermdi vs11, vs10, vs10,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 - xvmulsp vs43, vs3,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - - -.else - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - -.endif -.if \Complete==0 - lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) - lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) - lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) - -.else - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, 
\AREG, DISP32(\Index,128) -.endif -.endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - xvmulsp vs34, vs6,vs8 - xvmulsp vs35, vs7,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - xvmulsp vs38, vs6,vs9 - xvmulsp vs39, vs7,vs9 -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - xvmulsp vs42, vs6,vs10 - xvmulsp vs43, vs7,vs10 - - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - xvmulsp vs46, vs6,vs11 - xvmulsp vs47, vs7,vs11 - - - -.else - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - - -.endif - -.endm - - -.macro SAVE4x16 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 - - xxmrglw vs16, vs34, vs46 - xxmrglw vs18, vs38, vs42 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxmrghw vs4, vs38, vs42 - xxmrghw vs5, vs34, vs46 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxmrglw vs24, vs35, vs47 - xxmrglw vs26, vs39, vs43 - - xxlor vs17, vs16, vs16 - xxlor vs19, vs18, vs18 - - xxmrghw vs30, vs39, vs43 - xxmrghw vs31, vs35, vs47 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - -#ifndef TRMMKERNEL - lxv vs32, 0(CO) - lxv vs33, 16(CO) - lxv vs34, 32(CO) - lxv vs35, 48(CO) -#endif - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, 
vs26 - -#ifndef TRMMKERNEL - lxv vs36, 0(T1) - lxv vs37, 16(T1) - lxv vs38, 32(T1) - lxv vs39, 48(T1) -#endif -#ifndef TRMMKERNEL - lxv vs40, 0(T2) - lxv vs41, 16(T2) - lxv vs42, 32(T2) - lxv vs43, 48(T2) -#endif -#ifndef TRMMKERNEL - lxv vs44, 0(T3) - lxv vs45, 16(T3) - lxv vs46, 32(T3) - lxv vs47, 48(T3) -#endif - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 - - xxperm vs17, vs4, save_permute_2 - xxperm vs19, vs5, save_permute_2 - - xxperm vs24, vs30, save_permute_1 - xxperm vs26, vs31, save_permute_1 - - xxperm vs25, vs30, save_permute_2 - xxperm vs27, vs31, save_permute_2 - - - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r - xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r - xvmulsp vs38, vs17, alpha_r - xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r - xvmaddasp vs38, vs17, alpha_r - xvmaddasp vs39, vs25, alpha_r -#endif - - - -#ifdef TRMMKERNEL - xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - - xvmaddasp vs40, vs10, alpha_r - xvmaddasp vs41, vs14, alpha_r - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r - -#endif - - stxv vs32, 0(CO) - stxv vs33, 16(CO) - stxv vs34, 32(CO) - stxv vs35, 48(CO) - - stxv vs36, 0(T1) - stxv vs37, 16(T1) - stxv vs38, 32(T1) - 
stxv vs39, 48(T1) - - stxv vs40, 0(T2) - stxv vs41, 16(T2) - stxv vs42, 32(T2) - stxv vs43, 48(T2) - stxv vs44, 0(T3) - stxv vs45, 16(T3) - stxv vs46, 32(T3) - stxv vs47, 48(T3) - - addi CO,CO,64 - - -.endm - - - -/********************************************************************************************** -* Macros for N=4 and M=8 -**********************************************************************************************/ - -.macro LOAD4x8_1 - LOAD4x8 1 -.endm - -.macro LOAD4x8_0 - LOAD4x8 0 -.endm - -.macro KERNEL4x8_L1_L4 Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro END4x8_NORMAL - END4x8 0, AO, BO, 32,16 -.endm - -.macro Zero4X8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - -.endm - -.macro LOAD4x8 Zero - - lxv vs24, 0(BO) - lxv vs0, 0(AO) - lxv vs1, 16(AO) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - - xxpermdi vs27, vs26, vs26,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - 
xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - -.endif -.endm - - -.macro END4x8 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - -.endif -.endm - -.macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) - - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - - - lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) - - lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - - xxpermdi vs27, vs26, vs26,2 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - - - lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) - - lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, 
vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - - -.if \Complete==0 - lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) - - lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP32(\Index,128) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - - -.endm - -.macro KERNEL4x8 First - - LOAD4x8 0 - END4x8 \First, AO, BO, 32,16 -.endm - -.macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - -.endif - - xxpermdi vs11, vs10, vs10,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - -.else - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - -.endif -.if \Complete==0 - lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) - - lxv vs0, 
DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) - addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) - -.else - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP16(\Index,64) -.endif -.endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - -.else - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - -.endif - -.endm - - -.macro SAVE4x8 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - - -#ifndef TRMMKERNEL - lxv vs34, 0(CO) - lxv vs35, 16(CO) - lxv vs38, 0(T1) - lxv vs39, 16(T1) - lxv vs42, 0(T2) - lxv vs43, 16(T2) - lxv vs46, 0(T3) - lxv vs47, 16(T3) - - -#endif - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs34, vs8, alpha_r - xvmulsp vs35, vs12, alpha_r - xvmulsp vs38, vs9, alpha_r - xvmulsp vs39, vs13, 
alpha_r - xvmulsp vs42, vs10, alpha_r - xvmulsp vs43, vs14, alpha_r - xvmulsp vs46, vs11, alpha_r - xvmulsp vs47, vs15, alpha_r -#else - xvmaddasp vs34, vs8, alpha_r - xvmaddasp vs35, vs12, alpha_r - xvmaddasp vs38, vs9, alpha_r - xvmaddasp vs39, vs13, alpha_r - xvmaddasp vs42, vs10, alpha_r - xvmaddasp vs43, vs14, alpha_r - xvmaddasp vs46, vs11, alpha_r - xvmaddasp vs47, vs15, alpha_r -#endif - - - stxv vs34, 0(CO) - stxv vs35, 16(CO) - stxv vs38, 0(T1) - stxv vs39, 16(T1) - stxv vs42, 0(T2) - stxv vs43, 16(T2) - stxv vs46, 0(T3) - stxv vs47, 16(T3) - - - addi CO,CO,32 - -.endm - - -/********************************************************************************************** -* Macros for N=4 and M=4 -**********************************************************************************************/ - -.macro LOAD4x4_1 - LOAD4x4 1 -.endm - -.macro LOAD4x4_0 - LOAD4x4 0 -.endm - -.macro KERNEL4x4_L1_L4 Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero4X4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endm - -.macro LOAD4x4 Zero - - lxv vs0, 0(AO) - lxv vs24, 0(BO) - - - - xxperm vs2, vs0, 
permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endif -.endm - -.macro END4x4_NORMAL - END4x4 0, AO, BO, 16,16 -.endm - -.macro END4x4 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - -.endif -.endm - -.macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - - lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - - - lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - -.if \Complete==0 - - lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) - lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - - - 
-.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) - addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) - -.else - addi \AREG, \AREG, DISP16(\Index,64) - addi \BREG, \BREG, DISP16(\Index,64) - -.endif -.endif - - -.endm - -.macro KERNEL4x4 First - LOAD4x4 0 - END4x4 \First, AO, BO, 16,16 -.endm - -.macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 - -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - -.endif - -.if \Complete==0 - - lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - -.if \First==1 - xvmulsp vs32, vs26, vs4 - xvmulsp vs33, vs26, vs5 - xvmulsp vs34, vs26, vs6 - xvmulsp vs35, vs26, vs7 - - -.else - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - -.endif - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) - -.else - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP8(\Index,32) - -.endif -.endif - - -.endm - - -.macro SAVE4x4 - slwi T10, LDC , 1 - add T1, CO, LDC -#if !defined(TRMMKERNEL) - lxv vs36, 0(CO) - lxv vs37, 0(T1) -#endif - add T2, CO, T10 - add T3, T1, T10 -#if !defined(TRMMKERNEL) - lxv vs38, 0(T2) - lxv vs39, 0(T3) -#endif - - xxmrglw vs0, vs35,vs32 - xxmrglw vs1, vs34,vs33 - xxmrglw vs4, vs32,vs35 - xxmrglw vs5, vs33,vs34 - - - xxmrghw vs2, vs35,vs32 - xxmrghw vs3, vs34,vs33 - xxmrghw vs6, vs32,vs35 - xxmrghw vs7, vs33,vs34 - - xxmrgld 
vs24, vs1, vs0 - xxmrghd vs25,vs5,vs4 - - xxmrgld vs26, vs2, vs3 - xxmrghd vs27,vs6,vs7 - - #if defined(TRMMKERNEL) - xvmulsp vs36, vs24, alpha_r - xvmulsp vs37, vs25, alpha_r - xvmulsp vs38, vs26, alpha_r - xvmulsp vs39, vs27, alpha_r -#else - xvmaddasp vs36, vs24, alpha_r - xvmaddasp vs37, vs25, alpha_r - xvmaddasp vs38, vs26, alpha_r - xvmaddasp vs39, vs27, alpha_r - #endif - stxv vs36, 0(CO) - stxv vs37, 0(T1) - stxv vs38, 0(T2) - stxv vs39, 0(T3) - - - - addi CO,CO,16 -.endm - - -/********************************************************************************************** -* Macros for N=4 and M=2 -**********************************************************************************************/ - - -.macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - - -.macro Zero4x2 - xxlxor vs0, vs0, vs0 - xxlxor vs2, vs2, vs2 - -.endm - -.macro KERNEL4x2 - KERNEL4x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 0 - xxspltw vs9, vs36, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs2, vs26, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs2, vs26, vs9 - - .endif - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP4(\Index,16) - -.endm - -.macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG) - xxspltw vs8, vs4, 2 - xxspltw vs9, vs4, 3 - xxspltw vs10, vs4, 0 - xxspltw vs11, vs4, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs2, vs26, vs9 - - xvmulsp vs0, vs28, vs10 - xvmulsp vs2, vs28, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs2, vs26, vs9 - - xvmaddasp vs0, vs28, vs10 - xvmaddasp vs2, vs28, vs11 - .endif - - -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi 
\BREG, \BREG, DISP8(\Index,32) -.endif - -.endm - - -.macro SAVE4x2 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v1,4(CO) - - lxssp v2,0(T1) - lxssp v3,4(T1) - - lxssp v4,0(T2) - lxssp v5,4(T2) - - lxssp v6,0(T3) - lxssp v7,4(T3) - - -#endif - xscvspdp vs5, vs2 - xxspltw vs6, vs2, 1 - xxspltw vs7, vs2, 2 - xxspltw vs8, vs2, 3 - xscvspdp vs6,vs6 - xscvspdp vs7,vs7 - xscvspdp vs8,vs8 - - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - - -#if defined(TRMMKERNEL) - xsmuldp vs32,vs8, vs4 - xsmuldp vs33,vs27, vs4 - - xsmuldp vs34,vs7, vs4 - xsmuldp vs35,vs26, vs4 - - xsmuldp vs36,vs6, vs4 - xsmuldp vs37,vs25, vs4 - - xsmuldp vs38,vs5, vs4 - xsmuldp vs39,vs24, vs4 - - -#else - xsmaddadp vs32,vs8, vs4 - xsmaddadp vs33,vs27, vs4 - - xsmaddadp vs34,vs7, vs4 - xsmaddadp vs35,vs26, vs4 - - xsmaddadp vs36,vs6, vs4 - xsmaddadp vs37,vs25, vs4 - - xsmaddadp vs38,vs5, vs4 - xsmaddadp vs39,vs24, vs4 - - -#endif - - stxssp v0,0(CO) - stxssp v1,4(CO) - - stxssp v2,0(T1) - stxssp v3,4(T1) - - stxssp v4,0(T2) - stxssp v5,4(T2) - - stxssp v6,0(T3) - stxssp v7,4(T3) - - - - - addi CO,CO,8 -.endm - - -/********************************************************************************************** -* Macros for N=4 and M=1 -**********************************************************************************************/ -.macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro Zero4x1 - xxlxor vs0, vs0, vs0 -.endm - -.macro KERNEL4x1 - KERNEL4x1_1 AO,BO, 0 -.endm - -.macro KERNEL4x1_2 - KERNEL4x1_2_1 AO,BO, 0 -.endm - -.macro KERNEL4x1_1 AREG,BREG,First - lxvwsx vs8, 0, \AREG - lxv vs26, 0(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 -.else - 
xvmaddasp vs0, vs26, vs8 - .endif - addi \AREG, \AREG, 4 - addi \BREG, \BREG, 16 -.endm - -.macro KERNEL4x1_2_1 AREG,BREG,First - lxsd v4, 0(\AREG) - lxv vs26, 0(\BREG) - lxv vs28, 16(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs0, vs28, vs9 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs0, vs28, vs9 - .endif - addi \AREG, \AREG, 8 - addi \BREG, \BREG, 32 -.endm - -.macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - xxspltw vs8, vs4, 3 - xxspltw vs9, vs4, 2 - xxspltw vs10, vs4, 1 - xxspltw vs11, vs4, 0 - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG) - lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs0, vs28, vs9 - xvmulsp vs0, vs30, vs10 - xvmulsp vs0, vs32, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs0, vs28, vs9 - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs0, vs32, vs11 - .endif -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP16(\Index,64) -.endif -.endm - -.macro SAVE4x1 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v2,0(T1) - lxssp v4,0(T2) - lxssp v6,0(T3) -#endif - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - -#if defined(TRMMKERNEL) - xsmuldp vs32,vs27, vs4 - xsmuldp vs34,vs26, vs4 - xsmuldp vs36,vs25, vs4 - xsmuldp vs38,vs24, vs4 -#else - xsmaddadp vs32,vs27, vs4 - xsmaddadp vs34,vs26, vs4 - xsmaddadp vs36,vs25, vs4 - xsmaddadp vs38,vs24, vs4 -#endif - stxssp v0,0(CO) - stxssp v2,0(T1) - stxssp v4,0(T2) - stxssp v6,0(T3) - addi CO,CO,4 -.endm - -/****************************N=2 
section*****************/ - -.macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero2x16 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 - xxlxor vs4, vs4, vs4 - xxlxor vs5, vs5, vs5 - xxlxor vs6, vs6, vs6 - xxlxor vs7, vs7, vs7 -.endm - -.macro KERNEL2x16 - KERNEL2x16_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs28, vs8 - xvmulsp vs3, vs29, vs8 - - xvmulsp vs4, vs26, vs9 - xvmulsp vs5, vs27, vs9 - xvmulsp vs6, vs28, vs9 - xvmulsp vs7, vs29, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - xvmaddasp vs6, vs28, vs9 - xvmaddasp vs7, vs29, vs9 - - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP16(\Index,64) - -.endm - - - - -.macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) - - lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) - - lxv vs30, DISP64(\Index,128+ 
0+\OffsetA)(\AREG) - lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) - lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) - lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) - - lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) - lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) - lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) - lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - xxspltw vs12, vs39, 3 - xxspltw vs13, vs39, 2 - xxspltw vs14, vs39, 1 - xxspltw vs15, vs39, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - xvmaddasp vs6, vs28, vs9 - xvmaddasp vs7, vs29, vs9 - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - xvmaddasp vs2, vs18, vs10 - xvmaddasp vs3, vs19, vs10 - - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - xvmaddasp vs6, vs18, vs11 - xvmaddasp vs7, vs19, vs11 - - xvmaddasp vs0, vs30, vs12 - xvmaddasp vs1, vs31, vs12 - xvmaddasp vs2, vs32, vs12 - xvmaddasp vs3, vs33, vs12 - - xvmaddasp vs4, vs30, vs13 - xvmaddasp vs5, vs31, vs13 - xvmaddasp vs6, vs32, vs13 - xvmaddasp vs7, vs33, vs13 - - xvmaddasp vs0, vs34, vs14 - xvmaddasp vs1, vs35, vs14 - xvmaddasp vs2, vs36, vs14 - xvmaddasp vs3, vs37, vs14 - - xvmaddasp vs4, vs34, vs15 - xvmaddasp vs5, vs35, vs15 - xvmaddasp vs6, vs36, vs15 - xvmaddasp vs7, vs37, vs15 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP64(\Index,256) -.endif - -.endm - -.macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 3 - xxspltw vs9, vs36, 2 - xxspltw vs10, vs36, 1 - xxspltw vs11, vs36, 0 - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) 
- lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - xvmaddasp vs6, vs28, vs9 - xvmaddasp vs7, vs29, vs9 - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - xvmaddasp vs2, vs18, vs10 - xvmaddasp vs3, vs19, vs10 - - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - xvmaddasp vs6, vs18, vs11 - xvmaddasp vs7, vs19, vs11 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - - -.macro SAVE2x16 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) - lxv vs18, 32(CO) - lxv vs19, 48(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxv vs26, 0(T1) - lxv vs27, 16(T1) - lxv vs28, 32(T1) - lxv vs29, 48(T1) -#endif - -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r - xvmulsp vs18, vs2, alpha_r - xvmulsp vs19, vs3, alpha_r - xvmulsp vs26, vs4, alpha_r - xvmulsp vs27, vs5, alpha_r - xvmulsp vs28, vs6, alpha_r - xvmulsp vs29, vs7, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r - xvmaddasp vs18, vs2, alpha_r - xvmaddasp vs19, vs3, alpha_r - xvmaddasp vs26, vs4, alpha_r - xvmaddasp vs27, vs5, alpha_r - xvmaddasp vs28, vs6, alpha_r - xvmaddasp vs29, vs7, alpha_r -#endif - stxv vs16, 0(CO) - stxv vs17, 16(CO) - stxv vs18, 32(CO) - stxv vs19, 48(CO) - - stxv vs26, 0(T1) - stxv vs27, 16(T1) - stxv vs28, 32(T1) - stxv vs29, 48(T1) - - addi CO,CO,64 - -.endm - -/* M=8 N=2 */ - -.macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero2x8 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - - xxlxor vs4, vs4, vs4 - xxlxor vs5, vs5, vs5 - -.endm - -.macro KERNEL2x8 - 
KERNEL2x8_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - - xvmulsp vs4, vs26, vs9 - xvmulsp vs5, vs27, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP8(\Index,32) - -.endm - - - - -.macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) - - lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) - - lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - - lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG) - lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - xxspltw vs12, vs39, 3 - xxspltw vs13, vs39, 2 - xxspltw vs14, vs39, 1 - xxspltw vs15, vs39, 0 - - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - - - xvmaddasp vs0, vs30, vs12 - xvmaddasp vs1, vs31, vs12 - xvmaddasp vs4, vs30, vs13 - xvmaddasp vs5, vs31, vs13 - - xvmaddasp vs0, vs34, vs14 - xvmaddasp vs1, vs35, vs14 - xvmaddasp vs4, vs34, vs15 - xvmaddasp vs5, vs35, vs15 - - - -.if \IsLast==1 - addi \BREG, \BREG, 
DISP8(\Index,32) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - -.macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 3 - xxspltw vs9, vs36, 2 - xxspltw vs10, vs36, 1 - xxspltw vs11, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs17, DISP16(\Index,48+\OffsetA)(\AREG) - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - - -.macro SAVE2x8 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxv vs26, 0(T1) - lxv vs27, 16(T1) - -#endif - -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r - xvmulsp vs26, vs4, alpha_r - xvmulsp vs27, vs5, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r - xvmaddasp vs26, vs4, alpha_r - xvmaddasp vs27, vs5, alpha_r -#endif - - stxv vs16, 0(CO) - stxv vs17, 16(CO) - - - stxv vs26, 0(T1) - stxv vs27, 16(T1) - - addi CO,CO,32 - -.endm - - -/*M=4*/ - - -.macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - /* we will aggregate on save vs0 +vs4 vs11+vs5 */ -.macro Zero2x4 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - - xxlxor vs4, vs4, vs4 - xxlxor vs5, vs5, vs5 - -.endm - -.macro KERNEL2x4 - KERNEL2x4_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - 
lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs26, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP4(\Index,16) - -.endm - - - - -.macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG) - - lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG) - - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - xxspltw vs12, vs39, 3 - xxspltw vs13, vs39, 2 - xxspltw vs14, vs39, 1 - xxspltw vs15, vs39, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - xvmaddasp vs4, vs16, vs10 - xvmaddasp vs5, vs16, vs11 - - - xvmaddasp vs0, vs30, vs12 - xvmaddasp vs1, vs30, vs13 - xvmaddasp vs4, vs34, vs14 - xvmaddasp vs5, vs34, vs15 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - -.macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 3 - xxspltw vs9, vs36, 2 - xxspltw vs10, vs36, 1 - xxspltw vs11, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG) - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - xvmaddasp vs4, vs16, vs10 - xvmaddasp vs5, vs16, vs11 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP8(\Index,32) -.endif - -.endm - - -.macro SAVE2x4 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxv vs26, 0(T1) - -#endif - /*aggregate vectors*/ - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs26, vs1, alpha_r -#else - xvmaddasp 
vs16, vs0, alpha_r - xvmaddasp vs26, vs1, alpha_r -#endif - - stxv vs16, 0(CO) - stxv vs26, 0(T1) - - addi CO,CO,16 - -.endm - - -/* M=2 N=2 we will have inner pemrute action before permute was revrsing 3,2,1,0 not iw 2ill inner reverse 1,0,3,2 */ -.macro SWITCH_PERMUTE_INNER - xxpermdi permute_mask, permute_mask, permute_mask,2 -.endm - -.macro Zero2x2 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - SWITCH_PERMUTE_INNER -.endm - -.macro KERNEL2x2 - KERNEL2x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxperm vs9, vs36, permute_mask - lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs37, vs36 - xvmulsp vs1, vs37, vs9 - -.else - xvmaddasp vs0, vs37, vs36 - xvmaddasp vs1, vs37, vs9 - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP2(\Index,8) - -.endm - - - - -.macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG) - - - xxperm vs9, vs8, permute_mask - xxperm vs11, vs10, permute_mask - - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs16, vs11 - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP8(\Index,32) -.endif - -.endm - -.macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG) - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - - xxperm vs9, vs8, permute_mask - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi 
\AREG, \AREG, DISP4(\Index,16) -.endif -.endm - - -.macro SAVE2x2 - -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxsd v5 , 0(T1) - -#endif - /*aggregate vectors*/ - xxpermdi vs4,vs0,vs0,2 - xxpermdi vs5,vs1,vs1,2 - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 - /* */ - /* lets correct the order to 00 10 and 10 ,11 from {00,11} {01,10} */ - xxperm vs1,vs1, permute_mask - - - xxmrghw vs2 ,vs1,vs0 - xxpermdi vs2,vs2,vs2,2 - xxmrghw vs3 ,vs0,vs1 -#if defined(TRMMKERNEL) - xvmulsp vs36, vs2, alpha_r - xvmulsp vs37, vs3, alpha_r -#else - xvmaddasp vs36, vs2, alpha_r - xvmaddasp vs37, vs3, alpha_r -#endif - /**** store last two words*/ - - - stxsd v4, 0(CO) - stxsd v5, 0(T1) - - addi CO,CO,8 - -.endm - -/*--------------------------- M=1 N=2 */ -.macro Zero2x1 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2,vs2,vs2 - xxlxor vs3,vs3,vs3 -.endm - -.macro KERNEL2x1 - KERNEL2x1_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - /* - we will calculate 1 alone then will add it to batched ones - */ -.macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG) - lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG) - lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs2, vs37, vs35 - xvmulsp vs3, vs37, vs36 - -.else - xsmaddadp vs2, vs37, vs35 - xsmaddadp vs3, vs37, vs36 - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP1(\Index,4) - -.endm - - - - -.macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - xxmrglw vs5, vs26,vs26 - xxmrghw vs6, vs26,vs26 - - xvmaddasp vs0, vs8, vs5 - xvmaddasp vs1, vs10, vs6 - - 
-.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP4(\Index,16) -.endif - -.endm - -.macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG) - lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG) - lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG) - lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG) - lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG) - lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG) - - - xsmaddadp vs2, vs37, vs35 - xsmaddadp vs3, vs37, vs36 - - xsmaddadp vs2, vs38, vs39 - xsmaddadp vs3, vs38, vs40 - - - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP2(\Index,8) -.endm - - -.macro SAVE2x1 - -#ifndef TRMMKERNEL - lxssp v4 , 0(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxssp v5 , 0(T1) - -#endif - - /*convert alpha_r for multiply*/ - xscvspdp vs16,alpha_r - - /*aggregate vectors 2x2_4 */ - xxpermdi vs4,vs0,vs0,2 - xxpermdi vs5,vs1,vs1,2 - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 - xvaddsp vs0,vs0,vs1 -/*aggregate vectors 2x1_2 and 2x1_1 into 2x2_4*/ - xscvspdp vs5, vs0 - xxspltw vs6, vs0, 1 - xscvspdp vs6,vs6 - xsadddp vs2,vs2,vs6 - xsadddp vs3,vs3,vs5 - - /**** store last two words*/ -#if defined(TRMMKERNEL) - xsmuldp vs36,vs2, vs16 - xsmuldp vs37,vs3, vs16 - -#else - xsmaddadp vs36,vs2, vs16 - xsmaddadp vs37,vs3, vs16 -#endif - - stxssp v4, 0(CO) - stxssp v5, 0(T1) - - addi CO,CO,4 - -.endm - - - -/****************************N=1 section*****************/ - -.macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero1x16 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 -.endm - -.macro KERNEL1x16 - KERNEL1x16_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) - xscvdpspn 
vs36,vs36 - xxspltw vs8, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs28, vs8 - xvmulsp vs3, vs29, vs8 - - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - .endif - - addi \BREG, \BREG, DISP1(\Index,4) - addi \AREG, \AREG, DISP16(\Index,64) - -.endm - - - - -.macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) - - lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) - - lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - - lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) - lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) - lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) - lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) - - lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) - lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) - lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) - lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) - - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - - xvmaddasp vs0, vs16, vs9 - xvmaddasp vs1, vs17, vs9 - xvmaddasp vs2, vs18, vs9 - xvmaddasp vs3, vs19, vs9 - - - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs1, vs31, vs10 - xvmaddasp vs2, vs32, vs10 - xvmaddasp vs3, vs33, vs10 - - - xvmaddasp vs0, vs34, vs11 - xvmaddasp vs1, vs35, vs11 - xvmaddasp 
vs2, vs36, vs11 - xvmaddasp vs3, vs37, vs11 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP64(\Index,256) -.endif - -.endm - -.macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) - lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - - xvmaddasp vs0, vs16, vs9 - xvmaddasp vs1, vs17, vs9 - xvmaddasp vs2, vs18, vs9 - xvmaddasp vs3, vs19, vs9 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - - -.macro SAVE1x16 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) - lxv vs18, 32(CO) - lxv vs19, 48(CO) -#endif - - -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r - xvmulsp vs18, vs2, alpha_r - xvmulsp vs19, vs3, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r - xvmaddasp vs18, vs2, alpha_r - xvmaddasp vs19, vs3, alpha_r -#endif - stxv vs16, 0(CO) - stxv vs17, 16(CO) - stxv vs18, 32(CO) - stxv vs19, 48(CO) - - addi CO,CO,64 - -.endm - -/* M=8 N=1 */ - -.macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero1x8 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 -.endm - -.macro KERNEL1x8 - KERNEL1x8_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro 
KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) - xscvdpspn vs36,vs36 - xxspltw vs8, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - .endif - - addi \BREG, \BREG, DISP1(\Index,4) - addi \AREG, \AREG, DISP8(\Index,32) - -.endm - - - - -.macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) - - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) - - lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - - lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - - lxv vs34, DISP32(\Index,64+ 32+ 0+\OffsetA)(\AREG) - lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG) - - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - - xvmaddasp vs2, vs16, vs9 - xvmaddasp vs3, vs17, vs9 - - - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs1, vs31, vs10 - - - xvmaddasp vs2, vs34, vs11 - xvmaddasp vs3, vs35, vs11 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - -.macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - - xvmaddasp vs2, vs16, vs9 - xvmaddasp vs3, vs17, vs9 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, 
DISP16(\Index,64) -.endif - -.endm - - -.macro SAVE1x8 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) -#endif - /* aggregate vs0 vs2 and vs1 vs3*/ - xvaddsp vs0,vs0,vs2 - xvaddsp vs1,vs1,vs3 -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r -#endif - stxv vs16, 0(CO) - stxv vs17, 16(CO) - - addi CO,CO,32 - -.endm -/*M=4*/ - -.macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero1x4 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 -.endm - -.macro KERNEL1x4 - KERNEL1x4_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) - xscvdpspn vs36,vs36 - xxspltw vs8, vs36, 0 - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 -.else - xvmaddasp vs0, vs26, vs8 - - .endif - - addi \BREG, \BREG, DISP1(\Index,4) - addi \AREG, \AREG, DISP4(\Index,16) - -.endm - - - - -.macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) - - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - - lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG) - - - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - - xvmaddasp vs0, vs26, vs8 - - xvmaddasp vs1, vs27, vs9 - - xvmaddasp vs2, vs30, vs10 - - - xvmaddasp vs3, vs31, vs11 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - -.macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, 
vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs9 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP8(\Index,32) -.endif - -.endm - - -.macro SAVE1x4 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) -#endif - /* aggregate */ - xvaddsp vs0,vs0,vs2 - xvaddsp vs1,vs1,vs3 - xvaddsp vs0,vs1,vs0 -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r -#endif - stxv vs16, 0(CO) - - addi CO,CO,16 - -.endm - -/* M=2 N=1*/ -.macro Zero1x2 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2,vs2,vs2 - xxlxor vs3,vs3,vs3 -.endm - -.macro KERNEL1x2 - KERNEL1x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - /* - we will calculate 1 alone then will add it to batched ones - */ -.macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG) - lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG) - lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) - - -.if \First==1 - xvmuldp vs2, vs37, vs35 - xvmuldp vs3, vs37, vs36 - -.else - xsmaddadp vs2, vs37, vs35 - xsmaddadp vs3, vs37, vs36 - .endif - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP1(\Index,4) - -.endm - - - - -.macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) - lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG) - - lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) - - xxmrglw vs5, vs26,vs26 - xxmrghw vs6, vs26,vs26 - - xvmaddasp vs0, vs8, vs5 - xvmaddasp vs1, vs10, vs6 - - -.if \IsLast==1 - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP4(\Index,16) -.endif - -.endm - -.macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - 
lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG) - lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG) - lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG) - lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG) - lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG) - lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG) - - - xsmaddadp vs2, vs37, vs35 - xsmaddadp vs3, vs37, vs36 - - xsmaddadp vs2, vs38, vs39 - xsmaddadp vs3, vs38, vs40 - - - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP2(\Index,8) -.endm - - -.macro SAVE1x2 - -#ifndef TRMMKERNEL - lxssp v4 , 0(CO) - lxssp v5 , 4(CO) - -#endif - - /*convert alpha_r for multiply*/ - xscvspdp vs16,alpha_r - - /*aggregate vectors 1x2_4 */ - xxpermdi vs4,vs0,vs0,2 - xxpermdi vs5,vs1,vs1,2 - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 - xvaddsp vs0,vs0,vs1 -/*aggregate vectors 1x1_2 and 1x1_1 into 1x2_4*/ - xscvspdp vs5, vs0 - xxspltw vs6, vs0, 1 - xscvspdp vs6,vs6 - xsadddp vs2,vs2,vs6 - xsadddp vs3,vs3,vs5 - - /**** store last two words*/ -#if defined(TRMMKERNEL) - xsmuldp vs36,vs2, vs16 - xsmuldp vs37,vs3, vs16 - -#else - xsmaddadp vs36,vs2, vs16 - xsmaddadp vs37,vs3, vs16 -#endif - - stxssp v4, 0(CO) - stxssp v5, 4(CO) - - addi CO,CO,8 - -.endm -/*///////////////// N=1 M=1 //////////////////*/ -.macro Zero1x1 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2,vs2 - xxlxor vs3,vs3,vs3 - xxlxor vs4,vs4,vs4 -.endm - -.macro KERNEL1x1 - KERNEL1x1_1 AO,BO, 1, 0,0,0 -.endm - -.macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - /* - we will calculate 1 alone ( FIRST==1 to zero vs4) - */ -.macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index - - 
- lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG) - lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) - - -.if \First==1 - xvmuldp vs4, vs37, vs35 - -.else - xsmaddadp vs4, vs37, vs35 - .endif - - addi \AREG, \AREG, DISP1(\Index,4) - addi \BREG, \BREG, DISP1(\Index,4) - -.endm - - -.macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG) - lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG) - lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG) - lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG) - lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG) - lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG) - lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG) - xvmaddasp vs0, vs8, vs26 - xvmaddasp vs1, vs9, vs16 - xvmaddasp vs2, vs10, vs17 - xvmaddasp vs3, vs11, vs18 -.if \IsLast==1 - addi \AREG, \AREG, DISP16(\Index,64) - addi \BREG, \BREG, DISP16(\Index,64) -.endif - -.endm - -.macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) - lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG) - lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG) - xvmaddasp vs0, vs8, vs26 - xvmaddasp vs1, vs9, vs16 - -.if \IsLast==1 - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP8(\Index,32) -.endif - -.endm - - -.macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG) - lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) - - xvmaddasp vs0, vs8, vs26 - - -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP4(\Index,16) -.endif - -.endm - -.macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG) - lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG) - - xvmaddasp vs0, vs36, vs37 - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP2(\Index,8) -.endm - - -.macro SAVE1x1 - -#ifndef TRMMKERNEL - lxssp v4 , 0(CO) - -#endif - - 
/*convert alpha_r for multiply*/ - xscvspdp vs16,alpha_r - - /*aggregate vectors */ - xvaddsp vs0,vs0,vs1 - xvaddsp vs2,vs2,vs3 - xvaddsp vs0,vs0,vs2 - - xxpermdi vs7,vs0,vs0,2 - xvaddsp vs0,vs0,vs7 -/*aggregate vectors 1x1_2 and 1x1_1 into 1x1_4*/ - xscvspdp vs5, vs0 - xxspltw vs6, vs0, 1 - xscvspdp vs6,vs6 - xsadddp vs7,vs5,vs6 - xsadddp vs4,vs4,vs7 - - /**** store last two words*/ -#if defined(TRMMKERNEL) - xsmuldp vs36,vs4, vs16 - -#else - xsmaddadp vs36,vs4, vs16 -#endif - - stxssp v4, 0(CO) - - addi CO,CO,4 - -.endm - - - - -/****************************TRMM POINTER REFRESH MACROSES*************************/ - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 4 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 3 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 2 - .endif -.endm - -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*16; -// ptrbb = bb + off*2; -// #endif -*/ -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+16; // number of values in A -// #else -// temp = off+2; // number of values in B -// #endif -*/ -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp 
= bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif - -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 16; // number of values in A -// #else -// temp -= 2; // number of values in B -// #endif -// ptrba += temp*16; -// ptrbb += temp*2; -// #endif - -// #ifdef LEFT -// off += 16; // number of values in A -// #endif -*/ - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - - #endif - - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define unit_size 4 +#define DISP64(ind,disp) (ind*unit_size*64+disp) +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +/********************************************************************************************** +* Macros for N=8 and M=16 +**********************************************************************************************/ + + + +.macro KERNEL8x16_L1_L4 Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro 
KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + +.macro LOAD8x16 OffsetA,OffsetB + + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endm + +.macro END8x16_NORMAL + END8x16 0, AO, BO, 64,32 +.endm + +.macro END8x16_WITHOUT_ADD + END8x16 0, AO,BO,0,0 +.endm + +.macro END8x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, 
vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + xvmulsp vs50, vs2,vs28 + xvmulsp vs51, vs3,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + xvmulsp vs54, vs2,vs29 + xvmulsp vs55, vs3,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + xvmulsp vs58, vs2,vs30 + xvmulsp vs59, vs3,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + xvmulsp vs62, vs2,vs31 + xvmulsp vs63, vs3,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + +.endif +.endm + +.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + +KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 +KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete + +.endm + +.macro KERNEL8x16 First + + LOAD8x16 0,0 + END8x16 \First, AO, BO, 
64,32 +.endm + +.macro LOAD8x16_2 + LOAD8x16_2O AO,BO, 0,0 +.endm + +.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB + lxv vs8, (\OffsetB)(\BREG) + lxv vs12, (16+\OffsetB)(\BREG) + lxv vs24, (32+\OffsetB)(\BREG) + lxv vs28, (32+16+\OffsetB)(\BREG) + lxv vs4, (0+\OffsetA)(\AREG) + lxv vs5, (16+\OffsetA)(\AREG) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + lxv vs6, (32+\OffsetA)(\AREG) + lxv vs7, (48+\OffsetA)(\AREG) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv vs0, (64+\OffsetA)(\AREG) + lxv vs1, (64+16+\OffsetA)(\AREG) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + lxv vs2, (64+32+\OffsetA)(\AREG) + lxv vs3, (64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + +.macro END8x16_2 + /*for load2 offset will be 128 and 64*/ + KERNEL8x16_2 AO,BO, 128,64,0 ,1,1 +.endm + + + +.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.if \Complete==0 + lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 +.if \Complete==0 + 
lxv vs8, DISP16(\Index,\OffsetB)(\BREG) + lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif + +.if \Complete==0 + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif + xvmaddasp vs46, 
vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif +.if \Complete==0 + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP16(\Index,\OffsetB) + addi \AREG, \AREG, DISP32(\Index,\OffsetA) + +.else + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) + +.endif +.endif + + +.endm + + +.macro SAVE8x16 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + add T4, T2, T10 + add T5, T3, T10 + + add T6, T4, T10 + add T7, T5, T10 + + + + /* permute to restore butterfly rank 1 updateto normal promoted one */ + /* permute 16 vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */ + /* permute 16 vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */ + /* permute 16 vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */ + /* permute 16 vs24 MEM(32+CO) vs25 MEM(32+CO+LDC) vs26 MEM(32+CO+2*LDC) vs27 MEM(32+CO+3*LDC) */ + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 +#ifndef TRMMKERNEL + lxv vs32, 0(CO) + lxv vs33, 16(CO) +#endif + xxmrglw vs16, vs34, vs46 + xxmrglw vs18, vs38, vs42 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxmrghw vs4, vs38, vs42 + xxmrghw vs5, vs34, vs46 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs35, vs47 + xxmrglw vs26, vs39, vs43 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + + xxmrghw vs30, vs39, vs43 + xxmrghw vs31, vs35, vs47 +#ifndef TRMMKERNEL + lxv vs34, 32(CO) + lxv vs35, 48(CO) +#endif + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 +#ifndef TRMMKERNEL + lxv 
vs36, 0(T1) + lxv vs37, 16(T1) +#endif + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL + lxv vs38, 32(T1) + lxv vs39, 48(T1) +#endif + + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + + + +#ifndef TRMMKERNEL + lxv vs40, 0(T2) + lxv vs41, 16(T2) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 +#ifndef TRMMKERNEL + lxv vs42, 32(T2) + lxv vs43, 48(T2) +#endif + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 +#ifndef TRMMKERNEL + lxv vs44, 0(T3) + lxv vs45, 16(T3) +#endif + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 +#ifndef TRMMKERNEL + lxv vs46, 32(T3) + lxv vs47, 48(T3) +#endif + + + + + + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r +#endif + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + + + stxv vs32, 0(CO) + stxv vs33, 16(CO) +#ifdef TRMMKERNEL + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r +#else + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r +#endif + + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + + + stxv vs34, 32(CO) + stxv vs35, 48(CO) +#ifdef TRMMKERNEL + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r +#else + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r +#endif + stxv vs36, 0(T1) + stxv vs37, 16(T1) +#ifdef TRMMKERNEL + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + stxv vs38, 32(T1) + stxv vs39, 48(T1) + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r +#else + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r +#endif + + stxv vs40, 0(T2) + stxv vs41, 16(T2) +#ifdef TRMMKERNEL + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, 
vs26, alpha_r +#else + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r +#endif + stxv vs42, 32(T2) + stxv vs43, 48(T2) +#ifdef TRMMKERNEL + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r +#else + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r +#endif + stxv vs44, 0(T3) + stxv vs45, 16(T3) +#ifdef TRMMKERNEL + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r +#endif + stxv vs46, 32(T3) + stxv vs47, 48(T3) + + /*****the same with the second 8X8 ****/ + #ifndef TRMMKERNEL + lxv vs32, 0(T4) + lxv vs33, 16(T4) +#endif + xxmrglw vs8, vs48, vs60 + xxmrglw vs10, vs52, vs56 +#ifndef TRMMKERNEL + lxv vs34, 32(T4) + lxv vs35, 48(T4) +#endif + xxmrghw vs1, vs48, vs60 + xxmrghw vs0, vs52, vs56 +#ifndef TRMMKERNEL + lxv vs36, 0(T5) + lxv vs37, 16(T5) +#endif + xxmrglw vs12, vs49, vs61 + xxmrglw vs14, vs53, vs57 +#ifndef TRMMKERNEL + lxv vs38,32(T5) + lxv vs39, 48(T5) +#endif + + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 +#ifndef TRMMKERNEL + lxv vs40, 0(T6) + lxv vs41, 16(T6) +#endif + xxmrglw vs16, vs50, vs62 + xxmrglw vs18, vs54, vs58 +#ifndef TRMMKERNEL + lxv vs42, 32(T6) + lxv vs43, 48(T6) +#endif + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + xxmrghw vs4, vs54, vs58 + xxmrghw vs5, vs50, vs62 +#ifndef TRMMKERNEL + lxv vs44, 0(T7) + lxv vs45, 16(T7) +#endif + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs51, vs63 + xxmrglw vs26, vs55, vs59 +#ifndef TRMMKERNEL + lxv vs46, 32(T7) + lxv vs47, 48(T7) +#endif + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + xxmrghw vs30, vs55, vs59 + xxmrghw vs31, vs51, vs63 + + + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, 
save_permute_2 + #ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r +#endif + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + stxv vs32, 0(T4) + stxv vs33, 16(T4) + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + +#ifdef TRMMKERNEL + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r +#else + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r +#endif + stxv vs34, 32(T4) + stxv vs35, 48(T4) + +#ifdef TRMMKERNEL + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r +#else + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r +#endif + stxv vs36, 0(T5) + stxv vs37, 16(T5) + +#ifdef TRMMKERNEL + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + + + + stxv vs38, 32(T5) + stxv vs39, 48(T5) + + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r +#else + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r +#endif + stxv vs40, 0(T6) + stxv vs41, 16(T6) +#ifdef TRMMKERNEL + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r +#else + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r +#endif + stxv vs42, 32(T6) + stxv vs43, 48(T6) +#ifdef TRMMKERNEL + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r +#else + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r +#endif + + stxv vs44, 0(T7) + stxv vs45, 16(T7) +#ifdef TRMMKERNEL + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r +#endif + + stxv vs46, 32(T7) + stxv vs47, 48(T7) + + + addi CO,CO,64 + + +.endm + + + 
+/********************************************************************************************** +* Macros for N=8 and M=8 +**********************************************************************************************/ + +.macro LOAD8x8_1 + LOAD8x8 1 +.endm + +.macro LOAD8x8_0 + LOAD8x8 0 +.endm + +.macro KERNEL8x8_L1_L4 Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END8x8_NORMAL + END8x8 0, AO, BO, 32,32 +.endm + +.macro Zero8X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + +.endm + +.macro LOAD8x8 Zero + + lxv vs24, 0(BO) + lxv vs28, 16(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.if 
\Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 +.endif +.endm + + +.macro END8x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.endm + +.macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + + xvmaddasp vs36, vs0,vs25 + 
xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) + + + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 +.if \Complete==0 + lxv vs24, 
DISP32(\Index,96+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + + +.if \Complete==0 + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP32(\Index,128) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endm + +.macro KERNEL8x8 First + + LOAD8x8 0 + END8x8 \First, AO, BO, 32,32 +.endm + +.macro KERNEL8x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 
+ +.endif + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + + xvmulsp vs48, vs4,vs12 + xvmulsp vs49, vs5,vs12 + + xvmulsp vs52, vs4,vs13 + xvmulsp vs53, vs5,vs13 + + xvmulsp vs56, vs4,vs14 + xvmulsp vs57, vs5,vs14 + + xvmulsp vs60, vs4,vs15 + xvmulsp vs61, vs5,vs15 + +.else + 
xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endif + +.endm + + +.macro SAVE8x8 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + add T4, T2, T10 + add T5, T3, T10 + + add T6, T4, T10 + add T7, T5, T10 + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + lxv vs50, 0(T4) + lxv vs51, 16(T4) + lxv vs54, 0(T5) + lxv vs55, 16(T5) + lxv vs58, 0(T6) + lxv vs59, 16(T6) + lxv vs62, 0(T7) + lxv vs63, 16(T7) +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, 
vs15, alpha_r +#endif + + + xxmrglw vs8, vs48, vs60 + xxmrglw vs10, vs52, vs56 + + xxmrghw vs1, vs48, vs60 + xxmrghw vs0, vs52, vs56 + stxv vs34, 0(CO) + stxv vs35, 16(CO) + xxmrglw vs12, vs49, vs61 + xxmrglw vs14, vs53, vs57 + stxv vs38, 0(T1) + stxv vs39, 16(T1) + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 + stxv vs42, 0(T2) + stxv vs43, 16(T2) + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + stxv vs46, 0(T3) + stxv vs47, 16(T3) + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + + + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + #ifdef TRMMKERNEL + xvmulsp vs50, vs8, alpha_r + xvmulsp vs51, vs12, alpha_r + xvmulsp vs54, vs9, alpha_r + xvmulsp vs55, vs13, alpha_r + xvmulsp vs58, vs10, alpha_r + xvmulsp vs59, vs14, alpha_r + xvmulsp vs62, vs11, alpha_r + xvmulsp vs63, vs15, alpha_r +#else + xvmaddasp vs50, vs8, alpha_r + xvmaddasp vs51, vs12, alpha_r + xvmaddasp vs54, vs9, alpha_r + xvmaddasp vs55, vs13, alpha_r + xvmaddasp vs58, vs10, alpha_r + xvmaddasp vs59, vs14, alpha_r + xvmaddasp vs62, vs11, alpha_r + xvmaddasp vs63, vs15, alpha_r +#endif + + stxv vs50, 0(T4) + stxv vs51, 16(T4) + stxv vs54, 0(T5) + stxv vs55, 16(T5) + stxv vs58, 0(T6) + stxv vs59, 16(T6) + stxv vs62, 0(T7) + stxv vs63, 16(T7) + + addi CO,CO,32 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=4 +**********************************************************************************************/ + +.macro LOAD8x4_1 + LOAD8x4 1 +.endm + +.macro LOAD8x4_0 + LOAD8x4 0 +.endm + +.macro KERNEL8x4_L1_L4 Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, 
\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + +.endm + +.macro LOAD8x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + lxv vs25, 16(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 +.endif +.endm + +.macro END8x4_NORMAL + END8x4 0, AO, BO, 16,32 +.endm + +.macro END8x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp 
vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.endif +.endm + +.macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 
+ + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + +.else + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP32(\Index,128) + +.endif +.endif + + +.endm + +.macro KERNEL8x4 First + LOAD8x4 0 + END8x4 \First, AO, BO, 16,32 +.endm + +.macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + xvmulsp vs48, vs27, vs4 + xvmulsp vs49, vs27, vs5 + xvmulsp vs50, vs27, vs6 + xvmulsp vs51, vs27, vs7 + + +.else + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + +.else + addi \AREG, 
\AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + + +.macro SAVE8x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + add T4, T2, T10 + add T5, T3, T10 +#if !defined(TRMMKERNEL) + lxv vs40, 0(T4) + lxv vs41, 0(T5) +#endif + add T6, T4, T10 + add T7, T5, T10 +#if !defined(TRMMKERNEL) + lxv vs42, 0(T6) + lxv vs43, 0(T7) +#endif + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + + xxmrglw vs0, vs51,vs48 + xxmrglw vs1, vs50,vs49 + xxmrglw vs4, vs48,vs51 + xxmrglw vs5, vs49,vs50 + + xxmrghw vs2, vs51,vs48 + xxmrghw vs3, vs50,vs49 + xxmrghw vs6, vs48,vs51 + xxmrghw vs7, vs49,vs50 + + xxmrgld vs28, vs1, vs0 + xxmrghd vs29,vs5,vs4 + + xxmrgld vs30, vs2, vs3 + xxmrghd vs31,vs6,vs7 +#if defined(TRMMKERNEL) + + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r + xvmulsp vs40, vs28, alpha_r + xvmulsp vs41, vs29, alpha_r + xvmulsp vs42, vs30, alpha_r + xvmulsp vs43, vs31, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + xvmaddasp vs40, vs28, alpha_r + xvmaddasp vs41, vs29, alpha_r + xvmaddasp vs42, vs30, alpha_r + xvmaddasp vs43, vs31, alpha_r +#endif + + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + stxv vs40, 0(T4) + stxv vs41, 0(T5) + stxv vs42, 0(T6) + stxv vs43, 0(T7) + + + addi CO,CO,16 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=2 
+**********************************************************************************************/ + + +.macro KERNEL8x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero8x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + +.endm + +.macro KERNEL8x2 + KERNEL8x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP8(\Index,32) + +.endm + +.macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + + xvmulsp vs0, vs28, vs10 + xvmulsp vs1, vs29, vs10 + xvmulsp vs2, vs28, vs11 + xvmulsp vs3, vs29, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + xvmaddasp vs0, vs28, vs10 + xvmaddasp vs1, vs29, vs10 + xvmaddasp vs2, vs28, vs11 + xvmaddasp vs3, vs29, vs11 + .endif + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE8x2 + slwi T10, LDC , 1 + add T1, CO, LDC + add 
T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v1,4(CO) + + lxssp v2,0(T1) + lxssp v3,4(T1) + + lxssp v4,0(T2) + lxssp v5,4(T2) + + lxssp v6,0(T3) + lxssp v7,4(T3) + + lxssp v8,0(T4) + lxssp v9,4(T4) + + lxssp v10,0(T5) + lxssp v11,4(T5) + + lxssp v12,0(T6) + lxssp v13,4(T6) + + lxssp v14,0(T7) + lxssp v15,4(T7) +#endif + xscvspdp vs5, vs2 + xxspltw vs6, vs2, 1 + xxspltw vs7, vs2, 2 + xxspltw vs8, vs2, 3 + xscvspdp vs6,vs6 + xscvspdp vs7,vs7 + xscvspdp vs8,vs8 + + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + + xscvspdp vs9, vs3 + xxspltw vs10, vs3, 1 + xxspltw vs11, vs3, 2 + xxspltw vs12, vs3, 3 + xscvspdp vs10,vs10 + xscvspdp vs11,vs11 + xscvspdp vs12,vs12 + + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 + + + + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs8, vs4 + xsmuldp vs33,vs27, vs4 + + xsmuldp vs34,vs7, vs4 + xsmuldp vs35,vs26, vs4 + + xsmuldp vs36,vs6, vs4 + xsmuldp vs37,vs25, vs4 + + xsmuldp vs38,vs5, vs4 + xsmuldp vs39,vs24, vs4 + + xsmuldp vs40,vs12, vs4 + xsmuldp vs41,vs31, vs4 + + xsmuldp vs42,vs11, vs4 + xsmuldp vs43,vs30, vs4 + + xsmuldp vs44,vs10, vs4 + xsmuldp vs45,vs29, vs4 + + xsmuldp vs46,vs9, vs4 + xsmuldp vs47,vs28, vs4 +#else + xsmaddadp vs32,vs8, vs4 + xsmaddadp vs33,vs27, vs4 + + xsmaddadp vs34,vs7, vs4 + xsmaddadp vs35,vs26, vs4 + + xsmaddadp vs36,vs6, vs4 + xsmaddadp vs37,vs25, vs4 + + xsmaddadp vs38,vs5, vs4 + xsmaddadp vs39,vs24, vs4 + + xsmaddadp vs40,vs12, vs4 + xsmaddadp vs41,vs31, vs4 + + xsmaddadp vs42,vs11, vs4 + xsmaddadp vs43,vs30, vs4 + + xsmaddadp vs44,vs10, vs4 + xsmaddadp vs45,vs29, vs4 + + xsmaddadp vs46,vs9, vs4 
+ xsmaddadp vs47,vs28, vs4 +#endif + + stxssp v0,0(CO) + stxssp v1,4(CO) + + stxssp v2,0(T1) + stxssp v3,4(T1) + + stxssp v4,0(T2) + stxssp v5,4(T2) + + stxssp v6,0(T3) + stxssp v7,4(T3) + + stxssp v8,0(T4) + stxssp v9,4(T4) + + stxssp v10,0(T5) + stxssp v11,4(T5) + + stxssp v12,0(T6) + stxssp v13,4(T6) + + stxssp v14,0(T7) + stxssp v15,4(T7) + + + addi CO,CO,8 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=1 +**********************************************************************************************/ +.macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro Zero8x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +.endm + +.macro KERNEL8x1 + KERNEL8x1_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_2 + KERNEL8x1_2_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_1 AREG,BREG,First + lxvwsx vs8, 0, \AREG + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + .endif + addi \AREG, \AREG, 4 + addi \BREG, \BREG, 32 +.endm + +.macro KERNEL8x1_2_1 AREG,BREG,First + lxsd v4, 0(\AREG) + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) + lxv vs28, 32(\BREG) + lxv vs29, 48(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + .endif + addi \AREG, \AREG, 8 + addi \BREG, \BREG, 64 +.endm + +.macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + xxspltw vs8, vs4, 3 + xxspltw vs9, vs4, 2 + xxspltw vs10, vs4, 1 + xxspltw vs11, vs4, 0 + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + lxv vs28, 
DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG) + lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG) + lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG) + lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG) + lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 + xvmulsp vs0, vs30, vs10 + xvmulsp vs1, vs31, vs10 + xvmulsp vs0, vs32, vs11 + xvmulsp vs1, vs33, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs0, vs32, vs11 + xvmaddasp vs1, vs33, vs11 + .endif +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP32(\Index,128) +.endif +.endm + +.macro SAVE8x1 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) + lxssp v8,0(T4) + lxssp v10,0(T5) + lxssp v12,0(T6) + lxssp v14,0(T7) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 + xsmuldp vs40,vs31, vs4 + xsmuldp vs42,vs30, vs4 + xsmuldp vs44,vs29, vs4 + xsmuldp vs46,vs28, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 + xsmaddadp vs40,vs31, vs4 + xsmaddadp vs42,vs30, vs4 + xsmaddadp 
vs44,vs29, vs4 + xsmaddadp vs46,vs28, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + stxssp v8,0(T4) + stxssp v10,0(T5) + stxssp v12,0(T6) + stxssp v14,0(T7) + addi CO,CO,4 +.endm + + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro LOAD4x16_1 + LOAD4x16 1 +.endm + +.macro LOAD4x16_0 + LOAD4x16 0 +.endm + +.macro KERNEL4x16_L1_L4 Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + +.macro LOAD4x16 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + xxperm 
vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + +.endif +.endm + +.macro END4x16_NORMAL + END4x16 0, AO, BO, 64,16 +.endm + +.macro END4x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + +.endif +.endm + +.macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + 
xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + 
xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP64(\Index,256) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endm + +.macro KERNEL4x16 First + + LOAD4x16 0 + END4x16 \First, AO, BO, 64,16 +.endm + +.macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, 
vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + xvmulsp vs34, vs6,vs8 + xvmulsp vs35, vs7,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + xvmulsp vs38, vs6,vs9 + xvmulsp vs39, vs7,vs9 +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + xvmulsp vs42, vs6,vs10 + xvmulsp vs43, vs7,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + xvmulsp vs46, vs6,vs11 + xvmulsp vs47, vs7,vs11 + + + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp 
vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endif + +.endm + + +.macro SAVE4x16 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxmrglw vs16, vs34, vs46 + xxmrglw vs18, vs38, vs42 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxmrghw vs4, vs38, vs42 + xxmrghw vs5, vs34, vs46 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs35, vs47 + xxmrglw vs26, vs39, vs43 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + + xxmrghw vs30, vs39, vs43 + xxmrghw vs31, vs35, vs47 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL + lxv vs32, 0(CO) + lxv vs33, 16(CO) + lxv vs34, 32(CO) + lxv vs35, 48(CO) +#endif + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + +#ifndef TRMMKERNEL + lxv vs36, 0(T1) + lxv vs37, 16(T1) + lxv vs38, 32(T1) + lxv vs39, 48(T1) +#endif +#ifndef TRMMKERNEL + lxv vs40, 0(T2) + lxv vs41, 16(T2) + lxv vs42, 32(T2) + lxv vs43, 48(T2) +#endif +#ifndef TRMMKERNEL + lxv vs44, 0(T3) + lxv vs45, 16(T3) + lxv vs46, 32(T3) + lxv vs47, 48(T3) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r + xvmulsp vs36, 
vs9, alpha_r + xvmulsp vs37, vs13, alpha_r + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r + +#endif + + stxv vs32, 0(CO) + stxv vs33, 16(CO) + stxv vs34, 32(CO) + stxv vs35, 48(CO) + + stxv vs36, 0(T1) + stxv vs37, 16(T1) + stxv vs38, 32(T1) + stxv vs39, 48(T1) + + stxv vs40, 0(T2) + stxv vs41, 16(T2) + stxv vs42, 32(T2) + stxv vs43, 48(T2) + stxv vs44, 0(T3) + stxv vs45, 16(T3) + stxv vs46, 32(T3) + stxv vs47, 48(T3) + + addi CO,CO,64 + + +.endm + + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + LOAD4x8 1 +.endm + +.macro LOAD4x8_0 + LOAD4x8 0 +.endm + +.macro KERNEL4x8_L1_L4 Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, 
\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END4x8_NORMAL + END4x8 0, AO, BO, 32,16 +.endm + +.macro Zero4X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endm + +.macro LOAD4x8 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endif +.endm + + +.macro END4x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.endm + +.macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, 
DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + 
xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + +.endm + +.macro KERNEL4x8 First + + LOAD4x8 0 + END4x8 \First, AO, BO, 32,16 +.endm + +.macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + +.endif + +.endm + + +.macro SAVE4x8 + + 
slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, vs15, alpha_r +#endif + + + stxv vs34, 0(CO) + stxv vs35, 16(CO) + stxv vs38, 0(T1) + stxv vs39, 16(T1) + stxv vs42, 0(T2) + stxv vs43, 16(T2) + stxv vs46, 0(T3) + stxv vs47, 16(T3) + + + addi CO,CO,32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + LOAD4x4 1 +.endm + +.macro LOAD4x4_0 + LOAD4x4 0 +.endm + +.macro KERNEL4x4_L1_L4 Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, 0,0, 
\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + +.macro LOAD4x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endif +.endm + +.macro END4x4_NORMAL + END4x4 0, AO, BO, 16,16 +.endm + +.macro END4x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.endif +.endm + +.macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp 
vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + +.else + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + +.macro KERNEL4x4 First + LOAD4x4 0 + END4x4 \First, AO, BO, 16,16 +.endm + +.macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG) + + xxperm 
vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + +.else + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + +.else + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP8(\Index,32) + +.endif +.endif + + +.endm + + +.macro SAVE4x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + #if defined(TRMMKERNEL) + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + #endif + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + + + + addi CO,CO,16 +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + + +.macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero4x2 + xxlxor vs0, vs0, vs0 + xxlxor vs2, vs2, vs2 + +.endm + +.macro KERNEL4x2 + KERNEL4x2_1 AO,BO, 0, 0,0,0 +.endm 
+.macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs2, vs26, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs2, vs26, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP4(\Index,16) + +.endm + +.macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs2, vs26, vs9 + + xvmulsp vs0, vs28, vs10 + xvmulsp vs2, vs28, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs2, vs26, vs9 + + xvmaddasp vs0, vs28, vs10 + xvmaddasp vs2, vs28, vs11 + .endif + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE4x2 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v1,4(CO) + + lxssp v2,0(T1) + lxssp v3,4(T1) + + lxssp v4,0(T2) + lxssp v5,4(T2) + + lxssp v6,0(T3) + lxssp v7,4(T3) + + +#endif + xscvspdp vs5, vs2 + xxspltw vs6, vs2, 1 + xxspltw vs7, vs2, 2 + xxspltw vs8, vs2, 3 + xscvspdp vs6,vs6 + xscvspdp vs7,vs7 + xscvspdp vs8,vs8 + + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs8, vs4 + xsmuldp vs33,vs27, vs4 + + xsmuldp vs34,vs7, vs4 + xsmuldp vs35,vs26, vs4 + + xsmuldp vs36,vs6, vs4 + xsmuldp vs37,vs25, vs4 + + xsmuldp vs38,vs5, vs4 + xsmuldp vs39,vs24, vs4 + + +#else + 
xsmaddadp vs32,vs8, vs4 + xsmaddadp vs33,vs27, vs4 + + xsmaddadp vs34,vs7, vs4 + xsmaddadp vs35,vs26, vs4 + + xsmaddadp vs36,vs6, vs4 + xsmaddadp vs37,vs25, vs4 + + xsmaddadp vs38,vs5, vs4 + xsmaddadp vs39,vs24, vs4 + + +#endif + + stxssp v0,0(CO) + stxssp v1,4(CO) + + stxssp v2,0(T1) + stxssp v3,4(T1) + + stxssp v4,0(T2) + stxssp v5,4(T2) + + stxssp v6,0(T3) + stxssp v7,4(T3) + + + + + addi CO,CO,8 +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ +.macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro Zero4x1 + xxlxor vs0, vs0, vs0 +.endm + +.macro KERNEL4x1 + KERNEL4x1_1 AO,BO, 0 +.endm + +.macro KERNEL4x1_2 + KERNEL4x1_2_1 AO,BO, 0 +.endm + +.macro KERNEL4x1_1 AREG,BREG,First + lxvwsx vs8, 0, \AREG + lxv vs26, 0(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 +.else + xvmaddasp vs0, vs26, vs8 + .endif + addi \AREG, \AREG, 4 + addi \BREG, \BREG, 16 +.endm + +.macro KERNEL4x1_2_1 AREG,BREG,First + lxsd v4, 0(\AREG) + lxv vs26, 0(\BREG) + lxv vs28, 16(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs0, vs28, vs9 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs0, vs28, vs9 + .endif + addi \AREG, \AREG, 8 + addi \BREG, \BREG, 32 +.endm + +.macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + xxspltw vs8, vs4, 3 + xxspltw vs9, vs4, 2 + xxspltw vs10, vs4, 1 + xxspltw vs11, vs4, 0 + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG) + lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs0, vs30, vs10 + xvmulsp vs0, vs32, vs11 +.else + xvmaddasp vs0, vs26, vs8 + 
xvmaddasp vs0, vs28, vs9 + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs0, vs32, vs11 + .endif +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP16(\Index,64) +.endif +.endm + +.macro SAVE4x1 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + addi CO,CO,4 +.endm + +/****************************N=2 section*****************/ + +.macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 +.endm + +.macro KERNEL2x16 + KERNEL2x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp 
vs3, vs29, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + xvmulsp vs6, vs28, vs9 + xvmulsp vs7, vs29, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, 
vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs2, vs32, vs12 + xvmaddasp vs3, vs33, vs12 + + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + xvmaddasp vs6, vs32, vs13 + xvmaddasp vs7, vs33, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs2, vs36, vs14 + xvmaddasp vs3, vs37, vs14 + + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + xvmaddasp vs6, vs36, vs15 + xvmaddasp vs7, vs37, vs15 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE2x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 16(T1) + lxv 
vs28, 32(T1) + lxv vs29, 48(T1) +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r + xvmulsp vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r + xvmulsp vs28, vs6, alpha_r + xvmulsp vs29, vs7, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r + xvmaddasp vs28, vs6, alpha_r + xvmaddasp vs29, vs7, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + stxv vs28, 32(T1) + stxv vs29, 48(T1) + + addi CO,CO,64 + +.endm + +/* M=8 N=2 */ + +.macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x8 + KERNEL2x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, 
DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,48+\OffsetA)(\AREG) + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE2x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 
16(T1) + +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r +#endif + + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + + addi CO,CO,32 + +.endm + + +/*M=4*/ + + +.macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + /* we will aggregate on save vs0 +vs4 vs11+vs5 */ +.macro Zero2x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x4 + KERNEL2x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs26, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP4(\Index,16) + +.endm + + + + +.macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG) + + lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs4, vs16, vs10 + xvmaddasp vs5, vs16, vs11 + + + xvmaddasp vs0, 
vs30, vs12 + xvmaddasp vs1, vs30, vs13 + xvmaddasp vs4, vs34, vs14 + xvmaddasp vs5, vs34, vs15 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG) + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs4, vs16, vs10 + xvmaddasp vs5, vs16, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE2x4 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + +#endif + /*aggregate vectors*/ + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs26, vs1, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs26, vs1, alpha_r +#endif + + stxv vs16, 0(CO) + stxv vs26, 0(T1) + + addi CO,CO,16 + +.endm + + +/* M=2 N=2 we will have inner pemrute action before permute was revrsing 3,2,1,0 not iw 2ill inner reverse 1,0,3,2 */ +.macro SWITCH_PERMUTE_INNER + xxpermdi permute_mask, permute_mask, permute_mask,2 +.endm + +.macro Zero2x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + SWITCH_PERMUTE_INNER +.endm + +.macro KERNEL2x2 + KERNEL2x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxperm vs9, vs36, permute_mask + lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs37, vs36 + xvmulsp vs1, 
vs37, vs9 + +.else + xvmaddasp vs0, vs37, vs36 + xvmaddasp vs1, vs37, vs9 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP2(\Index,8) + +.endm + + + + +.macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG) + + + xxperm vs9, vs8, permute_mask + xxperm vs11, vs10, permute_mask + + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs16, vs11 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + +.macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG) + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + + xxperm vs9, vs8, permute_mask + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP4(\Index,16) +.endif +.endm + + +.macro SAVE2x2 + +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) + +#endif + /*aggregate vectors*/ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + /* */ + /* lets correct the order to 00 10 and 10 ,11 from {00,11} {01,10} */ + xxperm vs1,vs1, permute_mask + + + xxmrghw vs2 ,vs1,vs0 + xxpermdi vs2,vs2,vs2,2 + xxmrghw vs3 ,vs0,vs1 +#if defined(TRMMKERNEL) + xvmulsp vs36, vs2, alpha_r + xvmulsp vs37, vs3, alpha_r +#else + xvmaddasp vs36, vs2, alpha_r + xvmaddasp vs37, vs3, alpha_r +#endif + /**** store last two words*/ + + + stxsd v4, 0(CO) + stxsd v5, 0(T1) + + addi CO,CO,8 + +.endm + +/*--------------------------- M=1 N=2 */ +.macro Zero2x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL2x1 + KERNEL2x1_1 AO,BO, 0, 0,0,0 +.endm +.macro 
KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs2, vs37, vs35 + xvmulsp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP2(\Index,8) +.endm + + +.macro SAVE2x1 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxssp v5 , 0(T1) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 2x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + 
xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 2x1_2 and 2x1_1 into 2x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 0(T1) + + addi CO,CO,4 + +.endm + + + +/****************************N=1 section*****************/ + +.macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x16 + KERNEL1x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp vs3, vs29, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 
16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs2, vs32, vs10 + xvmaddasp vs3, vs33, vs10 + + + xvmaddasp vs0, vs34, vs11 + xvmaddasp vs1, vs35, vs11 + xvmaddasp vs2, vs36, vs11 + xvmaddasp vs3, vs37, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + +.if 
\IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE1x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + addi CO,CO,64 + +.endm + +/* M=8 N=1 */ + +.macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x8 + KERNEL1x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, 
DISP32(\Index,64+ 32+ 0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + + + xvmaddasp vs2, vs34, vs11 + xvmaddasp vs3, vs35, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE1x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + /* aggregate vs0 vs2 and vs1 vs3*/ + xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + addi CO,CO,32 + +.endm +/*M=4*/ + +.macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x4 + KERNEL1x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + 
xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 +.else + xvmaddasp vs0, vs26, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP4(\Index,16) + +.endm + + + + +.macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + + xvmaddasp vs1, vs27, vs9 + + xvmaddasp vs2, vs30, vs10 + + + xvmaddasp vs3, vs31, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE1x4 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) +#endif + /* aggregate */ + xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 + xvaddsp vs0,vs1,vs0 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r +#endif + stxv vs16, 0(CO) + + addi CO,CO,16 + +.endm + +/* M=2 N=1*/ +.macro Zero1x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL1x2 + KERNEL1x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_2 
AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + + +.if \First==1 + xvmuldp vs2, vs37, vs35 + xvmuldp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG) + + lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) + + xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP2(\Index,8) +.endm + + +.macro SAVE1x2 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) + lxssp v5 , 4(CO) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 1x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 1x1_2 and 1x1_1 into 1x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two 
words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 4(CO) + + addi CO,CO,8 + +.endm +/*///////////////// N=1 M=1 //////////////////*/ +.macro Zero1x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2,vs2 + xxlxor vs3,vs3,vs3 + xxlxor vs4,vs4,vs4 +.endm + +.macro KERNEL1x1 + KERNEL1x1_1 AO,BO, 1, 0,0,0 +.endm + +.macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone ( FIRST==1 to zero vs4) + */ +.macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + + +.if \First==1 + xvmuldp vs4, vs37, vs35 + +.else + xsmaddadp vs4, vs37, vs35 + .endif + + addi \AREG, \AREG, DISP1(\Index,4) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + +.macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG) + lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG) + lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG) + lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG) + lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + xvmaddasp vs2, vs10, vs17 + xvmaddasp vs3, vs11, vs18 +.if \IsLast==1 + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + +.macro 
KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + +.if \IsLast==1 + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP8(\Index,32) +.endif + +.endm + + +.macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG) + lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) + + xvmaddasp vs0, vs8, vs26 + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG) + lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG) + + xvmaddasp vs0, vs36, vs37 + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP2(\Index,8) +.endm + + +.macro SAVE1x1 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors */ + xvaddsp vs0,vs0,vs1 + xvaddsp vs2,vs2,vs3 + xvaddsp vs0,vs0,vs2 + + xxpermdi vs7,vs0,vs0,2 + xvaddsp vs0,vs0,vs7 +/*aggregate vectors 1x1_2 and 1x1_1 into 1x1_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs7,vs5,vs6 + xsadddp vs4,vs4,vs7 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs4, vs16 + +#else + xsmaddadp vs36,vs4, vs16 +#endif + + stxssp v4, 0(CO) + + addi CO,CO,4 + +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 3 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 2 + .endif +.endm + +/* +//#if (defined(LEFT) && 
defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif + +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; 
// number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/power/sgemv_n.c b/kernel/power/sgemv_n.c index 5dfb18f5b9..f5c1ba729e 100644 --- a/kernel/power/sgemv_n.c +++ b/kernel/power/sgemv_n.c @@ -1,470 +1,470 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ -#if !defined(__VEC__) || !defined(__ALTIVEC__) -#include "../arm/gemv_n.c" - -#else - -#include "common.h" - -#define NBMAX 4096 - -static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; - FLOAT x0,x1,x2,x3,x4,x5,x6,x7; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - b0 = a0 + lda4 ; - b1 = a1 + lda4 ; - b2 = a2 + lda4 ; - b3 = a3 + lda4 ; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - x4 = xo[4] * *alpha; - x5 = xo[5] * *alpha; - x6 = xo[6] * *alpha; - x7 = xo[7] * *alpha; - __vector float* va0 = (__vector float*)a0; - __vector float* va1 = (__vector float*)a1; - __vector float* va2 = (__vector float*)a2; - __vector float* va3 = (__vector float*)a3; - __vector float* vb0 = (__vector float*)b0; - __vector float* vb1 = (__vector float*)b1; - __vector float* vb2 = (__vector float*)b2; - __vector float* vb3 = (__vector float*)b3; - - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float v_x4 = {x4,x4,x4,x4}; - __vector float v_x5 = {x5,x5,x5,x5}; - __vector float v_x6 = {x6,x6,x6,x6}; - __vector float v_x7 = {x7,x7,x7,x7}; - __vector float* v_y =(__vector float*)y; - - for ( i=0; i< n/4; i++) - { - 
register __vector float vy=v_y[i]; - vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; - v_y[i] =vy; - } - -} - -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1,x2,x3; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - __vector float* va2 = (__vector float*)ap[2]; - __vector float* va3 = (__vector float*)ap[3]; - - for ( i=0; i< n/4; i++ ) - { - register __vector float vy=v_y[i]; - vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - v_y[i] =vy; - } - -} - -static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0,x1; - x0 = x[0] * *alpha; - x1 = x[1] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - - for ( i=0; i< n/4; i++ ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; - } - -} - - -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0 ; - x0 = x[0] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap; - - for ( i=0; i< n/4; i++ ) - { - v_y[i] += v_x0 * va0[i] ; - } - -} - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - - for ( i=0; i> 3 ; - n2 = n & 7 ; - } - else - { - n1 = n >> 2 ; - n2 = n & 3 ; - - } - - m3 = m & 3 ; - m1 = 
m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*4); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - ap[3] += lda8; - a_ptr += lda8; - x_ptr += 8; - } - - - if ( n2 & 4 ) - { - sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if ( n2 & 2 ) - { - sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - - } - - if ( m3 == 0 ) return(0); - - if ( m3 == 3 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - if ( lda == 3 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + 
a_ptr[5] * x_ptr[1]; - - temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; - temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; - temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; - - a_ptr += 12; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += 3; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - return(0); - } - - - if ( m3 == 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - return(0); - } - - if ( m3 == 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - return(0); - 
} - - - return(0); -} - -#endif - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_n.c" + +#else + +#include "common.h" + +#define NBMAX 4096 + +static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; + FLOAT x0,x1,x2,x3,x4,x5,x6,x7; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + x4 = xo[4] * *alpha; + x5 = xo[5] * *alpha; + x6 = xo[6] * *alpha; + x7 = xo[7] * *alpha; + __vector float* va0 = (__vector float*)a0; + __vector float* va1 = (__vector float*)a1; + __vector float* va2 = (__vector float*)a2; + __vector float* va3 = (__vector float*)a3; + __vector float* vb0 = (__vector float*)b0; + __vector float* vb1 = (__vector float*)b1; + __vector float* vb2 = (__vector float*)b2; + __vector float* vb3 = (__vector float*)b3; + + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float v_x2 = {x2,x2,x2,x2}; + __vector float v_x3 = {x3,x3,x3,x3}; + __vector float v_x4 = {x4,x4,x4,x4}; + __vector float v_x5 = {x5,x5,x5,x5}; + __vector float v_x6 = {x6,x6,x6,x6}; + __vector float v_x7 = {x7,x7,x7,x7}; + __vector float* v_y =(__vector float*)y; + + for ( i=0; i< n/4; i++) + { + register __vector float vy=v_y[i]; + vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; + vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; + v_y[i] =vy; + } + +} + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT x0,x1,x2,x3; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float v_x2 
= {x2,x2,x2,x2}; + __vector float v_x3 = {x3,x3,x3,x3}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + __vector float* va2 = (__vector float*)ap[2]; + __vector float* va3 = (__vector float*)ap[3]; + + for ( i=0; i< n/4; i++ ) + { + register __vector float vy=v_y[i]; + vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; + v_y[i] =vy; + } + +} + +static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0,x1; + x0 = x[0] * *alpha; + x1 = x[1] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + + for ( i=0; i< n/4; i++ ) + { + v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; + } + +} + + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0 ; + x0 = x[0] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap; + + for ( i=0; i< n/4; i++ ) + { + v_y[i] += v_x0 * va0[i] ; + } + +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + + for ( i=0; i> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*4); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); + ap[0] += lda8; + ap[1] += lda8; 
+ ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + + + if ( n2 & 4 ) + { + sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr 
+= lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + +#endif + diff --git a/kernel/power/sgemv_n_8.c b/kernel/power/sgemv_n_8.c index 64696236ae..0edb79129b 100644 --- a/kernel/power/sgemv_n_8.c +++ b/kernel/power/sgemv_n_8.c @@ -1,514 +1,514 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. 
Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -/****Note*** -UnUsed kernel -This kernel works. 
But it was not competitive enough to be added in production -It could be used and tested in future or could provide barebone for switching to inline assembly -*/ - -#include "common.h" - -#define NBMAX 4096 - -static void sgemv_kernel_8x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; - FLOAT x0,x1,x2,x3,x4,x5,x6,x7; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - b0 = a0 + lda4 ; - b1 = a1 + lda4 ; - b2 = a2 + lda4 ; - b3 = a3 + lda4 ; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - x4 = xo[4] * *alpha; - x5 = xo[5] * *alpha; - x6 = xo[6] * *alpha; - x7 = xo[7] * *alpha; - __vector float* va0 = (__vector float*)a0; - __vector float* va1 = (__vector float*)a1; - __vector float* va2 = (__vector float*)a2; - __vector float* va3 = (__vector float*)a3; - __vector float* vb0 = (__vector float*)b0; - __vector float* vb1 = (__vector float*)b1; - __vector float* vb2 = (__vector float*)b2; - __vector float* vb3 = (__vector float*)b3; - - register __vector float v_x0 = {x0,x0,x0,x0}; - register __vector float v_x1 = {x1,x1,x1,x1}; - register __vector float v_x2 = {x2,x2,x2,x2}; - register __vector float v_x3 = {x3,x3,x3,x3}; - register __vector float v_x4 = {x4,x4,x4,x4}; - register __vector float v_x5 = {x5,x5,x5,x5}; - register __vector float v_x6 = {x6,x6,x6,x6}; - register __vector float v_x7 = {x7,x7,x7,x7}; - __vector float* v_y =(__vector float*)y; - - for ( i=0; i< n/4; i+=2) - { - register __vector float vy_1=v_y[i]; - register __vector float vy_2=v_y[i+1]; - register __vector float va0_1=va0[i] ; - register __vector float va0_2=va0[i+1] ; - register __vector float va1_1=va1[i] ; - register __vector float va1_2=va1[i+1] ; - register __vector float va2_1=va2[i] ; - register __vector float va2_2=va2[i+1] ; - register __vector float va3_1=va3[i] ; - register __vector float va3_2=va3[i+1] ; - register __vector float vb0_1=vb0[i] 
; - register __vector float vb0_2=vb0[i+1] ; - register __vector float vb1_1=vb1[i] ; - register __vector float vb1_2=vb1[i+1] ; - register __vector float vb2_1=vb2[i] ; - register __vector float vb2_2=vb2[i+1] ; - register __vector float vb3_1=vb3[i] ; - register __vector float vb3_2=vb3[i+1] ; - vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; - vy_1 += v_x4 * vb0_1 + v_x5 * vb1_1 + v_x6 * vb2_1 + v_x7 * vb3_1 ; - vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; - vy_2 += v_x4 * vb0_2 + v_x5 * vb1_2 + v_x6 * vb2_2 + v_x7 * vb3_2 ; - v_y[i] =vy_1; - v_y[i+1] =vy_2; - } - -} - -static void sgemv_kernel_8x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1,x2,x3; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - __vector float* va2 = (__vector float*)ap[2]; - __vector float* va3 = (__vector float*)ap[3]; - - for ( i=0; i< n/4; i+=2 ) - { - register __vector float vy_1=v_y[i]; - register __vector float vy_2=v_y[i+1]; - register __vector float va0_1=va0[i] ; - register __vector float va0_2=va0[i+1] ; - register __vector float va1_1=va1[i] ; - register __vector float va1_2=va1[i+1] ; - register __vector float va2_1=va2[i] ; - register __vector float va2_2=va2[i+1] ; - register __vector float va3_1=va3[i] ; - register __vector float va3_2=va3[i+1] ; - vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; - vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; - v_y[i] =vy_1; - v_y[i+1] =vy_2; - } - -} - -static void sgemv_kernel_8x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0,x1; - x0 = 
x[0] * *alpha; - x1 = x[1] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - - for ( i=0; i< n/4; i+=2 ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; - v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; - } - -} - - -static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0 ; - x0 = x[0] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap; - - for ( i=0; i< n/4; i+=2 ) - { - v_y[i] += v_x0 * va0[i] ; - v_y[i+1] += v_x0 * va0[i+1] ; - } - -} - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - - for ( i=0; i> 3 ; - n2 = n & 7 ; - } - else - { - n1 = n >> 2 ; - n2 = n & 3 ; - - } - - m3 = m & 7 ; - m1 = m - m3; - m2 = (m & (NBMAX-1)) - m3 ; - - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*4); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_8x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - ap[3] += lda8; - a_ptr += lda8; - x_ptr += 8; - } - - - if ( n2 & 4 ) - { - sgemv_kernel_8x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if ( n2 & 2 ) - { - sgemv_kernel_8x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - sgemv_kernel_8x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; - } - - - } - else - { - - for( i = 0; i < n1 ; 
i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_8x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_8x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - - } - - - if ( m3 & 4 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - FLOAT temp3 = 0.0; - if ( lda == 4 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[6] * x_ptr[1]; - temp3 += a_ptr[3] * x_ptr[0] + a_ptr[7] * x_ptr[1]; - - temp0 += a_ptr[8] * x_ptr[2] + a_ptr[12] * x_ptr[3]; - temp1 += a_ptr[9] * x_ptr[2] + a_ptr[13] * x_ptr[3]; - temp2 += a_ptr[10] * x_ptr[2] + a_ptr[14] * x_ptr[3]; - temp3 += a_ptr[11] * x_ptr[2] + a_ptr[15] * x_ptr[3]; - - a_ptr += 16; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - temp3 += a_ptr[3] * x_ptr[0] ; - a_ptr +=4; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - temp3 += a_ptr[3] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - y_ptr += inc_y; - y_ptr[0] += alpha * temp3; - y_ptr += inc_y; - a += 4; - } - - - if ( m3 & 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; 
- if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - a += 2; - } - - if ( m3 & 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - - - } - - - return(0); -} - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/****Note*** +UnUsed kernel +This kernel works. 
But it was not competitive enough to be added in production +It could be used and tested in future or could provide barebone for switching to inline assembly +*/ + +#include "common.h" + +#define NBMAX 4096 + +static void sgemv_kernel_8x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; + FLOAT x0,x1,x2,x3,x4,x5,x6,x7; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + x4 = xo[4] * *alpha; + x5 = xo[5] * *alpha; + x6 = xo[6] * *alpha; + x7 = xo[7] * *alpha; + __vector float* va0 = (__vector float*)a0; + __vector float* va1 = (__vector float*)a1; + __vector float* va2 = (__vector float*)a2; + __vector float* va3 = (__vector float*)a3; + __vector float* vb0 = (__vector float*)b0; + __vector float* vb1 = (__vector float*)b1; + __vector float* vb2 = (__vector float*)b2; + __vector float* vb3 = (__vector float*)b3; + + register __vector float v_x0 = {x0,x0,x0,x0}; + register __vector float v_x1 = {x1,x1,x1,x1}; + register __vector float v_x2 = {x2,x2,x2,x2}; + register __vector float v_x3 = {x3,x3,x3,x3}; + register __vector float v_x4 = {x4,x4,x4,x4}; + register __vector float v_x5 = {x5,x5,x5,x5}; + register __vector float v_x6 = {x6,x6,x6,x6}; + register __vector float v_x7 = {x7,x7,x7,x7}; + __vector float* v_y =(__vector float*)y; + + for ( i=0; i< n/4; i+=2) + { + register __vector float vy_1=v_y[i]; + register __vector float vy_2=v_y[i+1]; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + register __vector float vb0_1=vb0[i] 
; + register __vector float vb0_2=vb0[i+1] ; + register __vector float vb1_1=vb1[i] ; + register __vector float vb1_2=vb1[i+1] ; + register __vector float vb2_1=vb2[i] ; + register __vector float vb2_2=vb2[i+1] ; + register __vector float vb3_1=vb3[i] ; + register __vector float vb3_2=vb3[i+1] ; + vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; + vy_1 += v_x4 * vb0_1 + v_x5 * vb1_1 + v_x6 * vb2_1 + v_x7 * vb3_1 ; + vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; + vy_2 += v_x4 * vb0_2 + v_x5 * vb1_2 + v_x6 * vb2_2 + v_x7 * vb3_2 ; + v_y[i] =vy_1; + v_y[i+1] =vy_2; + } + +} + +static void sgemv_kernel_8x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT x0,x1,x2,x3; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float v_x2 = {x2,x2,x2,x2}; + __vector float v_x3 = {x3,x3,x3,x3}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + __vector float* va2 = (__vector float*)ap[2]; + __vector float* va3 = (__vector float*)ap[3]; + + for ( i=0; i< n/4; i+=2 ) + { + register __vector float vy_1=v_y[i]; + register __vector float vy_2=v_y[i+1]; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; + vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; + v_y[i] =vy_1; + v_y[i+1] =vy_2; + } + +} + +static void sgemv_kernel_8x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0,x1; + x0 = 
x[0] * *alpha; + x1 = x[1] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + + for ( i=0; i< n/4; i+=2 ) + { + v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; + v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; + } + +} + + +static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0 ; + x0 = x[0] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap; + + for ( i=0; i< n/4; i+=2 ) + { + v_y[i] += v_x0 * va0[i] ; + v_y[i+1] += v_x0 * va0[i+1] ; + } + +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + + for ( i=0; i> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } + + m3 = m & 7 ; + m1 = m - m3; + m2 = (m & (NBMAX-1)) - m3 ; + + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*4); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_8x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + + + if ( n2 & 4 ) + { + sgemv_kernel_8x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_8x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_8x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + } + + + } + else + { + + for( i = 0; i < n1 ; 
i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_8x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_8x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + + if ( m3 & 4 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + FLOAT temp3 = 0.0; + if ( lda == 4 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[6] * x_ptr[1]; + temp3 += a_ptr[3] * x_ptr[0] + a_ptr[7] * x_ptr[1]; + + temp0 += a_ptr[8] * x_ptr[2] + a_ptr[12] * x_ptr[3]; + temp1 += a_ptr[9] * x_ptr[2] + a_ptr[13] * x_ptr[3]; + temp2 += a_ptr[10] * x_ptr[2] + a_ptr[14] * x_ptr[3]; + temp3 += a_ptr[11] * x_ptr[2] + a_ptr[15] * x_ptr[3]; + + a_ptr += 16; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + temp3 += a_ptr[3] * x_ptr[0] ; + a_ptr +=4; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + temp3 += a_ptr[3] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + y_ptr += inc_y; + y_ptr[0] += alpha * temp3; + y_ptr += inc_y; + a += 4; + } + + + if ( m3 & 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; 
+ if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + a += 2; + } + + if ( m3 & 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + + + } + + + return(0); +} + + diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c index 62c517a9d6..c3fc8e77a1 100644 --- a/kernel/power/sgemv_t.c +++ b/kernel/power/sgemv_t.c @@ -1,484 +1,484 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. 
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- *****************************************************************************/ -#if !defined(__VEC__) || !defined(__ALTIVEC__) -#include "../arm/gemv_t.c" - -#else - -#include "common.h" - -#define NBMAX 2048 - -#include - -static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; - __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - register __vector float temp4 = {0,0,0,0}; - register __vector float temp5 = {0,0,0,0}; - register __vector float temp6 = {0,0,0,0}; - register __vector float temp7 = {0,0,0,0}; - - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - a4 = a3 + lda; - a5 = a4 + lda; - a6 = a5 + lda; - a7 = a6 + lda; - va0 = (__vector float*) a0; - va1 = (__vector float*) a1; - va2 = (__vector float*) a2; - va3 = (__vector float*) a3; - va4 = (__vector float*) a4; - va5 = (__vector float*) a5; - va6 = (__vector float*) a6; - va7 = (__vector float*) a7; - v_x = (__vector float*) x; - - - for (i = 0; i < n/4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - temp2 += v_x[i] * va2[i]; - temp3 += v_x[i] * va3[i]; - temp4 += v_x[i] * va4[i]; - temp5 += v_x[i] * va5[i]; - temp6 += v_x[i] * va6[i]; - temp7 += v_x[i] * va7[i]; - } - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); - -} - - -static void 
sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i = 0; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* va2 = (__vector float*) a2; - __vector float* va3 = (__vector float*) a3; - __vector float* v_x = (__vector float*) x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - temp2 += v_x[i] * va2[i]; - temp3 += v_x[i] * va3[i]; - } - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - -} - - -static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { - - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - __vector float temp1 = {0,0,0,0}; - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - } - - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); -} - -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - - BLASLONG i; - FLOAT *a0; - a0 = ap; - __vector float* va0 = (__vector float*) a0; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i] ; - } - - y[0] += alpha * (temp0[0] + 
temp0[1]+temp0[2] + temp0[3]); - -} - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest++ = *src; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[8] __attribute__((aligned(16))); - FLOAT *xbuffer; - if (m < 1) return (0); - if (n < 1) return (0); - - xbuffer = buffer; - - n1 = n >> 3; - n2 = n & 7; - - m3 = m & 3; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x != 1) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - BLASLONG lda8 = lda << 3; - - - if (inc_y == 1) { - - for (i = 0; i < n1; i++) { - - sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); - - y_ptr += 8; - a_ptr += lda8; - - } - - } else { - - for (i = 0; i < n1; i++) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - ybuffer[4] = 0; - ybuffer[5] = 0; - ybuffer[6] = 0; - ybuffer[7] = 0; - sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - - *y_ptr += ybuffer[4]; - y_ptr += inc_y; - *y_ptr += ybuffer[5]; - y_ptr += inc_y; - *y_ptr += ybuffer[6]; - y_ptr += inc_y; - *y_ptr += ybuffer[7]; - y_ptr += inc_y; - - a_ptr += lda8; - } - - } - - - if (n2 & 4) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - a_ptr += lda<<2; - - *y_ptr += ybuffer[0]; - 
y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - } - - if (n2 & 2) { - sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); - a_ptr += lda << 1; - y_ptr += 2 * inc_y; - - } - - if (n2 & 1) { - sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); - a_ptr += lda; - y_ptr += inc_y; - - } - - a += NB; - x += NB * inc_x; - - - } - - if (m3 == 0) return (0); - - x_ptr = x; - a_ptr = a; - if (m3 == 3) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 3 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; - aj += 12; - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - aj += 3; - } - - } else { - - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - aj += lda; - } - - } else { - - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - y_ptr += inc_y; - aj += lda; - } - - } - - } - return (0); - } 
- - if (m3 == 2) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 2 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; - y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; - y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; - aj += 8; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - aj += 2; - } - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr += inc_y; - aj += lda; - } - } - - } - return (0); - - } - - FLOAT xtemp = *x_ptr * alpha; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 1 && inc_y == 1) { - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[j] * xtemp; - y_ptr[j + 1] += aj[j + 1] * xtemp; - y_ptr[j + 2] += aj[j + 2] * xtemp; - y_ptr[j + 3] += aj[j + 3] * xtemp; - } - for (; j < n; j++) { - y_ptr[j] += aj[j] * xtemp; - } - - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += *aj * xtemp; - y_ptr[j + 1] += *(aj + lda) * xtemp; - y_ptr[j + 2] += *(aj + lda2) * xtemp; - y_ptr[j + 3] += *(aj + lda3) * xtemp; - aj += lda4; - } - - for (; j < n; j++) { - y_ptr[j] += *aj * xtemp; - aj += lda; - } - - } else { - 
for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp; - y_ptr += inc_y; - aj += lda; - } - - } - } - - return (0); - -} - -#endif +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_t.c" + +#else + +#include "common.h" + +#define NBMAX 2048 + +#include + +static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + register __vector float temp4 = {0,0,0,0}; + register __vector float temp5 = {0,0,0,0}; + register __vector float temp6 = {0,0,0,0}; + register __vector float temp7 = {0,0,0,0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector float*) a0; + va1 = (__vector float*) a1; + va2 = (__vector float*) a2; + va3 = (__vector float*) a3; + va4 = (__vector float*) a4; + va5 = (__vector float*) a5; + va6 = (__vector float*) a6; + va7 = (__vector float*) a7; + v_x = (__vector float*) x; + + + for (i = 0; i < n/4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + temp4 += v_x[i] * va4[i]; + temp5 += v_x[i] * va5[i]; + temp6 += v_x[i] * va6[i]; + temp7 += v_x[i] * va7[i]; + } + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + + y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); + y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); + y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); + y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + +} + + +static void 
sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* va2 = (__vector float*) a2; + __vector float* va3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + +} + + +static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + __vector float temp1 = {0,0,0,0}; + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); +} + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector float* va0 = (__vector float*) a0; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i] ; + } + + y[0] += alpha * (temp0[0] + 
temp0[1]+temp0[2] + temp0[3]); + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; + + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + 
y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 == 3) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 3 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + aj += 3; + } + + } else { + + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr += inc_y; + aj += lda; + } + + } + + } + return (0); + } 
+ + if (m3 == 2) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + return (0); + + } + + FLOAT xtemp = *x_ptr * alpha; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + 
for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + } + + return (0); + +} + +#endif diff --git a/kernel/power/sgemv_t_8.c b/kernel/power/sgemv_t_8.c index b905121629..1ee7c8aebb 100644 --- a/kernel/power/sgemv_t_8.c +++ b/kernel/power/sgemv_t_8.c @@ -1,508 +1,508 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- *****************************************************************************/ - - -/****Note*** -UnUsed kernel -This kernel works. But it was not competitive enough to be added in production -It could be used and tested in future or could be used as base for switching to inline assembly -*/ - -#include "common.h" -#include -#define NBMAX 4096 - -#include - -static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; - __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - register __vector float temp4 = {0,0,0,0}; - register __vector float temp5 = {0,0,0,0}; - register __vector float temp6 = {0,0,0,0}; - register __vector float temp7 = {0,0,0,0}; - - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - a4 = a3 + lda; - a5 = a4 + lda; - a6 = a5 + lda; - a7 = a6 + lda; - va0 = (__vector float*) a0; - va1 = (__vector float*) a1; - va2 = (__vector float*) a2; - va3 = (__vector float*) a3; - va4 = (__vector float*) a4; - va5 = (__vector float*) a5; - va6 = (__vector float*) a6; - va7 = (__vector float*) a7; - v_x = (__vector float*) x; - - - for (i = 0; i < n/4; i +=2) { - register __vector float vx1=v_x[i] ; - register __vector float vx2=v_x[i+1] ; - register __vector float va0_1=va0[i] ; - register __vector float va0_2=va0[i+1] ; - register __vector float va1_1=va1[i] ; - register __vector float va1_2=va1[i+1] ; - register __vector float va2_1=va2[i] ; - register __vector float va2_2=va2[i+1] ; - register __vector float va3_1=va3[i] ; - register __vector float va3_2=va3[i+1] ; - register __vector float va4_1=va4[i] ; - register __vector float va4_2=va4[i+1] ; - register __vector float va5_1=va5[i] ; - register __vector float va5_2=va5[i+1] ; - register __vector 
float va6_1=va6[i] ; - register __vector float va6_2=va6[i+1] ; - register __vector float va7_1=va7[i] ; - register __vector float va7_2=va7[i+1] ; - temp0 += vx1* va0_1 + vx2 * va0_2; - temp1 += vx1* va1_1 + vx2 * va1_2; - temp2 += vx1* va2_1 + vx2 * va2_2; - temp3 += vx1* va3_1 + vx2 * va3_2; - temp4 += vx1* va4_1 + vx2 * va4_2; - temp5 += vx1* va5_1 + vx2 * va5_2; - temp6 += vx1* va6_1 + vx2 * va6_2; - temp7 += vx1* va7_1 + vx2 * va7_2; - } - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); - -} - - -static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i = 0; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* va2 = (__vector float*) a2; - __vector float* va3 = (__vector float*) a3; - __vector float* v_x = (__vector float*) x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - - for (i = 0; i < n / 4; i +=2) { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; - temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1]; - temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1]; - } - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - 
y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - -} - - -static void sgemv_kernel_8x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { - - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - __vector float temp1 = {0,0,0,0}; - for (i = 0; i < n / 4; i +=2) { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; - } - - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); -} - -static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - - BLASLONG i; - FLOAT *a0; - a0 = ap; - __vector float* va0 = (__vector float*) a0; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - for (i = 0; i < n / 4; i +=2) { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; - } - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - -} - - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest++ = *src; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - - FLOAT ybuffer[8] __attribute__((aligned(16))); - FLOAT *xbuffer; - if (m < 1) return (0); - if (n < 1) return (0); - - xbuffer = buffer; - - n1 = n >> 3; - n2 = n & 7; - - m3 = m & 7; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; 
- x_ptr = x; - - if (inc_x != 1) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - BLASLONG lda8 = lda << 3; - - - if (inc_y == 1) { - - for (i = 0; i < n1; i++) { - - sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); - - y_ptr += 8; - a_ptr += lda8; - - } - - } else { - - for (i = 0; i < n1; i++) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - ybuffer[4] = 0; - ybuffer[5] = 0; - ybuffer[6] = 0; - ybuffer[7] = 0; - sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - - *y_ptr += ybuffer[4]; - y_ptr += inc_y; - *y_ptr += ybuffer[5]; - y_ptr += inc_y; - *y_ptr += ybuffer[6]; - y_ptr += inc_y; - *y_ptr += ybuffer[7]; - y_ptr += inc_y; - - a_ptr += lda8; - } - - } - - - if (n2 & 4) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - sgemv_kernel_8x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - a_ptr += lda<<2; - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - } - - if (n2 & 2) { - sgemv_kernel_8x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); - a_ptr += lda << 1; - y_ptr += 2 * inc_y; - - } - - if (n2 & 1) { - sgemv_kernel_8x1(NB, a_ptr, xbuffer, y_ptr, alpha); - a_ptr += lda; - y_ptr += inc_y; - - } - - a += NB; - x += NB * inc_x; - - - } - - if (m3 == 0) return (0); - - x_ptr = x; - a_ptr = a; - if (m3 & 4) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp3 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 4 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; 
- y_ptr[j + 1] += aj[4] * xtemp0 + aj[5] * xtemp1 + aj[6] * xtemp2 + aj[7] * xtemp3; - y_ptr[j + 2] += aj[8] * xtemp0 + aj[9] * xtemp1 + aj[10] * xtemp2 + aj[11] * xtemp3; - y_ptr[j + 3] += aj[12] * xtemp0 + aj[13] * xtemp1 + aj[14] * xtemp2 + aj[15] * xtemp3; - aj += 16; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; - aj += 4; - } - - } else if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2 + *(aj + 3) * xtemp3; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2 + *(aj + lda +3) * xtemp3; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2 + *(aj + lda2 +3) * xtemp3; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2 + *(aj + lda3+3) * xtemp3; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+*(aj + 3) * xtemp3; - aj += lda; - } - - } else { - - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+ *(aj + 3) * xtemp3; - y_ptr += inc_y; - aj += lda; - } - - } - if (m3==4) return (0); - a_ptr += 4; - } - - if (m3 & 2 ) { - - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 2 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; - y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; - y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; - aj += 8; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - aj += 2; - } - - } else { - if (inc_y == 1) { - - BLASLONG register 
lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr += inc_y; - aj += lda; - } - } - - } - if (m3==2) return (0); - a_ptr += 2; - } - if (m3 & 1) { - - FLOAT xtemp = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 1 && inc_y == 1) { - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[j] * xtemp; - y_ptr[j + 1] += aj[j + 1] * xtemp; - y_ptr[j + 2] += aj[j + 2] * xtemp; - y_ptr[j + 3] += aj[j + 3] * xtemp; - } - for (; j < n; j++) { - y_ptr[j] += aj[j] * xtemp; - } - - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += *aj * xtemp; - y_ptr[j + 1] += *(aj + lda) * xtemp; - y_ptr[j + 2] += *(aj + lda2) * xtemp; - y_ptr[j + 3] += *(aj + lda3) * xtemp; - aj += lda4; - } - - for (; j < n; j++) { - y_ptr[j] += *aj * xtemp; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp; - y_ptr += inc_y; - aj += lda; - } - - } - - } - a_ptr += 1; - } - return (0); - -} - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + + +/****Note*** +UnUsed kernel +This kernel works. 
But it was not competitive enough to be added in production +It could be used and tested in future or could be used as base for switching to inline assembly +*/ + +#include "common.h" +#include +#define NBMAX 4096 + +#include + +static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + register __vector float temp4 = {0,0,0,0}; + register __vector float temp5 = {0,0,0,0}; + register __vector float temp6 = {0,0,0,0}; + register __vector float temp7 = {0,0,0,0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector float*) a0; + va1 = (__vector float*) a1; + va2 = (__vector float*) a2; + va3 = (__vector float*) a3; + va4 = (__vector float*) a4; + va5 = (__vector float*) a5; + va6 = (__vector float*) a6; + va7 = (__vector float*) a7; + v_x = (__vector float*) x; + + + for (i = 0; i < n/4; i +=2) { + register __vector float vx1=v_x[i] ; + register __vector float vx2=v_x[i+1] ; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + register __vector float va4_1=va4[i] ; + register __vector float va4_2=va4[i+1] ; + register __vector float va5_1=va5[i] ; + register __vector float va5_2=va5[i+1] ; + register __vector float va6_1=va6[i] ; + register __vector float va6_2=va6[i+1] ; + register __vector float va7_1=va7[i] ; + register __vector float 
va7_2=va7[i+1] ; + temp0 += vx1* va0_1 + vx2 * va0_2; + temp1 += vx1* va1_1 + vx2 * va1_2; + temp2 += vx1* va2_1 + vx2 * va2_2; + temp3 += vx1* va3_1 + vx2 * va3_2; + temp4 += vx1* va4_1 + vx2 * va4_2; + temp5 += vx1* va5_1 + vx2 * va5_2; + temp6 += vx1* va6_1 + vx2 * va6_2; + temp7 += vx1* va7_1 + vx2 * va7_2; + } + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + + y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); + y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); + y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); + y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + +} + + +static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* va2 = (__vector float*) a2; + __vector float* va3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; + temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1]; + temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1]; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + +} + + +static void sgemv_kernel_8x2(BLASLONG n, BLASLONG lda, FLOAT 
*ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + __vector float temp1 = {0,0,0,0}; + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); +} + +static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector float* va0 = (__vector float*) a0; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + } + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + +} + + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 7; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + 
+ if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; + + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + sgemv_kernel_8x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + sgemv_kernel_8x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + sgemv_kernel_8x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 & 4) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp3 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 4 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; + y_ptr[j + 1] += aj[4] * xtemp0 + aj[5] * xtemp1 + aj[6] * xtemp2 + aj[7] * xtemp3; + y_ptr[j + 2] += aj[8] * xtemp0 + aj[9] * 
xtemp1 + aj[10] * xtemp2 + aj[11] * xtemp3; + y_ptr[j + 3] += aj[12] * xtemp0 + aj[13] * xtemp1 + aj[14] * xtemp2 + aj[15] * xtemp3; + aj += 16; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; + aj += 4; + } + + } else if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2 + *(aj + 3) * xtemp3; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2 + *(aj + lda +3) * xtemp3; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2 + *(aj + lda2 +3) * xtemp3; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2 + *(aj + lda3+3) * xtemp3; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+*(aj + 3) * xtemp3; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+ *(aj + 3) * xtemp3; + y_ptr += inc_y; + aj += lda; + } + + } + if (m3==4) return (0); + a_ptr += 4; + } + + if (m3 & 2 ) { + + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 
4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + if (m3==2) return (0); + a_ptr += 2; + } + if (m3 & 1) { + + FLOAT xtemp = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + + } + a_ptr += 1; + } + return (0); + +} + diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S index d1e60da6c5..f9320d5168 100644 --- a/kernel/power/zgemm_kernel_power9.S +++ b/kernel/power/zgemm_kernel_power9.S @@ -1,245 +1,245 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - -#define LOAD ld - -#define STACKSIZE 512 - -#define FZERO 312+192(SP) - -#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ - -#define M r3 -#define N r4 -#define K r5 - - -#define A r8 -#define B r9 -#define C r10 -#define LDC r6 -#define OFFSET r7 - - - -#define o0 0 -#define alpha_r vs30 -#define alpha_i vs31 - -#define VECSAVE r11 - -#define FRAMEPOINTER r12 - -#define T10 r14 - -#define L r15 -#define T8 r16 -#define T5 r17 -#define T2 r19 -#define TEMP_REG r20 -#define T6 r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define T7 r27 -#define T3 r28 -#define T4 r29 - -#define PRE r30 -#define T1 r31 - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - mr FRAMEPOINTER, SP - addi SP, SP, -STACKSIZE - mflr r0 - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - xxspltd alpha_r,vs1,0 /*copy from register f1 */ - xxspltd alpha_i,vs2,0 /*copy from register f2 */ - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - - std r0, FLINK_SAVE(SP) - - -#if 
defined(linux) || defined(__FreeBSD__) - ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) -#endif - - -#ifdef TRMMKERNEL -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) - ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) -#endif -#endif - - -#include "zgemm_macros_power9.S" - - - - slwi LDC, LDC, ZBASE_SHIFT - li PRE, 512 - li r0, 0 - - -#if defined(CC) || defined(CR) || defined(RC) || defined(RR) -/*negate for this case as we will use addition -1*(a+b) */ - xvnegdp alpha_r,alpha_r - xvnegdp alpha_i,alpha_i -#endif - .align 4 - -#include "zgemm_logic_power9.S" - -L999: - - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - ld r0, FLINK_SAVE(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - mtlr r0 - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - EPILOGUE +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define LOAD ld + +#define STACKSIZE 512 + +#define FZERO 312+192(SP) + +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ + +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 + +#define VECSAVE r11 + +#define FRAMEPOINTER r12 + +#define T10 r14 + +#define L r15 +#define T8 r16 +#define T5 r17 +#define T2 r19 +#define TEMP_REG r20 +#define T6 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T7 r27 +#define T3 r28 +#define T4 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + mflr r0 + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + xxspltd alpha_r,vs1,0 /*copy from register f1 */ + xxspltd alpha_i,vs2,0 /*copy from register f2 */ + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + + std r0, FLINK_SAVE(SP) + + +#if 
defined(linux) || defined(__FreeBSD__) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif + + +#ifdef TRMMKERNEL +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#endif +#endif + + +#include "zgemm_macros_power9.S" + + + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 512 + li r0, 0 + + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegdp alpha_r,alpha_r + xvnegdp alpha_i,alpha_i +#endif + .align 4 + +#include "zgemm_logic_power9.S" + +L999: + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE #endif \ No newline at end of file diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S index fe5d8ade24..850b41aff0 100644 --- a/kernel/power/zgemm_logic_power9.S +++ b/kernel/power/zgemm_logic_power9.S @@ -1,1891 +1,1891 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -#define MY_ALIGN .align 3 -b ZGEMM_L2 -/* MINI SUBROUTINES */ -/* 2x8 MAIN 128x+2 LOOP */ - - -ZGEMM_L2x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x8_2 - MY_ALIGN -ZGEMM_L2x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 -ZGEMM_L2x8_K128: -/*----------------------------------------*/ - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_L2 256,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 256,64,8,0 - KERNEL2x8_L2 256,64,9,0 - KERNEL2x8_L2 256,64,10,0 - KERNEL2x8_L2 256,64,11,0 - dcbt BO, T4 - KERNEL2x8_L2 256,64,12,0 - KERNEL2x8_L2 256,64,13,0 - KERNEL2x8_L2 256,64,14,0 - KERNEL2x8_L2 256,64,15,0 - KERNEL2x8_L2 256,64,16,0 - KERNEL2x8_L2 256,64,17,0 - KERNEL2x8_L2 256,64,18,0 - KERNEL2x8_L2 256,64,19,0 - KERNEL2x8_L2 256,64,20,0 - KERNEL2x8_L2 256,64,21,0 - KERNEL2x8_L2 256,64,22,0 - KERNEL2x8_L2 256,64,23,0 - KERNEL2x8_L2 256,64,24,0 - KERNEL2x8_L2 256,64,25,0 - KERNEL2x8_L2 256,64,26,0 - KERNEL2x8_L2 256,64,27,0 - KERNEL2x8_L2 256,64,28,0 - KERNEL2x8_L2 256,64,29,0 - KERNEL2x8_L2 256,64,30,0 - KERNEL2x8_L2 256,64,31,0 - KERNEL2x8_L2 256,64,32,0 - KERNEL2x8_L2 256,64,33,0 - KERNEL2x8_L2 256,64,34,0 - KERNEL2x8_L2 256,64,35,0 - KERNEL2x8_L2 256,64,36,0 - KERNEL2x8_L2 256,64,37,0 - KERNEL2x8_L2 256,64,38,0 - KERNEL2x8_L2 256,64,39,0 - KERNEL2x8_L2 256,64,40,0 - KERNEL2x8_L2 256,64,41,0 - KERNEL2x8_L2 256,64,42,0 - KERNEL2x8_L2 256,64,43,0 - KERNEL2x8_L2 256,64,44,0 - KERNEL2x8_L2 256,64,45,0 - KERNEL2x8_L2 256,64,46,0 - KERNEL2x8_L2 256,64,47,0 - KERNEL2x8_L2 256,64,48,0 - KERNEL2x8_L2 256,64,49,0 - KERNEL2x8_L2 256,64,50,0 - KERNEL2x8_L2 256,64,51,0 - KERNEL2x8_L2 256,64,52,0 - KERNEL2x8_L2 256,64,53,0 - KERNEL2x8_L2 
256,64,54,0 - KERNEL2x8_L2 256,64,55,0 - KERNEL2x8_L2 256,64,56,0 - KERNEL2x8_L2 256,64,57,0 - KERNEL2x8_L2 256,64,58,0 - KERNEL2x8_L2 256,64,59,0 - KERNEL2x8_L2 256,64,60,0 - KERNEL2x8_L2 256,64,61,0 - KERNEL2x8_L2 256,64,62,0 - KERNEL2x8_L2 256,64,63,1 - bdnz ZGEMM_L2x8_LOOP - MY_ALIGN -ZGEMM_L2x8_LOOP_END: -/*----------------------------------------*/ - END2x8_2 - blr - MY_ALIGN - - -ZGEMM_2x8_L64_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_L2 256,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 256,64,8,0 - KERNEL2x8_L2 256,64,9,0 - KERNEL2x8_L2 256,64,10,0 - KERNEL2x8_L2 256,64,11,0 - dcbt BO, T4 - KERNEL2x8_L2 256,64,12,0 - KERNEL2x8_L2 256,64,13,0 - KERNEL2x8_L2 256,64,14,0 - KERNEL2x8_L2 256,64,15,0 - KERNEL2x8_L2 256,64,16,0 - KERNEL2x8_L2 256,64,17,0 - KERNEL2x8_L2 256,64,18,0 - KERNEL2x8_L2 256,64,19,0 - KERNEL2x8_L2 256,64,20,0 - KERNEL2x8_L2 256,64,21,0 - KERNEL2x8_L2 256,64,22,0 - KERNEL2x8_L2 256,64,23,0 - KERNEL2x8_L2 256,64,24,0 - KERNEL2x8_L2 256,64,25,0 - KERNEL2x8_L2 256,64,26,0 - KERNEL2x8_L2 256,64,27,0 - KERNEL2x8_L2 256,64,28,0 - KERNEL2x8_L2 256,64,29,0 - KERNEL2x8_L2 256,64,30,0 - KERNEL2x8_E2 256,64,31,1 - blr - MY_ALIGN - - -ZGEMM_2x8_L32_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_L2 256,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 256,64,8,0 - KERNEL2x8_L2 256,64,9,0 - KERNEL2x8_L2 256,64,10,0 - KERNEL2x8_L2 256,64,11,0 - dcbt BO, T4 - 
KERNEL2x8_L2 256,64,12,0 - KERNEL2x8_L2 256,64,13,0 - KERNEL2x8_L2 256,64,14,0 - KERNEL2x8_E2 256,64,15,1 - blr - MY_ALIGN - - -ZGEMM_2x8_L16_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_E2 256,64,7,1 - blr - MY_ALIGN - - -ZGEMM_2x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x4_2 - MY_ALIGN -ZGEMM_L2x4_LOOP: -/*----------------------------------------*/ - KERNEL2x4_L2 128,64,0,0 -ZGEMM_L2x4_K32: -/*----------------------------------------*/ - KERNEL2x4_L2 128,64,1,0 - KERNEL2x4_L2 128,64,2,0 - KERNEL2x4_L2 128,64,3,0 - KERNEL2x4_L2 128,64,4,0 - KERNEL2x4_L2 128,64,5,0 - KERNEL2x4_L2 128,64,6,0 - KERNEL2x4_L2 128,64,7,0 - KERNEL2x4_L2 128,64,8,0 - KERNEL2x4_L2 128,64,9,0 - KERNEL2x4_L2 128,64,10,0 - KERNEL2x4_L2 128,64,11,0 - KERNEL2x4_L2 128,64,12,0 - KERNEL2x4_L2 128,64,13,0 - KERNEL2x4_L2 128,64,14,0 - KERNEL2x4_L2 128,64,15,1 - bdnz ZGEMM_L2x4_LOOP - MY_ALIGN -ZGEMM_L2x4_LOOP_END: -/*----------------------------------------*/ - END2x4_2 - blr - MY_ALIGN - - -ZGEMM_2x4_L16_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 128,64,0,0 - KERNEL2x4_L2 128,64,1,0 - KERNEL2x4_L2 128,64,2,0 - KERNEL2x4_L2 128,64,3,0 - KERNEL2x4_L2 128,64,4,0 - KERNEL2x4_L2 128,64,5,0 - KERNEL2x4_L2 128,64,6,0 - KERNEL2x4_E2 128,64,7,1 - blr - MY_ALIGN - - -ZGEMM_2x4_L8_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 128,64,0,0 - KERNEL2x4_L2 128,64,1,0 - KERNEL2x4_L2 128,64,2,0 - KERNEL2x4_E2 128,64,3,1 - blr - - -ZGEMM_2x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x2_2 - MY_ALIGN -ZGEMM_L2x2_LOOP: -/*----------------------------------------*/ - KERNEL2x2_L2 64,64,0,0 
-ZGEMM_L2x2_K32: -/*----------------------------------------*/ - KERNEL2x2_L2 64,64,1,0 - KERNEL2x2_L2 64,64,2,0 - KERNEL2x2_L2 64,64,3,0 - KERNEL2x2_L2 64,64,4,0 - KERNEL2x2_L2 64,64,5,0 - KERNEL2x2_L2 64,64,6,0 - KERNEL2x2_L2 64,64,7,0 - KERNEL2x2_L2 64,64,8,0 - KERNEL2x2_L2 64,64,9,0 - KERNEL2x2_L2 64,64,10,0 - KERNEL2x2_L2 64,64,11,0 - KERNEL2x2_L2 64,64,12,0 - KERNEL2x2_L2 64,64,13,0 - KERNEL2x2_L2 64,64,14,0 - KERNEL2x2_L2 64,64,15,1 - bdnz ZGEMM_L2x2_LOOP - MY_ALIGN - - -ZGEMM_L2x2_LOOP_END: -/*----------------------------------------*/ - END2x2_2 - blr - MY_ALIGN -ZGEMM_2x2_L16_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 64,64,0,0 - KERNEL2x2_L2 64,64,1,0 - KERNEL2x2_L2 64,64,2,0 - KERNEL2x2_L2 64,64,3,0 - KERNEL2x2_L2 64,64,4,0 - KERNEL2x2_L2 64,64,5,0 - KERNEL2x2_L2 64,64,6,0 - KERNEL2x2_E2 64,64,7,1 - blr - MY_ALIGN -ZGEMM_2x2_L8_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 64,64,0,0 - KERNEL2x2_L2 64,64,1,0 - KERNEL2x2_L2 64,64,2,0 - KERNEL2x2_E2 64,64,3,1 - blr - - -ZGEMM_2x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x1_2 - MY_ALIGN -ZGEMM_L2x1_LOOP: -/*----------------------------------------*/ - KERNEL2x1_L2 32,64,0,0 -ZGEMM_L2x1_K32: -/*----------------------------------------*/ - KERNEL2x1_L2 32,64,1,0 - KERNEL2x1_L2 32,64,2,0 - KERNEL2x1_L2 32,64,3,0 - KERNEL2x1_L2 32,64,4,0 - KERNEL2x1_L2 32,64,5,0 - KERNEL2x1_L2 32,64,6,0 - KERNEL2x1_L2 32,64,7,0 - KERNEL2x1_L2 32,64,8,0 - KERNEL2x1_L2 32,64,9,0 - KERNEL2x1_L2 32,64,10,0 - KERNEL2x1_L2 32,64,11,0 - KERNEL2x1_L2 32,64,12,0 - KERNEL2x1_L2 32,64,13,0 - KERNEL2x1_L2 32,64,14,0 - KERNEL2x1_L2 32,64,15,1 - bdnz ZGEMM_L2x1_LOOP - MY_ALIGN -ZGEMM_L2x1_LOOP_END: -/*----------------------------------------*/ - END2x1_2 - blr - - MY_ALIGN -ZGEMM_2x1_L16_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 32,64,0,0 - KERNEL2x1_L2 32,64,1,0 - KERNEL2x1_L2 32,64,2,0 - KERNEL2x1_L2 
32,64,3,0 - KERNEL2x1_L2 32,64,4,0 - KERNEL2x1_L2 32,64,5,0 - KERNEL2x1_L2 32,64,6,0 - KERNEL2x1_E2 32,64,7,1 - blr - MY_ALIGN - - -ZGEMM_2x1_L8_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 32,64,0,0 - KERNEL2x1_L2 32,64,1,0 - KERNEL2x1_L2 32,64,2,0 - KERNEL2x1_E2 32,64,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -ZGEMM_L2: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - srawi. J, N, 1 - ble ZGEMM_L2_END - - -ZGEMM_L2_BEGIN: -/*----------------------------------------*/ - mr CO, C - slwi T1, LDC , 1 - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble ZGEMM_L2x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -ZGEMM_L2x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T11-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO2x8 - ble ZGEMM_L2x8_SUB0 - bl ZGEMM_L2x8_LMAIN_SUB - andi. L, T1, 127 - ble ZGEMM_L2x8_SAVE - b ZGEMM_L2x8_SUB2 - - -ZGEMM_L2x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. 
L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP2x8_128K - addi BO,BO,-32 - addi AO,AO,-128 - LOAD2x8O 128,32 - END2x8_WITHOUT_ADD - LOAD2x8_2O 256, 64 - mtctr T8 - bl ZGEMM_L2x8_K128 - b ZGEMM_L2x8_SAVE - CMP2x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne ZGEMM_L2x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-256 - LOAD2x8_2O 256,64 - bl ZGEMM_L2x8_K128 - b ZGEMM_L2x8_SAVE - MY_ALIGN - - -ZGEMM_L2x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble ZGEMM_L2x8_SUB2_32 - bl ZGEMM_2x8_L64_SUB - MY_ALIGN - - -ZGEMM_L2x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble ZGEMM_L2x8_SUB2_16 - bl ZGEMM_2x8_L32_SUB - MY_ALIGN - - -ZGEMM_L2x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x8_SUB2_8 - bl ZGEMM_2x8_L16_SUB - MY_ALIGN - - -ZGEMM_L2x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x8_SUB2_4 - LOAD2x8_2 - KERNEL2x8_L2 256,64, 0,0 - KERNEL2x8_L2 256,64, 1,0 - KERNEL2x8_L2 256,64, 2,0 - KERNEL2x8_E2 256,64, 3,1 - MY_ALIGN - - -ZGEMM_L2x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x8_SUB2_2 - LOAD2x8_2 - KERNEL2x8_L2 256,64, 0,0 - KERNEL2x8_E2 256,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x8_SUB2_1 - LOAD2x8_2 - KERNEL2x8_E2 256,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x8_SAVE - KERNEL2x8 - - -ZGEMM_L2x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 -#endif - bgt ZGEMM_L2x8_BEGIN - andi. T2, M, 7 - ble ZGEMM_L2x1_END - andi. 
T1, M, 4 - ble ZGEMM_L2x4_END - b ZGEMM_L2x4_BEGIN - MY_ALIGN - - -ZGEMM_L2x8_END: -/*----------------------------------------*/ - - -ZGEMM_L2x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble ZGEMM_L2x1_END - andi. T1, M, 4 - ble ZGEMM_L2x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x4 - ble ZGEMM_L2x4_SUB0 - bl ZGEMM_2x4_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x4_SAVE - b ZGEMM_L2x4_SUB2 - - -ZGEMM_L2x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x4_32K - addi BO,BO,-32 - addi AO,AO,-64 - LOAD2x4O 64,32 - END2x4_WITHOUT_ADD - LOAD2x4_2O 128, 64 - mtctr T8 - bl ZGEMM_L2x4_K32 - b ZGEMM_L2x4_SAVE - CMP2x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L2x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-128 - LOAD2x4_2O 128,64 - bl ZGEMM_L2x4_K32 - b ZGEMM_L2x4_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L2x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x4_SUB2_8 - bl ZGEMM_2x4_L16_SUB - MY_ALIGN - - -ZGEMM_L2x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x4_SUB2_4 - bl ZGEMM_2x4_L8_SUB - MY_ALIGN - - -ZGEMM_L2x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x4_SUB2_2 - LOAD2x4_2 - KERNEL2x4_L2 128,64, 0,0 - KERNEL2x4_E2 128,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x4_SUB2_2: -/*----------------------------------------*/ - andi. 
T1,L, 2 - ble ZGEMM_L2x4_SUB2_1 - LOAD2x4_2 - KERNEL2x4_E2 128,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x4_SAVE - KERNEL2x4 - - -ZGEMM_L2x4_SAVE: -/*----------------------------------------*/ - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 -#endif - - -ZGEMM_L2x4_END: -/*----------------------------------------*/ - - -ZGEMM_L2x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble ZGEMM_L2x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x2 - ble ZGEMM_L2x2_SUB0 - bl ZGEMM_2x2_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x2_SAVE - b ZGEMM_L2x2_SUB2 - - -ZGEMM_L2x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x2_32K - addi BO,BO,-32 - addi AO,AO,-32 - LOAD2x2O 32,32 - END2x2_WITHOUT_ADD - LOAD2x2_2O 64, 64 - mtctr T8 - bl ZGEMM_L2x2_K32 - b ZGEMM_L2x2_SAVE - CMP2x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L2x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-64 - LOAD2x2_2O 64,64 - bl ZGEMM_L2x2_K32 - b ZGEMM_L2x2_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L2x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x2_SUB2_8 - bl ZGEMM_2x2_L16_SUB - MY_ALIGN - - -ZGEMM_L2x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x2_SUB2_4 - bl ZGEMM_2x2_L8_SUB - MY_ALIGN - - -ZGEMM_L2x2_SUB2_4: -/*----------------------------------------*/ - andi. 
T1,L, 4 - ble ZGEMM_L2x2_SUB2_2 - LOAD2x2_2 - KERNEL2x2_L2 64,64, 0,0 - KERNEL2x2_E2 64,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x2_SUB2_1 - LOAD2x2_2 - KERNEL2x2_E2 64,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x2_SAVE - KERNEL2x2 - - -ZGEMM_L2x2_SAVE: -/*----------------------------------------*/ - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 -#endif - - -ZGEMM_L2x2_END: -/*----------------------------------------*/ - - -ZGEMM_L2x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble ZGEMM_L2x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x1 - ble ZGEMM_L2x1_SUB0 - bl ZGEMM_2x1_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x1_SAVE - b ZGEMM_L2x1_SUB2 - - -ZGEMM_L2x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x1_32K - addi BO,BO,-32 - addi AO,AO,-16 - LOAD2x1O 16,32 - END2x1_WITHOUT_ADD - LOAD2x1_2O 32, 64 - mtctr T8 - bl ZGEMM_L2x1_K32 - b ZGEMM_L2x1_SAVE - CMP2x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L2x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-32 - LOAD2x1_2O 32,64 - bl ZGEMM_L2x1_K32 - b ZGEMM_L2x1_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L2x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x1_SUB2_8 - bl ZGEMM_2x1_L16_SUB - MY_ALIGN - - -ZGEMM_L2x1_SUB2_8: -/*----------------------------------------*/ - andi. 
T1,L, 8 - ble ZGEMM_L2x1_SUB2_4 - bl ZGEMM_2x1_L8_SUB - MY_ALIGN - - -ZGEMM_L2x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x1_SUB2_2 - LOAD2x1_2 - KERNEL2x1_L2 32,64, 0,0 - KERNEL2x1_E2 32,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x1_SUB2_1 - LOAD2x1_2 - KERNEL2x1_E2 32,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x1_SAVE - KERNEL2x1 - - -ZGEMM_L2x1_SAVE: -/*----------------------------------------*/ - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 -#endif - - -ZGEMM_L2x1_END: -/*----------------------------------------*/ - slwi T1, K, 5 - addic. J, J, -1 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif - bgt ZGEMM_L2_BEGIN - - -ZGEMM_L2_END: - -b ZGEMM_L1 -/* MINI SUBROUTINES */ -/* 1x8 MAIN 128x+2 LOOP */ - - -ZGEMM_L1x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x8_2 - MY_ALIGN -ZGEMM_L1x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 -ZGEMM_L1x8_K128: -/*----------------------------------------*/ - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_L2 256,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 256,32,8,0 - KERNEL1x8_L2 256,32,9,0 - KERNEL1x8_L2 256,32,10,0 - KERNEL1x8_L2 256,32,11,0 - dcbt BO, T4 - KERNEL1x8_L2 256,32,12,0 - KERNEL1x8_L2 256,32,13,0 - KERNEL1x8_L2 256,32,14,0 - KERNEL1x8_L2 256,32,15,0 - KERNEL1x8_L2 256,32,16,0 - KERNEL1x8_L2 256,32,17,0 - KERNEL1x8_L2 256,32,18,0 - KERNEL1x8_L2 256,32,19,0 - KERNEL1x8_L2 256,32,20,0 - KERNEL1x8_L2 256,32,21,0 - KERNEL1x8_L2 256,32,22,0 - KERNEL1x8_L2 256,32,23,0 - 
KERNEL1x8_L2 256,32,24,0 - KERNEL1x8_L2 256,32,25,0 - KERNEL1x8_L2 256,32,26,0 - KERNEL1x8_L2 256,32,27,0 - KERNEL1x8_L2 256,32,28,0 - KERNEL1x8_L2 256,32,29,0 - KERNEL1x8_L2 256,32,30,0 - KERNEL1x8_L2 256,32,31,0 - KERNEL1x8_L2 256,32,32,0 - KERNEL1x8_L2 256,32,33,0 - KERNEL1x8_L2 256,32,34,0 - KERNEL1x8_L2 256,32,35,0 - KERNEL1x8_L2 256,32,36,0 - KERNEL1x8_L2 256,32,37,0 - KERNEL1x8_L2 256,32,38,0 - KERNEL1x8_L2 256,32,39,0 - KERNEL1x8_L2 256,32,40,0 - KERNEL1x8_L2 256,32,41,0 - KERNEL1x8_L2 256,32,42,0 - KERNEL1x8_L2 256,32,43,0 - KERNEL1x8_L2 256,32,44,0 - KERNEL1x8_L2 256,32,45,0 - KERNEL1x8_L2 256,32,46,0 - KERNEL1x8_L2 256,32,47,0 - KERNEL1x8_L2 256,32,48,0 - KERNEL1x8_L2 256,32,49,0 - KERNEL1x8_L2 256,32,50,0 - KERNEL1x8_L2 256,32,51,0 - KERNEL1x8_L2 256,32,52,0 - KERNEL1x8_L2 256,32,53,0 - KERNEL1x8_L2 256,32,54,0 - KERNEL1x8_L2 256,32,55,0 - KERNEL1x8_L2 256,32,56,0 - KERNEL1x8_L2 256,32,57,0 - KERNEL1x8_L2 256,32,58,0 - KERNEL1x8_L2 256,32,59,0 - KERNEL1x8_L2 256,32,60,0 - KERNEL1x8_L2 256,32,61,0 - KERNEL1x8_L2 256,32,62,0 - KERNEL1x8_L2 256,32,63,1 - bdnz ZGEMM_L1x8_LOOP - MY_ALIGN -ZGEMM_L1x8_LOOP_END: -/*----------------------------------------*/ - END1x8_2 - blr - MY_ALIGN - - -ZGEMM_1x8_L64_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_L2 256,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 256,32,8,0 - KERNEL1x8_L2 256,32,9,0 - KERNEL1x8_L2 256,32,10,0 - KERNEL1x8_L2 256,32,11,0 - dcbt BO, T4 - KERNEL1x8_L2 256,32,12,0 - KERNEL1x8_L2 256,32,13,0 - KERNEL1x8_L2 256,32,14,0 - KERNEL1x8_L2 256,32,15,0 - KERNEL1x8_L2 256,32,16,0 - KERNEL1x8_L2 256,32,17,0 - KERNEL1x8_L2 256,32,18,0 - KERNEL1x8_L2 256,32,19,0 - KERNEL1x8_L2 256,32,20,0 - KERNEL1x8_L2 
256,32,21,0 - KERNEL1x8_L2 256,32,22,0 - KERNEL1x8_L2 256,32,23,0 - KERNEL1x8_L2 256,32,24,0 - KERNEL1x8_L2 256,32,25,0 - KERNEL1x8_L2 256,32,26,0 - KERNEL1x8_L2 256,32,27,0 - KERNEL1x8_L2 256,32,28,0 - KERNEL1x8_L2 256,32,29,0 - KERNEL1x8_L2 256,32,30,0 - KERNEL1x8_E2 256,32,31,1 - blr - MY_ALIGN - - -ZGEMM_1x8_L32_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_L2 256,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 256,32,8,0 - KERNEL1x8_L2 256,32,9,0 - KERNEL1x8_L2 256,32,10,0 - KERNEL1x8_L2 256,32,11,0 - dcbt BO, T4 - KERNEL1x8_L2 256,32,12,0 - KERNEL1x8_L2 256,32,13,0 - KERNEL1x8_L2 256,32,14,0 - KERNEL1x8_E2 256,32,15,1 - blr - MY_ALIGN - - -ZGEMM_1x8_L16_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_E2 256,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x4_2 - MY_ALIGN - - -ZGEMM_L1x4_LOOP: -/*----------------------------------------*/ - KERNEL1x4_L2 128,32,0,0 - - -ZGEMM_L1x4_K32: -/*----------------------------------------*/ - KERNEL1x4_L2 128,32,1,0 - KERNEL1x4_L2 128,32,2,0 - KERNEL1x4_L2 128,32,3,0 - KERNEL1x4_L2 128,32,4,0 - KERNEL1x4_L2 128,32,5,0 - KERNEL1x4_L2 128,32,6,0 - KERNEL1x4_L2 128,32,7,0 - KERNEL1x4_L2 128,32,8,0 - KERNEL1x4_L2 128,32,9,0 - KERNEL1x4_L2 128,32,10,0 - KERNEL1x4_L2 128,32,11,0 - KERNEL1x4_L2 128,32,12,0 - KERNEL1x4_L2 128,32,13,0 - KERNEL1x4_L2 128,32,14,0 - KERNEL1x4_L2 
128,32,15,1 - bdnz ZGEMM_L1x4_LOOP - MY_ALIGN - - -ZGEMM_L1x4_LOOP_END: -/*----------------------------------------*/ - END1x4_2 - blr - MY_ALIGN - - -ZGEMM_1x4_L16_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 128,32,0,0 - KERNEL1x4_L2 128,32,1,0 - KERNEL1x4_L2 128,32,2,0 - KERNEL1x4_L2 128,32,3,0 - KERNEL1x4_L2 128,32,4,0 - KERNEL1x4_L2 128,32,5,0 - KERNEL1x4_L2 128,32,6,0 - KERNEL1x4_E2 128,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x4_L8_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 128,32,0,0 - KERNEL1x4_L2 128,32,1,0 - KERNEL1x4_L2 128,32,2,0 - KERNEL1x4_E2 128,32,3,1 - blr - - -ZGEMM_1x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x2_2 - MY_ALIGN - - -ZGEMM_L1x2_LOOP: -/*----------------------------------------*/ - KERNEL1x2_L2 64,32,0,0 - - -ZGEMM_L1x2_K32: -/*----------------------------------------*/ - KERNEL1x2_L2 64,32,1,0 - KERNEL1x2_L2 64,32,2,0 - KERNEL1x2_L2 64,32,3,0 - KERNEL1x2_L2 64,32,4,0 - KERNEL1x2_L2 64,32,5,0 - KERNEL1x2_L2 64,32,6,0 - KERNEL1x2_L2 64,32,7,0 - KERNEL1x2_L2 64,32,8,0 - KERNEL1x2_L2 64,32,9,0 - KERNEL1x2_L2 64,32,10,0 - KERNEL1x2_L2 64,32,11,0 - KERNEL1x2_L2 64,32,12,0 - KERNEL1x2_L2 64,32,13,0 - KERNEL1x2_L2 64,32,14,0 - KERNEL1x2_L2 64,32,15,1 - bdnz ZGEMM_L1x2_LOOP - MY_ALIGN - - -ZGEMM_L1x2_LOOP_END: -/*----------------------------------------*/ - END1x2_2 - blr - MY_ALIGN - - -ZGEMM_1x2_L16_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 64,32,0,0 - KERNEL1x2_L2 64,32,1,0 - KERNEL1x2_L2 64,32,2,0 - KERNEL1x2_L2 64,32,3,0 - KERNEL1x2_L2 64,32,4,0 - KERNEL1x2_L2 64,32,5,0 - KERNEL1x2_L2 64,32,6,0 - KERNEL1x2_E2 64,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x2_L8_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 64,32,0,0 - KERNEL1x2_L2 64,32,1,0 - KERNEL1x2_L2 64,32,2,0 - KERNEL1x2_E2 64,32,3,1 - blr - - -ZGEMM_1x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - 
LOAD1x1_2 - MY_ALIGN - - -ZGEMM_L1x1_LOOP: -/*----------------------------------------*/ - KERNEL1x1_L2 32,32,0,0 - - -ZGEMM_L1x1_K32: -/*----------------------------------------*/ - KERNEL1x1_L2 32,32,1,0 - KERNEL1x1_L2 32,32,2,0 - KERNEL1x1_L2 32,32,3,0 - KERNEL1x1_L2 32,32,4,0 - KERNEL1x1_L2 32,32,5,0 - KERNEL1x1_L2 32,32,6,0 - KERNEL1x1_L2 32,32,7,0 - KERNEL1x1_L2 32,32,8,0 - KERNEL1x1_L2 32,32,9,0 - KERNEL1x1_L2 32,32,10,0 - KERNEL1x1_L2 32,32,11,0 - KERNEL1x1_L2 32,32,12,0 - KERNEL1x1_L2 32,32,13,0 - KERNEL1x1_L2 32,32,14,0 - KERNEL1x1_L2 32,32,15,1 - bdnz ZGEMM_L1x1_LOOP - MY_ALIGN - - -ZGEMM_L1x1_LOOP_END: -/*----------------------------------------*/ - END1x1_2 - blr - MY_ALIGN - - -ZGEMM_1x1_L16_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 32,32,0,0 - KERNEL1x1_L2 32,32,1,0 - KERNEL1x1_L2 32,32,2,0 - KERNEL1x1_L2 32,32,3,0 - KERNEL1x1_L2 32,32,4,0 - KERNEL1x1_L2 32,32,5,0 - KERNEL1x1_L2 32,32,6,0 - KERNEL1x1_E2 32,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x1_L8_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 32,32,0,0 - KERNEL1x1_L2 32,32,1,0 - KERNEL1x1_L2 32,32,2,0 - KERNEL1x1_E2 32,32,3,1 - blr - - -/*----------------------N1 BEGINS---------*/ -ZGEMM_L1: -/*----------------------------------------*/ - andi. T1, N, 1 - ble ZGEMM_L1_END - -ZGEMM_L1_BEGIN: -/*----------------------------------------*/ - mr CO, C - - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. 
I, M, 3 - ble ZGEMM_L1x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -ZGEMM_L1x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T11-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO1x8 - ble ZGEMM_L1x8_SUB0 - bl ZGEMM_L1x8_LMAIN_SUB - andi. L, T1, 127 - ble ZGEMM_L1x8_SAVE - b ZGEMM_L1x8_SUB2 - - -ZGEMM_L1x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP1x8_128K - addi BO,BO,-16 - addi AO,AO,-128 - LOAD1x8O 128,16 - END1x8_WITHOUT_ADD - LOAD1x8_2O 256, 32 - mtctr T8 - bl ZGEMM_L1x8_K128 - b ZGEMM_L1x8_SAVE - CMP1x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne ZGEMM_L1x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-256 - LOAD1x8_2O 256,32 - bl ZGEMM_L1x8_K128 - b ZGEMM_L1x8_SAVE - MY_ALIGN - - -ZGEMM_L1x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble ZGEMM_L1x8_SUB2_32 - bl ZGEMM_1x8_L64_SUB - MY_ALIGN - - -ZGEMM_L1x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble ZGEMM_L1x8_SUB2_16 - bl ZGEMM_1x8_L32_SUB - MY_ALIGN - - -ZGEMM_L1x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x8_SUB2_8 - bl ZGEMM_1x8_L16_SUB - MY_ALIGN - - -ZGEMM_L1x8_SUB2_8: -/*----------------------------------------*/ - andi. 
T1,L, 8 - ble ZGEMM_L1x8_SUB2_4 - LOAD1x8_2 - KERNEL1x8_L2 256,32, 0,0 - KERNEL1x8_L2 256,32, 1,0 - KERNEL1x8_L2 256,32, 2,0 - KERNEL1x8_E2 256,32, 3,1 - MY_ALIGN - - -ZGEMM_L1x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x8_SUB2_2 - LOAD1x8_2 - KERNEL1x8_L2 256,32, 0,0 - KERNEL1x8_E2 256,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x8_SUB2_1 - LOAD1x8_2 - KERNEL1x8_E2 256,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x8_SAVE - KERNEL1x8 - - -ZGEMM_L1x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 -#endif - bgt ZGEMM_L1x8_BEGIN - andi. T2, M, 7 - ble ZGEMM_L1x1_END - andi. T1, M, 4 - ble ZGEMM_L1x4_END - b ZGEMM_L1x4_BEGIN - MY_ALIGN - - -ZGEMM_L1x8_END: -/*----------------------------------------*/ - - -ZGEMM_L1x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble ZGEMM_L1x1_END - andi. T1, M, 4 - ble ZGEMM_L1x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO1x4 - ble ZGEMM_L1x4_SUB0 - bl ZGEMM_1x4_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L1x4_SAVE - b ZGEMM_L1x4_SUB2 - - -ZGEMM_L1x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. 
L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x4_32K - addi BO,BO,-16 - addi AO,AO,-64 - LOAD1x4O 64,16 - END1x4_WITHOUT_ADD - LOAD1x4_2O 128, 32 - mtctr T8 - bl ZGEMM_L1x4_K32 - b ZGEMM_L1x4_SAVE - CMP1x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L1x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-128 - LOAD1x4_2O 128,32 - bl ZGEMM_L1x4_K32 - b ZGEMM_L1x4_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L1x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x4_SUB2_8 - bl ZGEMM_1x4_L16_SUB - MY_ALIGN - - -ZGEMM_L1x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x4_SUB2_4 - bl ZGEMM_1x4_L8_SUB - MY_ALIGN - - -ZGEMM_L1x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x4_SUB2_2 - LOAD1x4_2 - KERNEL1x4_L2 128,32, 0,0 - KERNEL1x4_E2 128,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x4_SUB2_1 - LOAD1x4_2 - KERNEL1x4_E2 128,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x4_SAVE - KERNEL1x4 - - -ZGEMM_L1x4_SAVE: -/*----------------------------------------*/ - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 -#endif - - -ZGEMM_L1x4_END: -/*----------------------------------------*/ - - -ZGEMM_L1x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble ZGEMM_L1x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO1x2 - ble ZGEMM_L1x2_SUB0 - bl ZGEMM_1x2_LMAIN_SUB - andi. 
L, T1, 31 - ble ZGEMM_L1x2_SAVE - b ZGEMM_L1x2_SUB2 - - -ZGEMM_L1x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x2_32K - addi BO,BO,-16 - addi AO,AO,-32 - LOAD1x2O 32,16 - END1x2_WITHOUT_ADD - LOAD1x2_2O 64, 32 - mtctr T8 - bl ZGEMM_L1x2_K32 - b ZGEMM_L1x2_SAVE - CMP1x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L1x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-64 - LOAD1x2_2O 64,32 - bl ZGEMM_L1x2_K32 - b ZGEMM_L1x2_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L1x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x2_SUB2_8 - bl ZGEMM_1x2_L16_SUB - MY_ALIGN - - -ZGEMM_L1x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x2_SUB2_4 - bl ZGEMM_1x2_L8_SUB - MY_ALIGN - - -ZGEMM_L1x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x2_SUB2_2 - LOAD1x2_2 - KERNEL1x2_L2 64,32, 0,0 - KERNEL1x2_E2 64,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x2_SUB2_1 - LOAD1x2_2 - KERNEL1x2_E2 64,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x2_SAVE - KERNEL1x2 - - -ZGEMM_L1x2_SAVE: -/*----------------------------------------*/ - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 -#endif - - -ZGEMM_L1x2_END: -/*----------------------------------------*/ - - -ZGEMM_L1x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble ZGEMM_L1x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 - mr T1, T6 - addi T1,T1, -2 - srawi. 
T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO1x1 - ble ZGEMM_L1x1_SUB0 - bl ZGEMM_1x1_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L1x1_SAVE - b ZGEMM_L1x1_SUB2 - - -ZGEMM_L1x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x1_32K - addi BO,BO,-16 - addi AO,AO,-16 - LOAD1x1O 16,16 - END1x1_WITHOUT_ADD - LOAD1x1_2O 32, 32 - mtctr T8 - bl ZGEMM_L1x1_K32 - b ZGEMM_L1x1_SAVE - CMP1x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L1x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-32 - LOAD1x1_2O 32,32 - bl ZGEMM_L1x1_K32 - b ZGEMM_L1x1_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L1x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x1_SUB2_8 - bl ZGEMM_1x1_L16_SUB - MY_ALIGN - - -ZGEMM_L1x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x1_SUB2_4 - bl ZGEMM_1x1_L8_SUB - MY_ALIGN - - -ZGEMM_L1x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x1_SUB2_2 - LOAD1x1_2 - KERNEL1x1_L2 32,32, 0,0 - KERNEL1x1_E2 32,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x1_SUB2_1 - LOAD1x1_2 - KERNEL1x1_E2 32,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x1_SUB2_1: -/*----------------------------------------*/ - andi. 
T1,L, 1 - ble ZGEMM_L1x1_SAVE - KERNEL1x1 - - -ZGEMM_L1x1_SAVE: -/*----------------------------------------*/ - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 -#endif - - -ZGEMM_L1x1_END: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif - - -ZGEMM_L1_END: -/*----------------------------------------*/ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define MY_ALIGN .align 3 +b ZGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN +ZGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 +ZGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_L2 256,64,15,0 + KERNEL2x8_L2 256,64,16,0 + KERNEL2x8_L2 256,64,17,0 + KERNEL2x8_L2 256,64,18,0 + KERNEL2x8_L2 256,64,19,0 + KERNEL2x8_L2 256,64,20,0 + KERNEL2x8_L2 256,64,21,0 + KERNEL2x8_L2 256,64,22,0 + KERNEL2x8_L2 256,64,23,0 + KERNEL2x8_L2 256,64,24,0 + KERNEL2x8_L2 256,64,25,0 + KERNEL2x8_L2 256,64,26,0 + KERNEL2x8_L2 256,64,27,0 + KERNEL2x8_L2 256,64,28,0 + KERNEL2x8_L2 256,64,29,0 + KERNEL2x8_L2 256,64,30,0 + KERNEL2x8_L2 256,64,31,0 + KERNEL2x8_L2 256,64,32,0 + KERNEL2x8_L2 256,64,33,0 + KERNEL2x8_L2 256,64,34,0 + 
KERNEL2x8_L2 256,64,35,0 + KERNEL2x8_L2 256,64,36,0 + KERNEL2x8_L2 256,64,37,0 + KERNEL2x8_L2 256,64,38,0 + KERNEL2x8_L2 256,64,39,0 + KERNEL2x8_L2 256,64,40,0 + KERNEL2x8_L2 256,64,41,0 + KERNEL2x8_L2 256,64,42,0 + KERNEL2x8_L2 256,64,43,0 + KERNEL2x8_L2 256,64,44,0 + KERNEL2x8_L2 256,64,45,0 + KERNEL2x8_L2 256,64,46,0 + KERNEL2x8_L2 256,64,47,0 + KERNEL2x8_L2 256,64,48,0 + KERNEL2x8_L2 256,64,49,0 + KERNEL2x8_L2 256,64,50,0 + KERNEL2x8_L2 256,64,51,0 + KERNEL2x8_L2 256,64,52,0 + KERNEL2x8_L2 256,64,53,0 + KERNEL2x8_L2 256,64,54,0 + KERNEL2x8_L2 256,64,55,0 + KERNEL2x8_L2 256,64,56,0 + KERNEL2x8_L2 256,64,57,0 + KERNEL2x8_L2 256,64,58,0 + KERNEL2x8_L2 256,64,59,0 + KERNEL2x8_L2 256,64,60,0 + KERNEL2x8_L2 256,64,61,0 + KERNEL2x8_L2 256,64,62,0 + KERNEL2x8_L2 256,64,63,1 + bdnz ZGEMM_L2x8_LOOP + MY_ALIGN +ZGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + END2x8_2 + blr + MY_ALIGN + + +ZGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_L2 256,64,15,0 + KERNEL2x8_L2 256,64,16,0 + KERNEL2x8_L2 256,64,17,0 + KERNEL2x8_L2 256,64,18,0 + KERNEL2x8_L2 256,64,19,0 + KERNEL2x8_L2 256,64,20,0 + KERNEL2x8_L2 256,64,21,0 + KERNEL2x8_L2 256,64,22,0 + KERNEL2x8_L2 256,64,23,0 + KERNEL2x8_L2 256,64,24,0 + KERNEL2x8_L2 256,64,25,0 + KERNEL2x8_L2 256,64,26,0 + KERNEL2x8_L2 256,64,27,0 + KERNEL2x8_L2 256,64,28,0 + KERNEL2x8_L2 256,64,29,0 + KERNEL2x8_L2 256,64,30,0 + KERNEL2x8_E2 256,64,31,1 + blr + MY_ALIGN + + 
+ZGEMM_2x8_L32_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_E2 256,64,15,1 + blr + MY_ALIGN + + +ZGEMM_2x8_L16_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_E2 256,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +ZGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 128,64,0,0 +ZGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_L2 128,64,3,0 + KERNEL2x4_L2 128,64,4,0 + KERNEL2x4_L2 128,64,5,0 + KERNEL2x4_L2 128,64,6,0 + KERNEL2x4_L2 128,64,7,0 + KERNEL2x4_L2 128,64,8,0 + KERNEL2x4_L2 128,64,9,0 + KERNEL2x4_L2 128,64,10,0 + KERNEL2x4_L2 128,64,11,0 + KERNEL2x4_L2 128,64,12,0 + KERNEL2x4_L2 128,64,13,0 + KERNEL2x4_L2 128,64,14,0 + KERNEL2x4_L2 128,64,15,1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN +ZGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + END2x4_2 + blr + MY_ALIGN + + +ZGEMM_2x4_L16_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 128,64,0,0 + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_L2 
128,64,3,0 + KERNEL2x4_L2 128,64,4,0 + KERNEL2x4_L2 128,64,5,0 + KERNEL2x4_L2 128,64,6,0 + KERNEL2x4_E2 128,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 128,64,0,0 + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_E2 128,64,3,1 + blr + + +ZGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +ZGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 64,64,0,0 +ZGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_L2 64,64,3,0 + KERNEL2x2_L2 64,64,4,0 + KERNEL2x2_L2 64,64,5,0 + KERNEL2x2_L2 64,64,6,0 + KERNEL2x2_L2 64,64,7,0 + KERNEL2x2_L2 64,64,8,0 + KERNEL2x2_L2 64,64,9,0 + KERNEL2x2_L2 64,64,10,0 + KERNEL2x2_L2 64,64,11,0 + KERNEL2x2_L2 64,64,12,0 + KERNEL2x2_L2 64,64,13,0 + KERNEL2x2_L2 64,64,14,0 + KERNEL2x2_L2 64,64,15,1 + bdnz ZGEMM_L2x2_LOOP + MY_ALIGN + + +ZGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +ZGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 64,64,0,0 + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_L2 64,64,3,0 + KERNEL2x2_L2 64,64,4,0 + KERNEL2x2_L2 64,64,5,0 + KERNEL2x2_L2 64,64,6,0 + KERNEL2x2_E2 64,64,7,1 + blr + MY_ALIGN +ZGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 64,64,0,0 + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_E2 64,64,3,1 + blr + + +ZGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +ZGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 32,64,0,0 +ZGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_L2 32,64,3,0 + KERNEL2x1_L2 32,64,4,0 + KERNEL2x1_L2 32,64,5,0 + KERNEL2x1_L2 32,64,6,0 + KERNEL2x1_L2 
32,64,7,0 + KERNEL2x1_L2 32,64,8,0 + KERNEL2x1_L2 32,64,9,0 + KERNEL2x1_L2 32,64,10,0 + KERNEL2x1_L2 32,64,11,0 + KERNEL2x1_L2 32,64,12,0 + KERNEL2x1_L2 32,64,13,0 + KERNEL2x1_L2 32,64,14,0 + KERNEL2x1_L2 32,64,15,1 + bdnz ZGEMM_L2x1_LOOP + MY_ALIGN +ZGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +ZGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 32,64,0,0 + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_L2 32,64,3,0 + KERNEL2x1_L2 32,64,4,0 + KERNEL2x1_L2 32,64,5,0 + KERNEL2x1_L2 32,64,6,0 + KERNEL2x1_E2 32,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 32,64,0,0 + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_E2 32,64,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +ZGEMM_L2: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 1 + ble ZGEMM_L2_END + + +ZGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. 
T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble ZGEMM_L2x8_SUB0 + bl ZGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L2x8_SAVE + b ZGEMM_L2x8_SUB2 + + +ZGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8O 128,32 + END2x8_WITHOUT_ADD + LOAD2x8_2O 256, 64 + mtctr T8 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne ZGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-256 + LOAD2x8_2O 256,64 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + MY_ALIGN + + +ZGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L2x8_SUB2_32 + bl ZGEMM_2x8_L64_SUB + MY_ALIGN + + +ZGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L2x8_SUB2_16 + bl ZGEMM_2x8_L32_SUB + MY_ALIGN + + +ZGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x8_SUB2_8 + bl ZGEMM_2x8_L16_SUB + MY_ALIGN + + +ZGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x8_SUB2_4 + LOAD2x8_2 + KERNEL2x8_L2 256,64, 0,0 + KERNEL2x8_L2 256,64, 1,0 + KERNEL2x8_L2 256,64, 2,0 + KERNEL2x8_E2 256,64, 3,1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x8_SUB2_2 + LOAD2x8_2 + KERNEL2x8_L2 256,64, 0,0 + KERNEL2x8_E2 256,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x8_SUB2_1 + LOAD2x8_2 + KERNEL2x8_E2 256,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble ZGEMM_L2x8_SAVE + KERNEL2x8 + + +ZGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt ZGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END + b ZGEMM_L2x4_BEGIN + MY_ALIGN + + +ZGEMM_L2x8_END: +/*----------------------------------------*/ + + +ZGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble ZGEMM_L2x4_SUB0 + bl ZGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 + + +ZGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4O 64,32 + END2x4_WITHOUT_ADD + LOAD2x4_2O 128, 64 + mtctr T8 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD2x4_2O 128,64 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x4_SUB2_8 + bl ZGEMM_2x4_L16_SUB + MY_ALIGN + + +ZGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x4_SUB2_4 + bl ZGEMM_2x4_L8_SUB + MY_ALIGN + + +ZGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. 
T1,L, 4 + ble ZGEMM_L2x4_SUB2_2 + LOAD2x4_2 + KERNEL2x4_L2 128,64, 0,0 + KERNEL2x4_E2 128,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x4_SUB2_1 + LOAD2x4_2 + KERNEL2x4_E2 128,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x4_SAVE + KERNEL2x4 + + +ZGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif + + +ZGEMM_L2x4_END: +/*----------------------------------------*/ + + +ZGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble ZGEMM_L2x2_SUB0 + bl ZGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 + + +ZGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2O 32,32 + END2x2_WITHOUT_ADD + LOAD2x2_2O 64, 64 + mtctr T8 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD2x2_2O 64,64 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x2_SUB2_8 + bl ZGEMM_2x2_L16_SUB + MY_ALIGN + + +ZGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble ZGEMM_L2x2_SUB2_4 + bl ZGEMM_2x2_L8_SUB + MY_ALIGN + + +ZGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x2_SUB2_2 + LOAD2x2_2 + KERNEL2x2_L2 64,64, 0,0 + KERNEL2x2_E2 64,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x2_SUB2_1 + LOAD2x2_2 + KERNEL2x2_E2 64,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x2_SAVE + KERNEL2x2 + + +ZGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + + +ZGEMM_L2x2_END: +/*----------------------------------------*/ + + +ZGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble ZGEMM_L2x1_SUB0 + bl ZGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 + + +ZGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1O 16,32 + END2x1_WITHOUT_ADD + LOAD2x1_2O 32, 64 + mtctr T8 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD2x1_2O 32,64 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. 
T1,L, 16 + ble ZGEMM_L2x1_SUB2_8 + bl ZGEMM_2x1_L16_SUB + MY_ALIGN + + +ZGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x1_SUB2_4 + bl ZGEMM_2x1_L8_SUB + MY_ALIGN + + +ZGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 32,64, 0,0 + KERNEL2x1_E2 32,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 32,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x1_SAVE + KERNEL2x1 + + +ZGEMM_L2x1_SAVE: +/*----------------------------------------*/ + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif + + +ZGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + bgt ZGEMM_L2_BEGIN + + +ZGEMM_L2_END: + +b ZGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +ZGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 +ZGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_L2 256,32,15,0 + KERNEL1x8_L2 256,32,16,0 + KERNEL1x8_L2 256,32,17,0 + KERNEL1x8_L2 256,32,18,0 + 
KERNEL1x8_L2 256,32,19,0 + KERNEL1x8_L2 256,32,20,0 + KERNEL1x8_L2 256,32,21,0 + KERNEL1x8_L2 256,32,22,0 + KERNEL1x8_L2 256,32,23,0 + KERNEL1x8_L2 256,32,24,0 + KERNEL1x8_L2 256,32,25,0 + KERNEL1x8_L2 256,32,26,0 + KERNEL1x8_L2 256,32,27,0 + KERNEL1x8_L2 256,32,28,0 + KERNEL1x8_L2 256,32,29,0 + KERNEL1x8_L2 256,32,30,0 + KERNEL1x8_L2 256,32,31,0 + KERNEL1x8_L2 256,32,32,0 + KERNEL1x8_L2 256,32,33,0 + KERNEL1x8_L2 256,32,34,0 + KERNEL1x8_L2 256,32,35,0 + KERNEL1x8_L2 256,32,36,0 + KERNEL1x8_L2 256,32,37,0 + KERNEL1x8_L2 256,32,38,0 + KERNEL1x8_L2 256,32,39,0 + KERNEL1x8_L2 256,32,40,0 + KERNEL1x8_L2 256,32,41,0 + KERNEL1x8_L2 256,32,42,0 + KERNEL1x8_L2 256,32,43,0 + KERNEL1x8_L2 256,32,44,0 + KERNEL1x8_L2 256,32,45,0 + KERNEL1x8_L2 256,32,46,0 + KERNEL1x8_L2 256,32,47,0 + KERNEL1x8_L2 256,32,48,0 + KERNEL1x8_L2 256,32,49,0 + KERNEL1x8_L2 256,32,50,0 + KERNEL1x8_L2 256,32,51,0 + KERNEL1x8_L2 256,32,52,0 + KERNEL1x8_L2 256,32,53,0 + KERNEL1x8_L2 256,32,54,0 + KERNEL1x8_L2 256,32,55,0 + KERNEL1x8_L2 256,32,56,0 + KERNEL1x8_L2 256,32,57,0 + KERNEL1x8_L2 256,32,58,0 + KERNEL1x8_L2 256,32,59,0 + KERNEL1x8_L2 256,32,60,0 + KERNEL1x8_L2 256,32,61,0 + KERNEL1x8_L2 256,32,62,0 + KERNEL1x8_L2 256,32,63,1 + bdnz ZGEMM_L1x8_LOOP + MY_ALIGN +ZGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + + +ZGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_L2 256,32,15,0 + KERNEL1x8_L2 
256,32,16,0 + KERNEL1x8_L2 256,32,17,0 + KERNEL1x8_L2 256,32,18,0 + KERNEL1x8_L2 256,32,19,0 + KERNEL1x8_L2 256,32,20,0 + KERNEL1x8_L2 256,32,21,0 + KERNEL1x8_L2 256,32,22,0 + KERNEL1x8_L2 256,32,23,0 + KERNEL1x8_L2 256,32,24,0 + KERNEL1x8_L2 256,32,25,0 + KERNEL1x8_L2 256,32,26,0 + KERNEL1x8_L2 256,32,27,0 + KERNEL1x8_L2 256,32,28,0 + KERNEL1x8_L2 256,32,29,0 + KERNEL1x8_L2 256,32,30,0 + KERNEL1x8_E2 256,32,31,1 + blr + MY_ALIGN + + +ZGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_E2 256,32,15,1 + blr + MY_ALIGN + + +ZGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_E2 256,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN + + +ZGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 128,32,0,0 + + +ZGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_L2 128,32,3,0 + KERNEL1x4_L2 128,32,4,0 + KERNEL1x4_L2 128,32,5,0 + KERNEL1x4_L2 128,32,6,0 + KERNEL1x4_L2 128,32,7,0 + KERNEL1x4_L2 128,32,8,0 + KERNEL1x4_L2 128,32,9,0 + KERNEL1x4_L2 
128,32,10,0 + KERNEL1x4_L2 128,32,11,0 + KERNEL1x4_L2 128,32,12,0 + KERNEL1x4_L2 128,32,13,0 + KERNEL1x4_L2 128,32,14,0 + KERNEL1x4_L2 128,32,15,1 + bdnz ZGEMM_L1x4_LOOP + MY_ALIGN + + +ZGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +ZGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 128,32,0,0 + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_L2 128,32,3,0 + KERNEL1x4_L2 128,32,4,0 + KERNEL1x4_L2 128,32,5,0 + KERNEL1x4_L2 128,32,6,0 + KERNEL1x4_E2 128,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 128,32,0,0 + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_E2 128,32,3,1 + blr + + +ZGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN + + +ZGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 64,32,0,0 + + +ZGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_L2 64,32,3,0 + KERNEL1x2_L2 64,32,4,0 + KERNEL1x2_L2 64,32,5,0 + KERNEL1x2_L2 64,32,6,0 + KERNEL1x2_L2 64,32,7,0 + KERNEL1x2_L2 64,32,8,0 + KERNEL1x2_L2 64,32,9,0 + KERNEL1x2_L2 64,32,10,0 + KERNEL1x2_L2 64,32,11,0 + KERNEL1x2_L2 64,32,12,0 + KERNEL1x2_L2 64,32,13,0 + KERNEL1x2_L2 64,32,14,0 + KERNEL1x2_L2 64,32,15,1 + bdnz ZGEMM_L1x2_LOOP + MY_ALIGN + + +ZGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN + + +ZGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 64,32,0,0 + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_L2 64,32,3,0 + KERNEL1x2_L2 64,32,4,0 + KERNEL1x2_L2 64,32,5,0 + KERNEL1x2_L2 64,32,6,0 + KERNEL1x2_E2 64,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 64,32,0,0 + KERNEL1x2_L2 64,32,1,0 + 
KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_E2 64,32,3,1 + blr + + +ZGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN + + +ZGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 32,32,0,0 + + +ZGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_L2 32,32,3,0 + KERNEL1x1_L2 32,32,4,0 + KERNEL1x1_L2 32,32,5,0 + KERNEL1x1_L2 32,32,6,0 + KERNEL1x1_L2 32,32,7,0 + KERNEL1x1_L2 32,32,8,0 + KERNEL1x1_L2 32,32,9,0 + KERNEL1x1_L2 32,32,10,0 + KERNEL1x1_L2 32,32,11,0 + KERNEL1x1_L2 32,32,12,0 + KERNEL1x1_L2 32,32,13,0 + KERNEL1x1_L2 32,32,14,0 + KERNEL1x1_L2 32,32,15,1 + bdnz ZGEMM_L1x1_LOOP + MY_ALIGN + + +ZGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + MY_ALIGN + + +ZGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 32,32,0,0 + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_L2 32,32,3,0 + KERNEL1x1_L2 32,32,4,0 + KERNEL1x1_L2 32,32,5,0 + KERNEL1x1_L2 32,32,6,0 + KERNEL1x1_E2 32,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 32,32,0,0 + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_E2 32,32,3,1 + blr + + +/*----------------------N1 BEGINS---------*/ +ZGEMM_L1: +/*----------------------------------------*/ + andi. T1, N, 1 + ble ZGEMM_L1_END + +ZGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. 
I, M, 3 + ble ZGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble ZGEMM_L1x8_SUB0 + bl ZGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 + + +ZGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8O 128,16 + END1x8_WITHOUT_ADD + LOAD1x8_2O 256, 32 + mtctr T8 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne ZGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-256 + LOAD1x8_2O 256,32 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + MY_ALIGN + + +ZGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L1x8_SUB2_32 + bl ZGEMM_1x8_L64_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L1x8_SUB2_16 + bl ZGEMM_1x8_L32_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x8_SUB2_8 + bl ZGEMM_1x8_L16_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble ZGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 256,32, 0,0 + KERNEL1x8_L2 256,32, 1,0 + KERNEL1x8_L2 256,32, 2,0 + KERNEL1x8_E2 256,32, 3,1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x8_SUB2_2 + LOAD1x8_2 + KERNEL1x8_L2 256,32, 0,0 + KERNEL1x8_E2 256,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x8_SUB2_1 + LOAD1x8_2 + KERNEL1x8_E2 256,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x8_SAVE + KERNEL1x8 + + +ZGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt ZGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END + b ZGEMM_L1x4_BEGIN + MY_ALIGN + + +ZGEMM_L1x8_END: +/*----------------------------------------*/ + + +ZGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x4 + ble ZGEMM_L1x4_SUB0 + bl ZGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 + + +ZGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4O 64,16 + END1x4_WITHOUT_ADD + LOAD1x4_2O 128, 32 + mtctr T8 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD1x4_2O 128,32 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x4_SUB2_8 + bl ZGEMM_1x4_L16_SUB + MY_ALIGN + + +ZGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x4_SUB2_4 + bl ZGEMM_1x4_L8_SUB + MY_ALIGN + + +ZGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x4_SUB2_2 + LOAD1x4_2 + KERNEL1x4_L2 128,32, 0,0 + KERNEL1x4_E2 128,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x4_SUB2_1 + LOAD1x4_2 + KERNEL1x4_E2 128,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x4_SAVE + KERNEL1x4 + + +ZGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif + + +ZGEMM_L1x4_END: +/*----------------------------------------*/ + + +ZGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x2 + ble ZGEMM_L1x2_SUB0 + bl ZGEMM_1x2_LMAIN_SUB + andi. 
L, T1, 31 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 + + +ZGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2O 32,16 + END1x2_WITHOUT_ADD + LOAD1x2_2O 64, 32 + mtctr T8 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD1x2_2O 64,32 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x2_SUB2_8 + bl ZGEMM_1x2_L16_SUB + MY_ALIGN + + +ZGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x2_SUB2_4 + bl ZGEMM_1x2_L8_SUB + MY_ALIGN + + +ZGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x2_SUB2_2 + LOAD1x2_2 + KERNEL1x2_L2 64,32, 0,0 + KERNEL1x2_E2 64,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x2_SUB2_1 + LOAD1x2_2 + KERNEL1x2_E2 64,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x2_SAVE + KERNEL1x2 + + +ZGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + + +ZGEMM_L1x2_END: +/*----------------------------------------*/ + + +ZGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. 
T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x1 + ble ZGEMM_L1x1_SUB0 + bl ZGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 + + +ZGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1O 16,16 + END1x1_WITHOUT_ADD + LOAD1x1_2O 32, 32 + mtctr T8 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD1x1_2O 32,32 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x1_SUB2_8 + bl ZGEMM_1x1_L16_SUB + MY_ALIGN + + +ZGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x1_SUB2_4 + bl ZGEMM_1x1_L8_SUB + MY_ALIGN + + +ZGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 32,32, 0,0 + KERNEL1x1_E2 32,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 32,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble ZGEMM_L1x1_SAVE + KERNEL1x1 + + +ZGEMM_L1x1_SAVE: +/*----------------------------------------*/ + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif + + +ZGEMM_L1x1_END: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + + +ZGEMM_L1_END: +/*----------------------------------------*/ \ No newline at end of file diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S index 8670e9574e..68024b826d 100644 --- a/kernel/power/zgemm_macros_power9.S +++ b/kernel/power/zgemm_macros_power9.S @@ -1,1825 +1,1825 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#define unit_size 16 -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) -#define DISPX(disp) (disp) -/* HELPERS FOR SAVE */ -/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */ - - -.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET -#ifndef TRMMKERNEL - lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) - lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG) - xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2 - xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 -#endif -.endm -/*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ - - -.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 - xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ - xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ -.endm -/*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ - - -.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 - xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ - xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ -.endm -/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ - - -.macro 
AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2 -#else // CC || CR || RC || RR - /*we will assume {-alpha_r,-alpha_i} for this case */ - /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ - xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1 - /*we will negate alpha image instead instead to fix sign*/ - xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#endif -.endm -/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */ - - -.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 -#ifndef TRMMKERNEL - xvmsubadp \VSOUT1,\VSINII, alpha_i - xvmaddadp \VSOUT2,\VSINRR, alpha_i -#else - xvmuldp \VSOUT1,\VSINII, alpha_i - xvmuldp \VSOUT2,\VSINRR, alpha_i -#endif -.endm -/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ - - -.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 - xvmsubadp \VSOUT1,\VSINRR, alpha_r - xvmaddadp \VSOUT2,\VSINII, alpha_r -.endm -/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */ - - -.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 - xxmrghd \VSOUT1,\VSIN2,\VSIN1 - xxmrgld \VSOUT2,\VSIN2,\VSIN1 -.endm - - -.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2 - stxv \VSIN1, DISPX(\LOFFSET)(\REG) - stxv \VSIN2, DISPX(\LOFFSET+16)(\REG) -.endm - - -.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 - LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET - RESULT_INTO_REALIMAG_IMAGREAL 
\VSRes2,\VSRes4,vs4,vs5 - LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 - LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET +64) - RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 - LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96) - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs10,vs11 - AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 - RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13 - AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2 - MULT_APLHA_PART1 vs2,vs4, vs14,vs15 - RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4 - MULT_APLHA_PART1 vs6,vs8,vs16,vs17 - MULT_APLHA_PART2 vs2,vs4,vs14,vs15 - AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13 - MULT_APLHA_PART2 vs6,vs8,vs16,vs17 - AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4 - UNPACK_FOR_STORE vs14,vs15,vs7,vs9 - MULT_APLHA_PART1 vs10,vs12, vs24,vs25 - UNPACK_FOR_STORE vs16,vs17,vs3,vs5 - MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27 - STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 - MULT_APLHA_PART2 vs10,vs12,vs24,vs25 - STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 - MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27 - UNPACK_FOR_STORE vs24,vs25,vs10,vs12 - UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3 - STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12 - STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 -.endm - - -.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 - LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET - RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 - LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 - RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 - AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 - AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 - 
MULT_APLHA_PART1 vs2,vs4, vs14,vs15 - MULT_APLHA_PART1 vs6,vs8, vs16,vs17 - MULT_APLHA_PART2 vs2,vs4, vs14,vs15 - MULT_APLHA_PART2 vs6,vs8,vs16,vs17 - UNPACK_FOR_STORE vs14,vs15,vs7,vs9 - UNPACK_FOR_STORE vs16,vs17,vs3,vs5 - STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 - STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 -.endm - - - -.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 - LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET - RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 - AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 - MULT_APLHA_PART1 vs2,vs4, vs14,vs15 - MULT_APLHA_PART2 vs2,vs4, vs14,vs15 - UNPACK_FOR_STORE vs14,vs15,vs7,vs9 - STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 -.endm - - - -.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3 -#ifndef TRMMKERNEL - lxv vs18, (\LOFFSET)(\BASE_REG) - xxmrgld vs14,vs18,vs18 - xxmrghd vs15,vs18,vs18 -#endif - RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5 - AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 - MULT_APLHA_PART1 vs2,vs4, vs14,vs15 - MULT_APLHA_PART2 vs2,vs4, vs14,vs15 - UNPACK_FOR_STORE vs14,vs15,vs7,vs9 - xxmrghd vs7,vs15,vs14 - stxv vs7, (\LOFFSET)(\BASE_REG) -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=8 -**********************************************************************************************/ - -.macro Zero2x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - 
xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endm - - -.macro LOAD2x8 - LOAD2x8O 0,0 -.endm - - -.macro LOAD2x8O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END2x8_NORMAL - END2x8 AO,BO,128,32 -.endm - - -.macro END2x8_WITHOUT_ADD - END2x8 AO,BO,0,0 -.endm - - -.macro END2x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs48, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs49, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs50, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs51, vs1, vs19 - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs52, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs53, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs54, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs55, vs3, vs19 - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs56, vs4, vs18 - xvmaddadp vs41, vs4, vs17 - xvmaddadp vs57, vs4, vs19 - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs58, vs5, vs18 - xvmaddadp vs43, vs5, vs17 - xvmaddadp vs59, vs5, vs19 - xvmaddadp vs44, vs6, vs16 - xvmaddadp 
vs60, vs6, vs18 - xvmaddadp vs45, vs6, vs17 - xvmaddadp vs61, vs6, vs19 - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs62, vs7, vs18 - xvmaddadp vs47, vs7, vs17 - xvmaddadp vs63, vs7, vs19 -.endm - - -.macro LOAD2x8_2 - LOAD2x8_2O 0,0 -.endm - - -.macro LOAD2x8_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A - lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A - lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A - lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A - lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A - lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A - lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A - lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x8_2 - /*for load2 offset will be 256 and 64*/ - KERNEL2x8_2 AO,BO, 256,64,0 ,1,1 -.endm - - - -.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs48, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs49, vs0, vs19 - xxswapd vs21, 
vs20 - xxswapd vs23, vs22 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs50, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs51, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs52, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs53, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs54, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs55, vs3, vs19 -.if \Complete==0 - lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs56, vs4, vs18 - xvmaddadp vs41, vs4, vs17 - xvmaddadp vs57, vs4, vs19 - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs58, vs5, vs18 - xvmaddadp vs43, vs5, vs17 - xvmaddadp vs59, vs5, vs19 -.if \Complete==0 - lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs60, vs6, vs18 - xvmaddadp vs45, vs6, vs17 - xvmaddadp vs61, vs6, vs19 - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs62, vs7, vs18 - xvmaddadp vs47, vs7, vs17 - xvmaddadp vs63, vs7, vs19 -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs48, vs8, vs22 -.if \Complete==0 - lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs49, vs8, vs23 -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs50, vs9, vs22 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs51, vs9, vs23 -.if \Complete==0 - lxv vs8, 
DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs52, vs10, vs22 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs53, vs10, vs23 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs54, vs11, vs22 - xvmaddadp vs39, vs11, vs21 - xvmaddadp vs55, vs11, vs23 -.if \Complete==0 - lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs12, vs20 - xvmaddadp vs56, vs12, vs22 - xvmaddadp vs41, vs12, vs21 - xvmaddadp vs57, vs12, vs23 - xvmaddadp vs42, vs13, vs20 - xvmaddadp vs58, vs13, vs22 - xvmaddadp vs43, vs13, vs21 - xvmaddadp vs59, vs13, vs23 -.if \Complete==0 - lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs14, vs20 - xvmaddadp vs60, vs14, vs22 - xvmaddadp vs45, vs14, vs21 - xvmaddadp vs61, vs14, vs23 - xvmaddadp vs46, vs15, vs20 - xvmaddadp vs62, vs15, vs22 - xvmaddadp vs47, vs15, vs21 - xvmaddadp vs63, vs15, vs23 -.if \Complete==0 - lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - - - -.macro KERNEL2x8 - LOAD2x8 - END2x8 AO, BO, 128,32 -.endm - - -.macro SAVE2x8 - add T1, CO ,LDC - SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 - SAVE8 
vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 - addi CO, CO, 128 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=4 -**********************************************************************************************/ - - -.macro Zero2x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 -.endm - - -.macro LOAD2x4 - LOAD2x4O 0,0 -.endm - - -.macro LOAD2x4O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x4_NORMAL - END2x4 AO,BO,64,32 -.endm - - -.macro END2x4_WITHOUT_ADD - END2x4 AO,BO,0,0 -.endm - - -.macro END2x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs40, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs41, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs42, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs43, vs1, vs19 - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs44, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs45, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs46, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs47, vs3, vs19 - -.endm - - -.macro LOAD2x4_2 - LOAD2x4_2O 0,0 
-.endm - - -.macro LOAD2x4_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs8, (64+\OffsetA)(AO) // load real,imag from A - lxv vs9, (80+\OffsetA)(AO) // load real,imag from A - lxv vs10, (96+\OffsetA)(AO) // load real,imag from A - lxv vs11, (112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x4_2 - /*for load2 offset will be 128 and 64*/ - KERNEL2x4_2 AO,BO, 128,64,0 ,1,1 -.endm - - - -.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs40, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs41, vs0, vs19 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs42, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs43, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs44, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs45, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs46, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs47, vs3, vs19 -.if \Complete==0 - lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - 
-.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs40, vs8, vs22 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs41, vs8, vs23 -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs42, vs9, vs22 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs43, vs9, vs23 -.if \Complete==0 - lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs44, vs10, vs22 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs45, vs10, vs23 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs46, vs11, vs22 - xvmaddadp vs39, vs11, vs21 - xvmaddadp vs47, vs11, vs23 -.if \Complete==0 - lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - -.macro KERNEL2x4 - LOAD2x4 - END2x4 AO, BO, 64,32 -.endm - - - -.macro SAVE2x4 - add T1, CO ,LDC - SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 - SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 - addi CO, CO, 64 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=2 -**********************************************************************************************/ - - -.macro Zero2x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - 
xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - -.endm - - -.macro LOAD2x2 - LOAD2x2O 0,0 -.endm - - -.macro LOAD2x2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END2x2_NORMAL - END2x2 AO,BO,32,32 -.endm - - -.macro END2x2_WITHOUT_ADD - END2x2 AO,BO,0,0 -.endm - - -.macro END2x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs36, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs37, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs38, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs39, vs1, vs19 - -.endm - - -.macro LOAD2x2_2 - LOAD2x2_2O 0,0 -.endm - - -.macro LOAD2x2_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs8, (32+\OffsetA)(AO) // load real,imag from A - lxv vs9, (48+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END2x2_2 - /*for load2 offset will be 64 and 64*/ - KERNEL2x2_2 AO,BO, 64,64,0 ,1,1 -.endm - - - -.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, 
vs0, vs16 - xvmaddadp vs36, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs37, vs0, vs19 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs38, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs39, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs36, vs8, vs22 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs37, vs8, vs23 -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs38, vs9, vs22 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs39, vs9, vs23 -.if \Complete==0 - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 - lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - -.macro KERNEL2x2 - LOAD2x2 - END2x2 AO, BO, 32,32 -.endm - - - -.macro SAVE2x2 - add T1, CO ,LDC - SAVE2 vs32,vs33,vs34,vs35,CO,0 - SAVE2 vs36,vs37,vs38,vs39,T1,0 - addi CO, CO, 32 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=1 -**********************************************************************************************/ - - - -.macro Zero2x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - 
-.endm - - -.macro LOAD2x1 - LOAD2x1O 0,0 -.endm - - -.macro LOAD2x1O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x1_NORMAL - END2x1 AO,BO,16,32 -.endm - - -.macro END2x1_WITHOUT_ADD - END2x1 AO,BO,0,0 -.endm - - -.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs34, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs35, vs0, vs19 -.endm - - -.macro LOAD2x1_2 - LOAD2x1_2O 0,0 -.endm - - -.macro LOAD2x1_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs8, (16+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x1_2 - /*for load2 offset will be 32 and 64*/ - KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 -.endm - - - -.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs34, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs35, vs0, vs19 -.if \Complete==0 - lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if 
\Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs34, vs8, vs22 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs35, vs8, vs23 -.if \Complete==0 - lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - -.macro KERNEL2x1 - LOAD2x1 - END2x1 AO, BO, 16,32 -.endm - - - -.macro SAVE2x1 - add T1, CO ,LDC - SAVE1 vs32,vs33,CO,0 - SAVE1 vs34,vs35,T1,0 - addi CO, CO, 16 -.endm - -/********************************************************************************************** -* - -.macros for N=1 and M=8 -**********************************************************************************************/ - - -.macro Zero1x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 -.endm - - -.macro LOAD1x8 - LOAD1x8O 0,0 -.endm - - -.macro LOAD1x8O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - xxswapd vs17, vs16 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // 
load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END1x8_NORMAL - END1x8 AO,BO,128,16 -.endm - - -.macro END1x8_WITHOUT_ADD - END1x8 AO,BO,0,0 -.endm - - -.macro END1x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 - - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs41, vs4, vs17 - - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs43, vs5, vs17 - - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs45, vs6, vs17 - - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs47, vs7, vs17 - -.endm - - -.macro LOAD1x8_2 - LOAD1x8_2O 0,0 -.endm - - -.macro LOAD1x8_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A - lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A - lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A - lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A - lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A - lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A - lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A - lxv vs15, (128+112+\OffsetA)(AO) // load real,imag 
from A -.endm - - -.macro END1x8_2 - /*for load2 offset will be 256 and 32*/ - KERNEL1x8_2 AO,BO, 256,32,0 ,1,1 -.endm - - - -.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 -.if \Complete==0 - lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs41, vs4, vs17 - - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs43, vs5, vs17 -.if \Complete==0 - lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs45, vs6, vs17 - - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs47, vs7, vs17 -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // 
load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs39, vs11, vs21 -.if \Complete==0 - lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs12, vs20 - xvmaddadp vs41, vs12, vs21 - xvmaddadp vs42, vs13, vs20 - xvmaddadp vs43, vs13, vs21 -.if \Complete==0 - lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs14, vs20 - xvmaddadp vs45, vs14, vs21 - xvmaddadp vs46, vs15, vs20 - xvmaddadp vs47, vs15, vs21 -.if \Complete==0 - lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - - - -.macro KERNEL1x8 - LOAD1x8 - END1x8 AO, BO, 128,16 -.endm - - -.macro SAVE1x8 - SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 - addi CO, CO, 128 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=4 -**********************************************************************************************/ - - -.macro Zero1x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 -.endm - - -.macro LOAD1x4 - LOAD1x4O 0,0 -.endm - - -.macro LOAD1x4O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // 
load real imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END1x4_NORMAL - END1x4 AO,BO,64,16 -.endm - - -.macro END1x4_WITHOUT_ADD - END1x4 AO,BO,0,0 -.endm - - -.macro END1x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 - -.endm - - -.macro LOAD1x4_2 - LOAD1x4_2O 0,0 -.endm - - -.macro LOAD1x4_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs8, (64+\OffsetA)(AO) // load real,imag from A - lxv vs9, (80+\OffsetA)(AO) // load real,imag from A - lxv vs10, (96+\OffsetA)(AO) // load real,imag from A - lxv vs11, (112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x4_2 - /*for load2 offset will be 128 and 32*/ - KERNEL1x4_2 AO,BO, 128,32,0 ,1,1 -.endm - - - -.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 -.if 
\Complete==0 - lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 -.if \Complete==0 - lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs39, vs11, vs21 -.if \Complete==0 - lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - -.macro KERNEL1x4 - LOAD1x4 - END1x4 AO, BO, 64,16 -.endm - - - -.macro SAVE1x4 - SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 - addi CO, CO, 64 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=2 -**********************************************************************************************/ - - -.macro Zero1x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, 
vs35, vs35 - -.endm - - -.macro LOAD1x2 - LOAD1x2O 0,0 -.endm - - -.macro LOAD1x2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END1x2_NORMAL - END1x2 AO,BO,32,16 -.endm - - -.macro END1x2_WITHOUT_ADD - END1x2 AO,BO,0,0 -.endm - - -.macro END1x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - -.endm - - -.macro LOAD1x2_2 - LOAD1x2_2O 0,0 -.endm - - -.macro LOAD1x2_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs8, (32+\OffsetA)(AO) // load real,imag from A - lxv vs9, (48+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x2_2 - /*for load2 offset will be 64 and 32*/ - KERNEL1x2_2 AO,BO, 64,32,0 ,1,1 -.endm - - - -.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 
-.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 - lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - -.macro KERNEL1x2 - LOAD1x2 - END1x2 AO, BO, 32,16 -.endm - - - -.macro SAVE1x2 - SAVE2 vs32,vs33,vs34,vs35,CO,0 - addi CO, CO, 32 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=1 -**********************************************************************************************/ - - - -.macro Zero1x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 -.endm - - -.macro LOAD1x1 - LOAD1x1O 0,0 -.endm - - -.macro LOAD1x1O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - xxswapd vs17, vs16 - -.endm - - -.macro END1x1_NORMAL - END1x1 AO,BO,16,16 -.endm - - -.macro END1x1_WITHOUT_ADD - END1x1 AO,BO,0,0 -.endm - - -.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 -.endm - - -.macro LOAD1x1_2 - LOAD1x1_2O 0,0 -.endm - - -.macro LOAD1x1_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs8, (16+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x1_2 - /*for load2 offset will be 32 and 32*/ 
- KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 -.endm - - - -.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xxswapd vs21, vs20 - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 -.if \Complete==0 - lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - -.macro KERNEL1x1 - LOAD1x1 - END1x1 AO, BO, 16,16 -.endm - - - -.macro SAVE1x1 - SAVE1 vs32,vs33,CO,0 - addi CO, CO, 16 -.endm - -/****************************TRMM POINTER REFRESH - -.macroSES*************************/ - - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 8 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 7 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 4 - .endif -.endm -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*16; -// ptrbb = bb + off*2; -// #endif -*/ - - -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb 
= bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+16; // number of values in A -// #else -// temp = off+2; // number of values in B -// #endif -*/ - - -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 16; // number of values in A -// #else -// temp -= 2; // number of values in B -// #endif -// ptrba += temp*16; -// ptrbb += temp*2; -// #endif -// #ifdef LEFT -// off += 16; // number of values in A -// #endif -*/ - - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - #endif - #ifdef LEFT - /*off += 8; // number of values in 
A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define unit_size 16 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) +#define DISPX(disp) (disp) +/* HELPERS FOR SAVE */ +/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */ + + +.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET +#ifndef TRMMKERNEL + lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) + lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG) + xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2 + xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 +#endif +.endm +/*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ + + +.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ + xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ +.endm +/*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ + + +.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ + xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ +.endm +/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ + + +.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2 +#else // CC || CR || RC || RR + /*we will assume 
{-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1 + /*we will negate alpha image instead instead to fix sign*/ + xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#endif +.endm +/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */ + + +.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 +#ifndef TRMMKERNEL + xvmsubadp \VSOUT1,\VSINII, alpha_i + xvmaddadp \VSOUT2,\VSINRR, alpha_i +#else + xvmuldp \VSOUT1,\VSINII, alpha_i + xvmuldp \VSOUT2,\VSINRR, alpha_i +#endif +.endm +/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ + + +.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 + xvmsubadp \VSOUT1,\VSINRR, alpha_r + xvmaddadp \VSOUT2,\VSINII, alpha_r +.endm +/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */ + + +.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrghd \VSOUT1,\VSIN2,\VSIN1 + xxmrgld \VSOUT2,\VSIN2,\VSIN1 +.endm + + +.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2 + stxv \VSIN1, DISPX(\LOFFSET)(\REG) + stxv \VSIN2, DISPX(\LOFFSET+16)(\REG) +.endm + + +.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 + LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 + LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET +64) + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 + LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs10,vs11 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13 + AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 + 
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4 + MULT_APLHA_PART1 vs6,vs8,vs16,vs17 + MULT_APLHA_PART2 vs2,vs4,vs14,vs15 + AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13 + MULT_APLHA_PART2 vs6,vs8,vs16,vs17 + AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + MULT_APLHA_PART1 vs10,vs12, vs24,vs25 + UNPACK_FOR_STORE vs16,vs17,vs3,vs5 + MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 + MULT_APLHA_PART2 vs10,vs12,vs24,vs25 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 + MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27 + UNPACK_FOR_STORE vs24,vs25,vs10,vs12 + UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3 + STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12 + STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 +.endm + + +.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 + LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART1 vs6,vs8, vs16,vs17 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs6,vs8,vs16,vs17 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + UNPACK_FOR_STORE vs16,vs17,vs3,vs5 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 +.endm + + + +.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL 
\VSRes2,\VSRes4,vs4,vs5 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 +.endm + + + +.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3 +#ifndef TRMMKERNEL + lxv vs18, (\LOFFSET)(\BASE_REG) + xxmrgld vs14,vs18,vs18 + xxmrghd vs15,vs18,vs18 +#endif + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + xxmrghd vs7,vs15,vs14 + stxv vs7, (\LOFFSET)(\BASE_REG) +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=8 +**********************************************************************************************/ + +.macro Zero2x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + + +.macro LOAD2x8 + LOAD2x8O 0,0 +.endm + + +.macro LOAD2x8O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + 
xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END2x8_NORMAL + END2x8 AO,BO,128,32 +.endm + + +.macro END2x8_WITHOUT_ADD + END2x8 AO,BO,0,0 +.endm + + +.macro END2x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs48, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs49, vs0, vs19 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs51, vs1, vs19 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs53, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs55, vs3, vs19 + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs57, vs4, vs19 + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs59, vs5, vs19 + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs61, vs6, vs19 + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 + xvmaddadp vs47, vs7, vs17 + xvmaddadp vs63, vs7, vs19 +.endm + + +.macro LOAD2x8_2 + LOAD2x8_2O 0,0 +.endm + + +.macro LOAD2x8_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, 
vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A + lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A + lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A + lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A + lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A + lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A + lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A + lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x8_2 + /*for load2 offset will be 256 and 64*/ + KERNEL2x8_2 AO,BO, 256,64,0 ,1,1 +.endm + + + +.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs48, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs49, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs51, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs53, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs55, vs3, vs19 
+.if \Complete==0 + lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs57, vs4, vs19 + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs59, vs5, vs19 +.if \Complete==0 + lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs61, vs6, vs19 + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 + xvmaddadp vs47, vs7, vs17 + xvmaddadp vs63, vs7, vs19 +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs48, vs8, vs22 +.if \Complete==0 + lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs49, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs50, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs51, vs9, vs23 +.if \Complete==0 + lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs52, vs10, vs22 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs53, vs10, vs23 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs54, vs11, vs22 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs55, vs11, vs23 +.if \Complete==0 + lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag 
from A +.endif + xvmaddadp vs40, vs12, vs20 + xvmaddadp vs56, vs12, vs22 + xvmaddadp vs41, vs12, vs21 + xvmaddadp vs57, vs12, vs23 + xvmaddadp vs42, vs13, vs20 + xvmaddadp vs58, vs13, vs22 + xvmaddadp vs43, vs13, vs21 + xvmaddadp vs59, vs13, vs23 +.if \Complete==0 + lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs14, vs20 + xvmaddadp vs60, vs14, vs22 + xvmaddadp vs45, vs14, vs21 + xvmaddadp vs61, vs14, vs23 + xvmaddadp vs46, vs15, vs20 + xvmaddadp vs62, vs15, vs22 + xvmaddadp vs47, vs15, vs21 + xvmaddadp vs63, vs15, vs23 +.if \Complete==0 + lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + + + +.macro KERNEL2x8 + LOAD2x8 + END2x8 AO, BO, 128,32 +.endm + + +.macro SAVE2x8 + add T1, CO ,LDC + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 + addi CO, CO, 128 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=4 +**********************************************************************************************/ + + +.macro Zero2x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, 
vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + + +.macro LOAD2x4 + LOAD2x4O 0,0 +.endm + + +.macro LOAD2x4O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x4_NORMAL + END2x4 AO,BO,64,32 +.endm + + +.macro END2x4_WITHOUT_ADD + END2x4 AO,BO,0,0 +.endm + + +.macro END2x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs41, vs0, vs19 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs43, vs1, vs19 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs45, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs47, vs3, vs19 + +.endm + + +.macro LOAD2x4_2 + LOAD2x4_2O 0,0 +.endm + + +.macro LOAD2x4_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs8, (64+\OffsetA)(AO) // load real,imag from A + 
lxv vs9, (80+\OffsetA)(AO) // load real,imag from A + lxv vs10, (96+\OffsetA)(AO) // load real,imag from A + lxv vs11, (112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x4_2 + /*for load2 offset will be 128 and 64*/ + KERNEL2x4_2 AO,BO, 128,64,0 ,1,1 +.endm + + + +.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs41, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs43, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs45, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs47, vs3, vs19 +.if \Complete==0 + lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs40, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs41, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs42, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs43, vs9, vs23 +.if \Complete==0 + lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,64+16 
+ \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs44, vs10, vs22 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs45, vs10, vs23 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs46, vs11, vs22 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs47, vs11, vs23 +.if \Complete==0 + lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + +.macro KERNEL2x4 + LOAD2x4 + END2x4 AO, BO, 64,32 +.endm + + + +.macro SAVE2x4 + add T1, CO ,LDC + SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 + SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 + addi CO, CO, 64 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=2 +**********************************************************************************************/ + + +.macro Zero2x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + +.endm + + +.macro LOAD2x2 + LOAD2x2O 0,0 +.endm + + +.macro LOAD2x2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END2x2_NORMAL + END2x2 AO,BO,32,32 +.endm + + +.macro END2x2_WITHOUT_ADD + END2x2 
AO,BO,0,0 +.endm + + +.macro END2x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs37, vs0, vs19 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs39, vs1, vs19 + +.endm + + +.macro LOAD2x2_2 + LOAD2x2_2O 0,0 +.endm + + +.macro LOAD2x2_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs8, (32+\OffsetA)(AO) // load real,imag from A + lxv vs9, (48+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END2x2_2 + /*for load2 offset will be 64 and 64*/ + KERNEL2x2_2 AO,BO, 64,64,0 ,1,1 +.endm + + + +.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs37, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs39, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B 
+.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs36, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs37, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs38, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs39, vs9, vs23 +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + +.macro KERNEL2x2 + LOAD2x2 + END2x2 AO, BO, 32,32 +.endm + + + +.macro SAVE2x2 + add T1, CO ,LDC + SAVE2 vs32,vs33,vs34,vs35,CO,0 + SAVE2 vs36,vs37,vs38,vs39,T1,0 + addi CO, CO, 32 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=1 +**********************************************************************************************/ + + + +.macro Zero2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + + +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + + +.macro LOAD2x1O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_NORMAL + END2x1 AO,BO,16,32 +.endm + + +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + 
xvmaddadp vs32, vs0, vs16 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs35, vs0, vs19 +.endm + + +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm + + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs8, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_2 + /*for load2 offset will be 32 and 64*/ + KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 +.endm + + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs35, vs0, vs19 +.if \Complete==0 + lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs34, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs35, vs8, vs23 +.if \Complete==0 + lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi 
\AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 16,32 +.endm + + + +.macro SAVE2x1 + add T1, CO ,LDC + SAVE1 vs32,vs33,CO,0 + SAVE1 vs34,vs35,T1,0 + addi CO, CO, 16 +.endm + +/********************************************************************************************** +* + +.macros for N=1 and M=8 +**********************************************************************************************/ + + +.macro Zero1x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 +.endm + + +.macro LOAD1x8 + LOAD1x8O 0,0 +.endm + + +.macro LOAD1x8O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END1x8_NORMAL + END1x8 AO,BO,128,16 +.endm + + +.macro END1x8_WITHOUT_ADD + END1x8 AO,BO,0,0 +.endm + + +.macro END1x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, 
vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 + + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 + + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 + + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 + +.endm + + +.macro LOAD1x8_2 + LOAD1x8_2O 0,0 +.endm + + +.macro LOAD1x8_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A + lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A + lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A + lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A + lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A + lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A + lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A + lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x8_2 + /*for load2 offset will be 256 and 32*/ + KERNEL1x8_2 AO,BO, 256,32,0 ,1,1 +.endm + + + +.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 +.if 
\Complete==0 + lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 +.if \Complete==0 + lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 + + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 +.if \Complete==0 + lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 + + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 +.if \Complete==0 + lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs12, vs20 + xvmaddadp vs41, vs12, vs21 + xvmaddadp vs42, vs13, vs20 + xvmaddadp vs43, vs13, vs21 +.if \Complete==0 + lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,192 +16 + 
\OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs14, vs20 + xvmaddadp vs45, vs14, vs21 + xvmaddadp vs46, vs15, vs20 + xvmaddadp vs47, vs15, vs21 +.if \Complete==0 + lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + + + +.macro KERNEL1x8 + LOAD1x8 + END1x8 AO, BO, 128,16 +.endm + + +.macro SAVE1x8 + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + addi CO, CO, 128 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=4 +**********************************************************************************************/ + + +.macro Zero1x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 +.endm + + +.macro LOAD1x4 + LOAD1x4O 0,0 +.endm + + +.macro LOAD1x4O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END1x4_NORMAL + END1x4 AO,BO,64,16 +.endm + + +.macro END1x4_WITHOUT_ADD + END1x4 AO,BO,0,0 +.endm + + +.macro END1x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + 
xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + +.endm + + +.macro LOAD1x4_2 + LOAD1x4_2O 0,0 +.endm + + +.macro LOAD1x4_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs8, (64+\OffsetA)(AO) // load real,imag from A + lxv vs9, (80+\OffsetA)(AO) // load real,imag from A + lxv vs10, (96+\OffsetA)(AO) // load real,imag from A + lxv vs11, (112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x4_2 + /*for load2 offset will be 128 and 32*/ + KERNEL1x4_2 AO,BO, 128,32,0 ,1,1 +.endm + + + +.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 +.if \Complete==0 + lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif + xvmaddadp vs32, vs8, vs20 + 
xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 +.if \Complete==0 + lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x4 + LOAD1x4 + END1x4 AO, BO, 64,16 +.endm + + + +.macro SAVE1x4 + SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 + addi CO, CO, 64 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=2 +**********************************************************************************************/ + + +.macro Zero1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + + +.macro LOAD1x2 + LOAD1x2O 0,0 +.endm + + +.macro LOAD1x2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END1x2_NORMAL + END1x2 AO,BO,32,16 +.endm + + +.macro END1x2_WITHOUT_ADD + END1x2 AO,BO,0,0 +.endm + + +.macro END1x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp 
vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + +.endm + + +.macro LOAD1x2_2 + LOAD1x2_2O 0,0 +.endm + + +.macro LOAD1x2_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs8, (32+\OffsetA)(AO) // load real,imag from A + lxv vs9, (48+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x2_2 + /*for load2 offset will be 64 and 32*/ + KERNEL1x2_2 AO,BO, 64,32,0 ,1,1 +.endm + + + +.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, 
DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x2 + LOAD1x2 + END1x2 AO, BO, 32,16 +.endm + + + +.macro SAVE1x2 + SAVE2 vs32,vs33,vs34,vs35,CO,0 + addi CO, CO, 32 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=1 +**********************************************************************************************/ + + + +.macro Zero1x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 +.endm + + +.macro LOAD1x1 + LOAD1x1O 0,0 +.endm + + +.macro LOAD1x1O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + xxswapd vs17, vs16 + +.endm + + +.macro END1x1_NORMAL + END1x1 AO,BO,16,16 +.endm + + +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 +.endm + + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro LOAD1x1_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs8, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x1_2 + /*for load2 offset will be 32 and 32*/ + KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 +.endm + + + +.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs21, vs20 + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 +.if \Complete==0 + lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, 
DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 16,16 +.endm + + + +.macro SAVE1x1 + SAVE1 vs32,vs33,CO,0 + addi CO, CO, 16 +.endm + +/****************************TRMM POINTER REFRESH + +.macroSES*************************/ + + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 8 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 4 + .endif +.endm +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ + + +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = 
off+2; // number of values in B +// #endif +*/ + + +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + #endif + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/riscv64/dot.c b/kernel/riscv64/dot.c index 46a84ad189..bf55998ca9 100644 --- a/kernel/riscv64/dot.c +++ b/kernel/riscv64/dot.c @@ -46,7 +46,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG ix=0,iy=0; double dot = 0.0 ; - if ( n < 0 ) return(dot); + if ( n < 1 ) return(dot); while(i < n) { diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 8bcd31ef28..14a339e758 100644 --- a/kernel/setparam-ref.c +++ 
b/kernel/setparam-ref.c @@ -62,6 +62,8 @@ gotoblas_t TABLE_NAME = { MAX(SBGEMM_DEFAULT_UNROLL_M, SBGEMM_DEFAULT_UNROLL_N), #endif + SBGEMM_ALIGN_K, + sbstobf16_kTS, sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS, samax_kTS, samin_kTS, smax_kTS, smin_kTS, @@ -135,9 +137,14 @@ gotoblas_t TABLE_NAME = { 0, #endif -#if (BUILD_SINGLE==1 ) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) +#if (BUILD_SINGLE==1 ) || (BUILD_COMPLEX==1) samax_kTS, samin_kTS, smax_kTS, smin_kTS, - isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, +#endif +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) + isamax_kTS, +#endif +#if (BUILD_SINGLE==1 ) || (BUILD_COMPLEX==1) + isamin_kTS, ismax_kTS, ismin_kTS, snrm2_kTS, sasum_kTS, #endif #if BUILD_SINGLE == 1 @@ -156,8 +163,10 @@ gotoblas_t TABLE_NAME = { sswap_kTS, sgemv_nTS, sgemv_tTS, #endif -#if BUILD_SINGLE == 1 +#if BUILD_SINGLE == 1 sger_kTS, +#endif +#if BUILD_SINGLE == 1 ssymv_LTS, ssymv_UTS, #endif @@ -176,7 +185,7 @@ gotoblas_t TABLE_NAME = { sgemm_oncopyTS, sgemm_otcopyTS, #endif -#if BUILD_SINGLE == 1 +#if BUILD_SINGLE == 1 || BUILD_DOUBLE == 1 || BUILD_COMPLEX == 1 #ifdef SMALL_MATRIX_OPT sgemm_small_matrix_permitTS, sgemm_small_kernel_nnTS, sgemm_small_kernel_ntTS, sgemm_small_kernel_tnTS, sgemm_small_kernel_ttTS, @@ -184,7 +193,7 @@ gotoblas_t TABLE_NAME = { #endif #endif -#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX == 1) strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N strsm_iunucopyTS, strsm_iunncopyTS, strsm_iutucopyTS, strsm_iutncopyTS, @@ -196,7 +205,7 @@ gotoblas_t TABLE_NAME = { strsm_ounucopyTS, strsm_ounncopyTS, strsm_outucopyTS, strsm_outncopyTS, strsm_olnucopyTS, strsm_olnncopyTS, strsm_oltucopyTS, strsm_oltncopyTS, #endif -#if BUILD_SINGLE == 1 +#if (BUILD_SINGLE==1) strmm_kernel_RNTS, strmm_kernel_RTTS, strmm_kernel_LNTS, strmm_kernel_LTTS, #if SGEMM_DEFAULT_UNROLL_M != 
SGEMM_DEFAULT_UNROLL_N strmm_iunucopyTS, strmm_iunncopyTS, strmm_iutucopyTS, strmm_iutncopyTS, @@ -213,8 +222,6 @@ gotoblas_t TABLE_NAME = { ssymm_outcopyTS, ssymm_oltcopyTS, #endif ssymm_outcopyTS, ssymm_oltcopyTS, -#endif -#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) #ifndef NO_LAPACK sneg_tcopyTS, slaswp_ncopyTS, #else @@ -222,7 +229,7 @@ gotoblas_t TABLE_NAME = { #endif #endif -#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) 0, 0, 0, DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, #ifdef DGEMM_DEFAULT_UNROLL_MN @@ -233,7 +240,7 @@ gotoblas_t TABLE_NAME = { #endif -#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) damax_kTS, damin_kTS, dmax_kTS, dmin_kTS, idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS, dnrm2_kTS, dasum_kTS, @@ -241,13 +248,13 @@ gotoblas_t TABLE_NAME = { #if (BUILD_DOUBLE==1) dsum_kTS, #endif -#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) dcopy_kTS, ddot_kTS, #endif #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) dsdot_kTS, #endif -#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) drot_kTS, daxpy_kTS, dscal_kTS, @@ -259,7 +266,7 @@ gotoblas_t TABLE_NAME = { dsymv_LTS, dsymv_UTS, #endif -#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) dgemm_kernelTS, dgemm_betaTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N dgemm_incopyTS, dgemm_itcopyTS, @@ -269,12 +276,14 @@ gotoblas_t TABLE_NAME = { dgemm_oncopyTS, dgemm_otcopyTS, #endif -#if (BUILD_DOUBLE==1) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) #ifdef SMALL_MATRIX_OPT dgemm_small_matrix_permitTS, dgemm_small_kernel_nnTS, dgemm_small_kernel_ntTS, dgemm_small_kernel_tnTS, dgemm_small_kernel_ttTS, dgemm_small_kernel_b0_nnTS, dgemm_small_kernel_b0_ntTS, dgemm_small_kernel_b0_tnTS, dgemm_small_kernel_b0_ttTS, #endif +#endif +#if (BUILD_DOUBLE==1) dtrsm_kernel_LNTS, dtrsm_kernel_LTTS, 
dtrsm_kernel_RNTS, dtrsm_kernel_RTTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N dtrsm_iunucopyTS, dtrsm_iunncopyTS, dtrsm_iutucopyTS, dtrsm_iutncopyTS, @@ -364,7 +373,7 @@ gotoblas_t TABLE_NAME = { #endif -#if (BUILD_COMPLEX || BUILD_COMPLEX16) +#if (BUILD_COMPLEX) 0, 0, 0, CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N, #ifdef CGEMM_DEFAULT_UNROLL_MN @@ -372,18 +381,23 @@ gotoblas_t TABLE_NAME = { #else MAX(CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N), #endif - camax_kTS, camin_kTS, icamax_kTS, icamin_kTS, +#if (BUILD_COMPLEX) + camax_kTS, camin_kTS, +#endif +#if (BUILD_COMPLEX) + icamax_kTS, #endif #if (BUILD_COMPLEX) + icamin_kTS, cnrm2_kTS, casum_kTS, csum_kTS, #endif -#if (BUILD_COMPLEX || BUILD_COMPLEX16) - ccopy_kTS, cdotu_kTS, cdotc_kTS, +#if (BUILD_COMPLEX) + ccopy_kTS, cdotu_kTS, cdotc_kTS, #endif #if (BUILD_COMPLEX) csrot_kTS, #endif -#if (BUILD_COMPLEX || BUILD_COMPLEX16) +#if (BUILD_COMPLEX) caxpy_kTS, caxpyc_kTS, cscal_kTS, @@ -397,7 +411,7 @@ gotoblas_t TABLE_NAME = { csymv_LTS, csymv_UTS, chemv_LTS, chemv_UTS, chemv_MTS, chemv_VTS, #endif -#if (BUILD_COMPLEX || BUILD_COMPLEX16) +#if (BUILD_COMPLEX) cgemm_kernel_nTS, cgemm_kernel_lTS, cgemm_kernel_rTS, cgemm_kernel_bTS, cgemm_betaTS, #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N @@ -432,6 +446,7 @@ gotoblas_t TABLE_NAME = { ctrsm_ounucopyTS, ctrsm_ounncopyTS, ctrsm_outucopyTS, ctrsm_outncopyTS, ctrsm_olnucopyTS, ctrsm_olnncopyTS, ctrsm_oltucopyTS, ctrsm_oltncopyTS, #endif +#endif #if (BUILD_COMPLEX) ctrmm_kernel_RNTS, ctrmm_kernel_RTTS, ctrmm_kernel_RRTS, ctrmm_kernel_RCTS, @@ -522,7 +537,7 @@ gotoblas_t TABLE_NAME = { #endif #endif -#if (BUILD_COMPLEX || BUILD_COMPLEX16) +#if (BUILD_COMPLEX) #ifndef NO_LAPACK cneg_tcopyTS, @@ -866,7 +881,7 @@ gotoblas_t TABLE_NAME = { cgeadd_kTS, #endif #if BUILD_COMPLEX16==1 - zgeadd_kTS + zgeadd_kTS, #endif }; @@ -878,7 +893,7 @@ static void init_parameter(void) { #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 
SGEMM_DEFAULT_P; #endif -#if BUILD_DOUBLE == 1 +#if BUILD_DOUBLE == 1 || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif #if BUILD_COMPLEX==1 @@ -891,10 +906,10 @@ static void init_parameter(void) { #if (BUILD_BFLOAT16) TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; #endif -#if BUILD_SINGLE == 1 +#if BUILD_SINGLE == 1 || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; #endif -#if BUILD_DOUBLE== 1 +#if BUILD_DOUBLE== 1 || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; #endif #if BUILD_COMPLEX== 1 @@ -907,10 +922,10 @@ static void init_parameter(void) { #if (BUILD_BFLOAT16) TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; #endif -#if BUILD_SINGLE == 1 +#if BUILD_SINGLE == 1 || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; #endif -#if BUILD_DOUBLE==1 +#if BUILD_DOUBLE==1 || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; #endif #if BUILD_COMPLEX==1 @@ -1313,7 +1328,7 @@ static void init_parameter(void) { #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 64 * (l2 >> 7); #endif -#if BUILD_DOUBLE == 1 +#if BUILD_DOUBLE == 1 || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = 32 * (l2 >> 7); #endif #if BUILD_COMPLEX==1 @@ -1337,7 +1352,7 @@ static void init_parameter(void) { #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 96 * (l2 >> 7); #endif -#if BUILD_DOUBLE == 1 +#if BUILD_DOUBLE == 1 || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = 48 * (l2 >> 7); #endif #if BUILD_COMPLEX==1 @@ -1361,7 +1376,7 @@ static void init_parameter(void) { #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 256; #endif -#if BUILD_DOUBLE ==1 +#if BUILD_DOUBLE ==1 || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = 128; #endif #if BUILD_COMPLEX==1 @@ -1385,7 +1400,7 @@ static void init_parameter(void) { #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 56 * (l2 >> 7); #endif -#if BUILD_DOUBLE ==1 +#if BUILD_DOUBLE ==1 || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = 28 * (l2 >> 7); #endif #if BUILD_COMPLEX==1 @@ 
-1409,7 +1424,7 @@ static void init_parameter(void) { #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 92 * (l2 >> 9) + 8; #endif -#if BUILD_DOUBLE==1 +#if BUILD_DOUBLE==1 || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = 46 * (l2 >> 9) + 8; #endif #if BUILD_COMPLEX==1 @@ -1433,7 +1448,7 @@ static void init_parameter(void) { #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 42 * (l2 >> 9) + 8; #endif -#if BUILD_DOUBLE == 1 +#if BUILD_DOUBLE == 1 || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = 42 * (l2 >> 9) + 8; #endif #if BUILD_COMPLEX==1 @@ -1457,7 +1472,7 @@ static void init_parameter(void) { #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 42 * (l2 >> 9) + 8; #endif -#if BUILD_DOUBLE ==1 +#if BUILD_DOUBLE ==1 || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = 42 * (l2 >> 9) + 8; #endif #if BUILD_COMPLEX==1 @@ -1482,7 +1497,7 @@ static void init_parameter(void) { #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#if BUILD_DOUBLE +#if BUILD_DOUBLE || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif #if BUILD_COMPLEX @@ -1506,7 +1521,7 @@ static void init_parameter(void) { #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#if BUILD_DOUBLE +#if BUILD_DOUBLE || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif #if BUILD_COMPLEX @@ -1554,7 +1569,7 @@ static void init_parameter(void) { #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#if BUILD_DOUBLE +#if BUILD_DOUBLE || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif #if BUILD_COMPLEX @@ -1579,7 +1594,7 @@ static void init_parameter(void) { #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 224 + 56 * (l2 >> 7); #endif -#if BUILD_DOUBLE +#if BUILD_DOUBLE || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = 112 + 28 * (l2 >> 7); #endif #if BUILD_COMPLEX @@ -1603,7 +1618,7 @@ static void init_parameter(void) { 
#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#if BUILD_DOUBLE +#if BUILD_DOUBLE || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif #if BUILD_COMPLEX @@ -1627,7 +1642,7 @@ static void init_parameter(void) { #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#if BUILD_DOUBLE +#if BUILD_DOUBLE || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif #if BUILD_COMPLEX @@ -1651,7 +1666,7 @@ static void init_parameter(void) { #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#if BUILD_DOUBLE +#if BUILD_DOUBLE || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif #if BUILD_COMPLEX @@ -1675,7 +1690,7 @@ static void init_parameter(void) { #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#if BUILD_DOUBLE +#if BUILD_DOUBLE || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif #if BUILD_COMPLEX @@ -1700,7 +1715,7 @@ static void init_parameter(void) { #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#if BUILD_DOUBLE +#if BUILD_DOUBLE || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif #if BUILD_COMPLEX @@ -1724,7 +1739,7 @@ static void init_parameter(void) { #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#if BUILD_DOUBLE +#if BUILD_DOUBLE || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif #if BUILD_COMPLEX @@ -1748,7 +1763,7 @@ static void init_parameter(void) { #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#if BUILD_DOUBLE +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif #if BUILD_COMPLEX @@ -1773,7 +1788,7 @@ static void init_parameter(void) { #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#if (BUILD_DOUBLE==1) 
+#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif #if (BUILD_COMPLEX==1) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 81eaf96ac1..aaf686c9f7 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,3 +1,4 @@ +SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c CSCALKERNEL = cscal.c ZSCALKERNEL = zscal.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index cb6f629814..548e5dcfcf 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -44,8 +44,5 @@ DGEMM_BETA = dgemm_beta_skylakex.c CGEMMKERNEL = cgemm_kernel_8x2_skylakex.c ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c -CSCALKERNEL = ../arm/zscal.c -ZSCALKERNEL = ../arm/zscal.c - CASUMKERNEL = casum.c ZASUMKERNEL = zasum.c diff --git a/kernel/x86_64/KERNEL.ZEN b/kernel/x86_64/KERNEL.ZEN index a66394be3a..9978202a74 100644 --- a/kernel/x86_64/KERNEL.ZEN +++ b/kernel/x86_64/KERNEL.ZEN @@ -1,3 +1,4 @@ +SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c CSCALKERNEL = cscal.c ZSCALKERNEL = zscal.c diff --git a/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S b/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S index 97958a88f5..2675f71fb9 100644 --- a/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S +++ b/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S @@ -1,1897 +1,1897 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. 
Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. 
*/ -/*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define VFMADD_R vfmaddps -#define VFMADD_I vfmaddps -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define VFMADD_R vfnmaddps -#define VFMADD_I vfmaddps -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define VFMADD_R vfmaddps -#define VFMADD_I vfnmaddps -#else -#define 
VFMADD_R vfnmaddps -#define VFMADD_I vfnmaddps -#endif - - - -#define A_PR1 384 -#define B_PR1 192 - -#define KERNEL4x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_2(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I 
%xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_4(xx) \ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $16, BI ;\ - addq $32, %rax ;\ - - -#define KERNEL4x2_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define 
KERNEL2x2_2(xx) \ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_3(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_4(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x2_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - 
vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_2(xx) \ - vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_3(xx) \ - vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_4(xx) \ - vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x2_SUB(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 
;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $2, %rax ;\ - - - -/************************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_2(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_4(xx) \ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - - -#define KERNEL4x1_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, 
SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $2, BI ;\ - addq $8, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_2(xx) \ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_3(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_4(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x1_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $4, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - 
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_2(xx) \ - vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_3(xx) \ - vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_4(xx) \ - vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x1_SUB(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $2, %rax ;\ - - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd 
OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA_R - vmovss %xmm1, ALPHA_I - - salq $ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = (m >> 2) - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) 
|| (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB(xxx) - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - vshufps $0xb1, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - vshufps $0xb1, %xmm8 , %xmm8, 
%xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - vshufps $0xb1, %xmm12, %xmm12, %xmm13 - vshufps $0xb1, %xmm14, %xmm14, %xmm15 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - vaddsubps %xmm12, %xmm13,%xmm13 - vaddsubps %xmm14, %xmm15,%xmm15 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - vmovaps %xmm13, %xmm12 - vmovaps %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - vshufps $0xb1, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - vmulps %xmm12, %xmm0, %xmm12 - vmulps %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - vmulps %xmm13, %xmm1, %xmm13 - vmulps %xmm15, %xmm1, %xmm15 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - - vaddps (CO1, LDC), %xmm10, %xmm10 - vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 4 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - - - -/************************************************************************** -* Rest of M 
-***************************************************************************/ - -.L2_20: - testq $3, M - jz .L2_60 // to next 2 lines of N - - testq $2, M - jz .L2_40 - ALIGN_4 - -.L2_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_26 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI 
* 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL2x2_SUB(xxx) - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - - vaddps (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - - vmovups %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - 
-#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - 
KERNEL1x2_SUB(xxx) - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - - vmovsd (CO1, LDC), %xmm15 - vaddps %xmm15, %xmm10, %xmm10 - -#endif - - vmovsd %xmm8 , (CO1) - - vmovsd %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 
-*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = (m >> 2) - je .L1_20 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - 
KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB(xxx) - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm12, %xmm12, %xmm13 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm12, %xmm13,%xmm13 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm13, %xmm1, %xmm13 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq 
%rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L1_20: - testq $3, M - jz .L999 - - testq $2, M - jz .L1_40 - ALIGN_4 - -.L1_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_26 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - 
KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL2x1_SUB(xxx) - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L1_40: - testq $1, M - jz .L999 // to next 2 lines of N - - ALIGN_4 - -.L1_41: - -#if !defined(TRMMKERNEL) 
|| \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vbroadcastss ALPHA_R, %xmm0 - 
vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - - - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - -#endif - - vmovsd %xmm8 , (CO1) - - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddps +#define VFMADD_I vfmaddps +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddps +#define VFMADD_I vfmaddps +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddps +#define VFMADD_I vfnmaddps +#else +#define 
VFMADD_R vfnmaddps +#define VFMADD_I vfnmaddps +#endif + + + +#define A_PR1 384 +#define B_PR1 192 + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I 
%xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define 
KERNEL2x2_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + 
vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 
;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + + + +/************************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, 
SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $8, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + 
VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd 
OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA_R + vmovss %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) 
|| (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + vshufps $0xb1, %xmm8 , %xmm8, 
%xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + vshufps $0xb1, %xmm14, %xmm14, %xmm15 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm14, %xmm15,%xmm15 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + vmovaps %xmm13, %xmm12 + vmovaps %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm15, %xmm1, %xmm15 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + + vaddps (CO1, LDC), %xmm10, %xmm10 + vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 4 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + + + +/************************************************************************** +* Rest of M 
+***************************************************************************/ + +.L2_20: + testq $3, M + jz .L2_60 // to next 2 lines of N + + testq $2, M + jz .L2_40 + ALIGN_4 + +.L2_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_26 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI 
* 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL2x2_SUB(xxx) + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + 
+#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + 
KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + + vmovsd (CO1, LDC), %xmm15 + vaddps %xmm15, %xmm10, %xmm10 + +#endif + + vmovsd %xmm8 , (CO1) + + vmovsd %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 
+*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + 
KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm12, %xmm13,%xmm13 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm13, %xmm1, %xmm13 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq 
%rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L1_20: + testq $3, M + jz .L999 + + testq $2, M + jz .L1_40 + ALIGN_4 + +.L1_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_26 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + 
KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL2x1_SUB(xxx) + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L1_40: + testq $1, M + jz .L999 // to next 2 lines of N + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) 
|| \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vbroadcastss ALPHA_R, %xmm0 + 
vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + +#endif + + vmovsd %xmm8 , (CO1) + + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/cgemm_kernel_4x2_piledriver.S b/kernel/x86_64/cgemm_kernel_4x2_piledriver.S index 72deee12f2..bf7f91ee9e 100644 --- a/kernel/x86_64/cgemm_kernel_4x2_piledriver.S +++ 
b/kernel/x86_64/cgemm_kernel_4x2_piledriver.S @@ -1,1921 +1,1921 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -/********************************************************************* -* -* 2014/06/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/10/31 Saar -* -* Parameter: -* UNROLL_M 4 -* UNROLL_N 2 -* CGEMM_P 768 -* CGEMM_Q 168 -* A_PR1 512 -* B_PR1 256 -* -* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): -* -* 4608x4608 154.0 GFLOPS with 8 threads on 4 modules (ACML: 111.7 ) (BULLDOZER: 153.9 ) -* 4608x4608 148.3 GFLOPS with 4 threads on 4 modules (ACML: 96.0 ) (BULLDOZER: 143.2 ) -* 3456x3456 74.3 GFLOPS with 2 threads on 2 modules (ACML: 47.3 ) (BULLDOZER: 72.3 ) -* 3456x3456 37.3 GFLOPS with 1 threads on 1 modules (ACML: 24.2 ) (BULLDOZER: 36.5 ) -* -* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): -* -* 6912x6912 421.5 GFLOPS with 32 threads on 16 modules (ACML: 266.6 ) (BULLDOZER: 422.5 ) -* 6912x6912 407.0 GFLOPS with 16 threads on 16 modules (ACML: 271.5 ) (BULLDOZER: 404.7 ) -* 6912x6912 234.2 GFLOPS with 8 threads on 8 modules (ACML: 164.0 ) (BULLDOZER: 230.5 ) -* 4608x4608 123.1 GFLOPS with 4 threads on 4 modules (ACML: 87.9 ) (BULLDOZER: 120.9 ) -* 3456x3456 62.6 GFLOPS with 2 threads on 2 modules (ACML: 44.5 ) (BULLDOZER: 62.1 ) -* 3456x3456 31.8 GFLOPS with 1 threads on 1 modules (ACML: 22.6 ) (BULLDOZER: 31.4 ) -* -*********************************************************************/ - - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define 
OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 256*8*4 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define VFMADD_R vfmaddps -#define VFMADD_I vfmaddps -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define VFMADD_R vfnmaddps -#define VFMADD_I vfmaddps -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define VFMADD_R vfmaddps -#define VFMADD_I vfnmaddps -#else -#define VFMADD_R vfnmaddps -#define VFMADD_I vfnmaddps -#endif - - - -#define A_PR1 512 -#define B_PR1 256 - -#define KERNEL4x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), 
%xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_2(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_4(xx) \ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I 
%xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $16, BI ;\ - addq $32, %rax ;\ - - -#define KERNEL4x2_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_2(xx) \ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_3(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I 
%xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_4(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x2_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_2(xx) \ - vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ 
- VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_3(xx) \ - vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_4(xx) \ - vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x2_SUB(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $2, %rax ;\ - - - -/************************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - 
-#define KERNEL4x1_2(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_4(xx) \ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - - -#define KERNEL4x1_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $2, BI ;\ - addq $8, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ 
- -#define KERNEL2x1_2(xx) \ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_3(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_4(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x1_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $4, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_2(xx) \ - vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_3(xx) \ - vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_4(xx) \ - vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - 
vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x1_SUB(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $2, %rax ;\ - - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA_R - vmovss %xmm1, ALPHA_I - - salq $ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - 
vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = (m >> 2) - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - 
KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB(xxx) - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - vshufps $0xb1, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - vshufps $0xb1, %xmm12, %xmm12, %xmm13 - vshufps $0xb1, %xmm14, %xmm14, %xmm15 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - vaddsubps %xmm12, %xmm13,%xmm13 - vaddsubps %xmm14, %xmm15,%xmm15 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - vmovaps %xmm13, %xmm12 - vmovaps %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - vshufps $0xb1, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - vmulps %xmm12, %xmm0, %xmm12 - vmulps 
%xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - vmulps %xmm13, %xmm1, %xmm13 - vmulps %xmm15, %xmm1, %xmm15 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - - vaddps (CO1, LDC), %xmm10, %xmm10 - vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 4 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L2_20: - testq $3, M - jz .L2_60 // to next 2 lines of N - - testq $2, M - jz .L2_40 - ALIGN_4 - -.L2_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) 
&& !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_26 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL2x2_SUB(xxx) - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - 
vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - - vaddps (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - - vmovups %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO 
-#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with 
ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - - vmovsd (CO1, LDC), %xmm15 - vaddps %xmm15, %xmm10, %xmm10 - -#endif - - vmovsd %xmm8 , (CO1) - - vmovsd %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = (m >> 2) - je .L1_20 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB(xxx) - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 
bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm12, %xmm12, %xmm13 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm12, %xmm13,%xmm13 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm13, %xmm1, %xmm13 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L1_20: - testq $3, M - jz .L999 - - testq $2, M - jz .L1_40 - ALIGN_4 - -.L1_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - 
addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_26 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL2x1_SUB(xxx) - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || 
defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L1_40: - testq $1, M - jz .L999 // to next 2 lines of N - - ALIGN_4 - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK 
-#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - - - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - -#endif - - vmovsd %xmm8 , (CO1) - - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) 
&& !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +/********************************************************************* +* +* 2014/06/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/31 Saar +* +* Parameter: +* UNROLL_M 4 +* UNROLL_N 2 +* CGEMM_P 768 +* CGEMM_Q 168 +* A_PR1 512 +* B_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 4608x4608 154.0 GFLOPS with 8 threads on 4 modules (ACML: 111.7 ) (BULLDOZER: 153.9 ) +* 4608x4608 148.3 GFLOPS with 4 threads on 4 modules (ACML: 96.0 ) (BULLDOZER: 143.2 ) +* 3456x3456 74.3 GFLOPS with 2 threads on 2 modules (ACML: 47.3 ) (BULLDOZER: 72.3 ) +* 3456x3456 37.3 GFLOPS with 1 threads on 1 modules (ACML: 24.2 ) (BULLDOZER: 36.5 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 6912x6912 421.5 GFLOPS with 32 threads on 16 modules (ACML: 266.6 ) (BULLDOZER: 422.5 ) +* 6912x6912 407.0 GFLOPS with 16 threads on 16 modules (ACML: 271.5 ) (BULLDOZER: 404.7 ) +* 6912x6912 234.2 GFLOPS with 8 threads on 8 modules (ACML: 164.0 ) (BULLDOZER: 230.5 ) +* 4608x4608 123.1 GFLOPS with 4 threads on 4 modules (ACML: 87.9 ) (BULLDOZER: 120.9 ) +* 3456x3456 62.6 GFLOPS with 2 threads on 2 
modules (ACML: 44.5 ) (BULLDOZER: 62.1 ) +* 3456x3456 31.8 GFLOPS with 1 threads on 1 modules (ACML: 22.6 ) (BULLDOZER: 31.4 ) +* +*********************************************************************/ + + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 256*8*4 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddps +#define VFMADD_I vfmaddps +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddps +#define VFMADD_I vfmaddps +#elif defined(NR) || defined(NC) || 
defined(TR) || defined(TC) +#define VFMADD_R vfmaddps +#define VFMADD_I vfnmaddps +#else +#define VFMADD_R vfnmaddps +#define VFMADD_I vfnmaddps +#endif + + + +#define A_PR1 512 +#define B_PR1 256 + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ 
+ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + 
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + 
+/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), 
%xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + + + +/************************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I 
%xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $8, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + 
+/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 
176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA_R + vmovss %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number 
of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) 
|| defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + vshufps $0xb1, %xmm14, %xmm14, %xmm15 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm14, %xmm15,%xmm15 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + vmovaps %xmm13, %xmm12 + vmovaps %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm15, %xmm1, %xmm15 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + + vaddps (CO1, LDC), %xmm10, %xmm10 + vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 4 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, 
CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L2_20: + testq $3, M + jz .L2_60 // to next 2 lines of N + + testq $2, M + jz .L2_40 + ALIGN_4 + +.L2_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_26 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + 
movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL2x2_SUB(xxx) + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + 
+/**************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + 
salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + + vmovsd (CO1, LDC), %xmm15 + vaddps %xmm15, %xmm10, %xmm10 + +#endif + + vmovsd %xmm8 , (CO1) + + vmovsd %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + 
+/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + 
prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm12, %xmm13,%xmm13 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm13, %xmm1, %xmm13 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && 
defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L1_20: + testq $3, M + jz .L999 + + testq $2, M + jz .L1_40 + ALIGN_4 + +.L1_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_26 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + 
KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL2x1_SUB(xxx) + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + 
+/**************************************************************************/ +.L1_40: + testq $1, M + jz .L999 // to next 2 lines of N + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, 
%rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + +#endif + + vmovsd %xmm8 , (CO1) + + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git 
a/kernel/x86_64/cgemm_kernel_8x2_sandy.S b/kernel/x86_64/cgemm_kernel_8x2_sandy.S index c85646d439..988913591c 100644 --- a/kernel/x86_64/cgemm_kernel_8x2_sandy.S +++ b/kernel/x86_64/cgemm_kernel_8x2_sandy.S @@ -1,2353 +1,2353 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************************/ - -/********************************************************************* -* 2014/07/29 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* 2013/10/28 Saar -* Parameter: -* CGEMM_DEFAULT_UNROLL_N 2 -* CGEMM_DEFAULT_UNROLL_M 8 -* CGEMM_DEFAULT_P 768 -* CGEMM_DEFAULT_Q 512 -* A_PR1 512 -* B_PR1 512 -* -* 2014/07/29 Saar -* Performance at 6192x6192x6192: -* 1 thread: 49 GFLOPS (MKL: 52) -* 2 threads: 99 GFLOPS (MKL: 102) -* 3 threads: 148 GFLOPS (MKL: 150) -* 4 threads: 195 GFLOPS (MKL: 194) -* 8 threads: 354 GFLOPS (MKL: 317) -* -* -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $ 0, 4096 * 4(%rsp);\ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define 
STACK_TOUCH \ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $ 0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - -#define VFMADDPS_YR( y0,y1,y2 ) \ - vmulps y1,y2,%ymm2;\ - vaddps y0,%ymm2,y0 - -#define VFMADDPS_YI( y0,y1,y2 ) \ - vmulps y1,y2,%ymm3;\ - vaddps y0,%ymm3,y0 - -#define VFMADDPS_R( y0,y1,y2 ) \ - vmulps y1,y2,%xmm2;\ - vaddps y0,%xmm2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) \ - vmulps y1,y2,%xmm3;\ - vaddps y0,%xmm3,y0 - - -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) - -#define VFMADDPS_YR( y0,y1,y2 ) \ - vmulps y1,y2,%ymm2;\ - vsubps %ymm2,y0,y0 - -#define VFMADDPS_YI( y0,y1,y2 ) \ - vmulps y1,y2,%ymm3;\ - vaddps y0,%ymm3,y0 - -#define VFMADDPS_R( y0,y1,y2 ) \ - vmulps y1,y2,%xmm2;\ - vsubps %xmm2,y0,y0 - -#define VFMADDPS_I( y0,y1,y2 ) \ - vmulps y1,y2,%xmm3;\ - vaddps y0,%xmm3,y0 - - -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) - -#define VFMADDPS_YR( y0,y1,y2 ) \ - vmulps y1,y2,%ymm2;\ - vaddps y0,%ymm2,y0 - -#define VFMADDPS_YI( y0,y1,y2 ) \ - vmulps y1,y2,%ymm3;\ - vsubps %ymm3,y0,y0 - -#define VFMADDPS_R( y0,y1,y2 ) \ - vmulps y1,y2,%xmm2;\ - vaddps y0,%xmm2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) \ - vmulps y1,y2,%xmm3;\ - vsubps %xmm3,y0,y0 - - -#else - -#define VFMADDPS_YR( y0,y1,y2 ) \ - vmulps y1,y2,%ymm2;\ - vsubps %ymm2,y0,y0 - -#define VFMADDPS_YI( y0,y1,y2 ) \ - vmulps y1,y2,%ymm3;\ - vsubps %ymm3,y0,y0 - -#define VFMADDPS_R( y0,y1,y2 ) \ - vmulps y1,y2,%xmm2;\ - vsubps %xmm2,y0,y0 - -#define VFMADDPS_I( y0,y1,y2 ) \ - vmulps y1,y2,%xmm3;\ - vsubps %xmm3,y0,y0 - - -#endif - - -#define A_PR1 512 -#define B_PR1 512 - -/***************************************************************************************************************************/ - -.macro KERNEL8x2_1 - - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -8 * 
SIZE(BO, BI, SIZE), %ymm4 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 - prefetcht0 A_PR1(AO, %rax, SIZE) - - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - - - VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) - vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) - - vmovups 8 * SIZE(AO, %rax, SIZE), %ymm1 - prefetcht0 A_PR1+64(AO, %rax, SIZE) - - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - - - VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) - vbroadcastss 0 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) - vbroadcastss 1 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) - vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) - - vmovups 24 * SIZE(AO, %rax, SIZE), %ymm1 - prefetcht0 A_PR1+128(AO, %rax, SIZE) - - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - vbroadcastss 2 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - vbroadcastss 3 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - - - VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) - vbroadcastss 4 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) - vbroadcastss 5 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) - vmovups 32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) - - vmovups 40 * SIZE(AO, %rax, SIZE), %ymm1 - prefetcht0 A_PR1+192(AO, %rax, SIZE) - - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - 
vbroadcastss 6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - vbroadcastss 7 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - - VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) - VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) - addq $ 16, BI - VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) - VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) - - addq $ 64, %rax -.endm - - -.macro KERNEL8x2_SUB - - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 - vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 - - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - - - VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) - VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) - VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) - VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) - - addq $ 4 , BI - addq $ 16, %rax -.endm - -.macro SAVE8x2 - - vbroadcastss ALPHA_R, %ymm0 - vbroadcastss ALPHA_I, %ymm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm11,%ymm10, %ymm10 - vaddsubps %ymm13,%ymm12, %ymm12 - vaddsubps %ymm15,%ymm14, %ymm14 - - vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 - vshufps $ 0xb1, %ymm10, %ymm10, %ymm11 - vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 - vshufps $ 0xb1, %ymm14, %ymm14, %ymm15 - -#else - vaddsubps %ymm8, %ymm9 ,%ymm9 - vaddsubps %ymm10, %ymm11,%ymm11 - vaddsubps %ymm12, %ymm13,%ymm13 - vaddsubps %ymm14, %ymm15,%ymm15 - - vmovaps %ymm9, %ymm8 - vmovaps %ymm11, %ymm10 - vmovaps %ymm13, %ymm12 - vmovaps %ymm15, %ymm14 - - // swap high and low 64 bytes 
- vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 - -#endif - - // multiply with ALPHA_R - vmulps %ymm8 , %ymm0, %ymm8 - vmulps %ymm10, %ymm0, %ymm10 - vmulps %ymm12, %ymm0, %ymm12 - vmulps %ymm14, %ymm0, %ymm14 - - // multiply with ALPHA_I - vmulps %ymm9 , %ymm1, %ymm9 - vmulps %ymm11, %ymm1, %ymm11 - vmulps %ymm13, %ymm1, %ymm13 - vmulps %ymm15, %ymm1, %ymm15 - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm11,%ymm10, %ymm10 - vaddsubps %ymm13,%ymm12, %ymm12 - vaddsubps %ymm15,%ymm14, %ymm14 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %ymm8 , %ymm8 - vaddps 8 * SIZE(CO1), %ymm12, %ymm12 - - vaddps (CO1, LDC), %ymm10, %ymm10 - vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 8 * SIZE(CO1) - - vmovups %ymm10 , (CO1, LDC) - vmovups %ymm14 , 8 * SIZE(CO1, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - -.endm - -/***************************************************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) - VFMADDPS_R( %xmm14,%xmm6,%xmm1 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) - VFMADDPS_I( %xmm15,%xmm7,%xmm1 ) - addq $ 4, BI - addq $ 8, %rax -.endm - -.macro SAVE4x2 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - vshufps $ 0xb1, %xmm15, %xmm15, 
%xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 - vshufps $ 0xb1, %xmm14, %xmm14, %xmm15 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - vaddsubps %xmm12, %xmm13,%xmm13 - vaddsubps %xmm14, %xmm15,%xmm15 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - vmovaps %xmm13, %xmm12 - vmovaps %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - vmulps %xmm12, %xmm0, %xmm12 - vmulps %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - vmulps %xmm13, %xmm1, %xmm13 - vmulps %xmm15, %xmm1, %xmm15 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - - vaddps (CO1, LDC), %xmm10, %xmm10 - vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 4 * SIZE(CO1, LDC) - -.endm - -/************************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - vbroadcastss -6 * 
SIZE(BO, BI, SIZE), %xmm6 - VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) - addq $ 4, BI - addq $ 4, %rax -.endm - -.macro SAVE2x2 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - - vaddps (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - - vmovups %xmm10 , (CO1, LDC) - -.endm - -/************************************************************************************************/ - -.macro KERNEL1x2_SUB - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) - addq $ 4, BI - addq $ 2, %rax -.endm - -.macro SAVE1x2 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, 
%xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - - vmovsd (CO1, LDC), %xmm15 - vaddps %xmm15, %xmm10, %xmm10 - -#endif - - vmovsd %xmm8 , (CO1) - vmovsd %xmm10 , (CO1, LDC) - -.endm - -/************************************************************************************************/ - -.macro KERNEL8x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - addq $ 2 , BI - addq $ 16, %rax -.endm - -.macro SAVE8x1 - - vbroadcastss ALPHA_R, %ymm0 - vbroadcastss ALPHA_I, %ymm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm13,%ymm12, %ymm12 - - vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 - vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 - 
-#else - vaddsubps %ymm8, %ymm9 ,%ymm9 - vaddsubps %ymm12, %ymm13,%ymm13 - - vmovaps %ymm9, %ymm8 - vmovaps %ymm13, %ymm12 - - // swap high and low 64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - -#endif - - // multiply with ALPHA_R - vmulps %ymm8 , %ymm0, %ymm8 - vmulps %ymm12, %ymm0, %ymm12 - - // multiply with ALPHA_I - vmulps %ymm9 , %ymm1, %ymm9 - vmulps %ymm13, %ymm1, %ymm13 - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm13,%ymm12, %ymm12 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %ymm8 , %ymm8 - vaddps 8 * SIZE(CO1), %ymm12, %ymm12 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 8 * SIZE(CO1) - -.endm - - -/************************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) - addq $ 2, BI - addq $ 8, %rax -.endm - -.macro SAVE4x1 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm12, %xmm13,%xmm13 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm13, %xmm12 - - // swap high and low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - 
vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm13, %xmm1, %xmm13 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - -.endm - -/************************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - addq $ 2, BI - addq $ 4, %rax -.endm - -.macro SAVE2x1 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -.endm - -/************************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - addq $ 2, BI - addq $ 2, %rax -.endm - -.macro SAVE1x1 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || 
defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - -#endif - - vmovsd %xmm8 , (CO1) - -.endm - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $ STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $ 128 + L_BUFFER_SIZE, %rsp - andq $ -4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA_R - vmovss %xmm1, ALPHA_I - - salq $ ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $ 2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 
// N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $ 0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $ 3, I // i = (m >> 3) - je .L2_4_10 - - ALIGN_4 -/**********************************************************************************************************/ - -.L2_8_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 8, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_8_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO 
- negq BI - negq %rax - ALIGN_4 - -.L2_8_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_1 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_1 - - je .L2_8_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_1 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_1 - - je .L2_8_16 - - jmp .L2_8_12 - ALIGN_4 - -.L2_8_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_8_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_8_17: - - KERNEL8x2_SUB - - jl .L2_8_17 - ALIGN_4 - - -.L2_8_19: - - SAVE8x2 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 8, KK -#endif - - addq $ 16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_8_11 - ALIGN_4 - - -/**********************************************************************************************************/ - - - - -.L2_4_10: - testq $ 7, M - jz .L2_4_60 // to next 2 lines of N - - testq $ 4, M - jz .L2_4_20 - ALIGN_4 - - -.L2_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif 
- - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_4_16 - - jmp .L2_4_12 - ALIGN_4 - -.L2_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_17: - - KERNEL4x2_SUB - - jl .L2_4_17 - ALIGN_4 - - -.L2_4_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - 
leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L2_4_20: - - testq $ 2, M - jz .L2_4_40 - ALIGN_4 - -.L2_4_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_26 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_22: - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_4_26 - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_4_26 - - jmp .L2_4_22 - ALIGN_4 - 
-.L2_4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_27: - - KERNEL2x2_SUB - - jl .L2_4_27 - ALIGN_4 - - -.L2_4_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - - vaddps (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - - vmovups %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * 
SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L2_4_21 - ALIGN_4 - - - -/**************************************************************************/ -.L2_4_40: - testq $ 1, M - jz .L2_4_60 // to next 2 lines of N - - ALIGN_4 - -.L2_4_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_4_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_4_46 - - jmp .L2_4_42 - ALIGN_4 - -.L2_4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 
1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_47: - - KERNEL1x2_SUB - - jl .L2_4_47 - ALIGN_4 - - -.L2_4_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L2_4_41 - ALIGN_4 - - - - -.L2_4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $ 2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $ 1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $ 3, I // i = (m >> 3) - je .L1_4_10 - - ALIGN_4 - -/**************************************************************************************************/ - -.L1_8_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 8, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_8_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_8_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - je .L1_8_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - je .L1_8_16 - - jmp .L1_8_12 - ALIGN_4 - -.L1_8_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif 
- - andq $ 7, %rax # if (k & 1) - je .L1_8_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_8_17: - - KERNEL8x1_SUB - - jl .L1_8_17 - ALIGN_4 - - -.L1_8_19: - - SAVE8x1 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 8, KK -#endif - - addq $ 16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_8_11 - ALIGN_4 - - - -/**************************************************************************************************/ -.L1_4_10: - - testq $ 7, M - jz .L999 - - testq $ 4, M - jz .L1_4_20 - - -.L1_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_16 - movq %rax, BI // 
Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - jmp .L1_4_12 - ALIGN_4 - -.L1_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_17: - - KERNEL4x1_SUB - - jl .L1_4_17 - ALIGN_4 - - -.L1_4_19: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L1_4_20: - - testq $ 2, M - 
jz .L1_4_40 - ALIGN_4 - -.L1_4_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_26 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_22: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_4_26 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_4_26 - - jmp .L1_4_22 - ALIGN_4 - -.L1_4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - 
leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_27: - - KERNEL2x1_SUB - - jl .L1_4_27 - ALIGN_4 - - -.L1_4_29: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L1_4_40: - testq $ 1, M - jz .L999 // to next 2 lines of N - - ALIGN_4 - -.L1_4_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 
B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_4_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_4_46 - - jmp .L1_4_42 - ALIGN_4 - -.L1_4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_47: - - KERNEL1x1_SUB - - jl .L1_4_47 - ALIGN_4 - - -.L1_4_49: - - SAVE1x1 - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $ STACKSIZE, %rsp - ret - - EPILOGUE +/********************************************************************************* +Copyright (c) 2013, 
The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +/********************************************************************* +* 2014/07/29 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* CGEMM_DEFAULT_UNROLL_N 2 +* CGEMM_DEFAULT_UNROLL_M 8 +* CGEMM_DEFAULT_P 768 +* CGEMM_DEFAULT_Q 512 +* A_PR1 512 +* B_PR1 512 +* +* 2014/07/29 Saar +* Performance at 6192x6192x6192: +* 1 thread: 49 GFLOPS (MKL: 52) +* 2 threads: 99 GFLOPS (MKL: 102) +* 3 threads: 148 GFLOPS (MKL: 150) +* 4 threads: 195 GFLOPS (MKL: 194) +* 8 threads: 354 GFLOPS (MKL: 317) +* +* +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define 
STACK_TOUCH \ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $ 0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#define VFMADDPS_YR( y0,y1,y2 ) \ + vmulps y1,y2,%ymm2;\ + vaddps y0,%ymm2,y0 + +#define VFMADDPS_YI( y0,y1,y2 ) \ + vmulps y1,y2,%ymm3;\ + vaddps y0,%ymm3,y0 + +#define VFMADDPS_R( y0,y1,y2 ) \ + vmulps y1,y2,%xmm2;\ + vaddps y0,%xmm2,y0 + +#define VFMADDPS_I( y0,y1,y2 ) \ + vmulps y1,y2,%xmm3;\ + vaddps y0,%xmm3,y0 + + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +#define VFMADDPS_YR( y0,y1,y2 ) \ + vmulps y1,y2,%ymm2;\ + vsubps %ymm2,y0,y0 + +#define VFMADDPS_YI( y0,y1,y2 ) \ + vmulps y1,y2,%ymm3;\ + vaddps y0,%ymm3,y0 + +#define VFMADDPS_R( y0,y1,y2 ) \ + vmulps y1,y2,%xmm2;\ + vsubps %xmm2,y0,y0 + +#define VFMADDPS_I( y0,y1,y2 ) \ + vmulps y1,y2,%xmm3;\ + vaddps y0,%xmm3,y0 + + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +#define VFMADDPS_YR( y0,y1,y2 ) \ + vmulps y1,y2,%ymm2;\ + vaddps y0,%ymm2,y0 + +#define VFMADDPS_YI( y0,y1,y2 ) \ + vmulps y1,y2,%ymm3;\ + vsubps %ymm3,y0,y0 + +#define VFMADDPS_R( y0,y1,y2 ) \ + vmulps y1,y2,%xmm2;\ + vaddps y0,%xmm2,y0 + +#define VFMADDPS_I( y0,y1,y2 ) \ + vmulps y1,y2,%xmm3;\ + vsubps %xmm3,y0,y0 + + +#else + +#define VFMADDPS_YR( y0,y1,y2 ) \ + vmulps y1,y2,%ymm2;\ + vsubps %ymm2,y0,y0 + +#define VFMADDPS_YI( y0,y1,y2 ) \ + vmulps y1,y2,%ymm3;\ + vsubps %ymm3,y0,y0 + +#define VFMADDPS_R( y0,y1,y2 ) \ + vmulps y1,y2,%xmm2;\ + vsubps %xmm2,y0,y0 + +#define VFMADDPS_I( y0,y1,y2 ) \ + vmulps y1,y2,%xmm3;\ + vsubps %xmm3,y0,y0 + + +#endif + + +#define A_PR1 512 +#define B_PR1 512 + +/***************************************************************************************************************************/ + +.macro KERNEL8x2_1 + + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -8 * 
SIZE(BO, BI, SIZE), %ymm4 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 + prefetcht0 A_PR1(AO, %rax, SIZE) + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm1 + prefetcht0 A_PR1+64(AO, %rax, SIZE) + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + vbroadcastss 0 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + vbroadcastss 1 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm1 + prefetcht0 A_PR1+128(AO, %rax, SIZE) + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss 2 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss 3 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + vbroadcastss 4 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + vbroadcastss 5 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + vmovups 32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + vmovups 40 * SIZE(AO, %rax, SIZE), %ymm1 + prefetcht0 A_PR1+192(AO, %rax, SIZE) + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + 
vbroadcastss 6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss 7 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + addq $ 16, BI + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + addq $ 64, %rax +.endm + + +.macro KERNEL8x2_SUB + + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 + vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + addq $ 4 , BI + addq $ 16, %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA_R, %ymm0 + vbroadcastss ALPHA_I, %ymm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm11,%ymm10, %ymm10 + vaddsubps %ymm13,%ymm12, %ymm12 + vaddsubps %ymm15,%ymm14, %ymm14 + + vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 + vshufps $ 0xb1, %ymm10, %ymm10, %ymm11 + vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 + vshufps $ 0xb1, %ymm14, %ymm14, %ymm15 + +#else + vaddsubps %ymm8, %ymm9 ,%ymm9 + vaddsubps %ymm10, %ymm11,%ymm11 + vaddsubps %ymm12, %ymm13,%ymm13 + vaddsubps %ymm14, %ymm15,%ymm15 + + vmovaps %ymm9, %ymm8 + vmovaps %ymm11, %ymm10 + vmovaps %ymm13, %ymm12 + vmovaps %ymm15, %ymm14 + + // swap high and low 64 bytes 
+ vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 + +#endif + + // multiply with ALPHA_R + vmulps %ymm8 , %ymm0, %ymm8 + vmulps %ymm10, %ymm0, %ymm10 + vmulps %ymm12, %ymm0, %ymm12 + vmulps %ymm14, %ymm0, %ymm14 + + // multiply with ALPHA_I + vmulps %ymm9 , %ymm1, %ymm9 + vmulps %ymm11, %ymm1, %ymm11 + vmulps %ymm13, %ymm1, %ymm13 + vmulps %ymm15, %ymm1, %ymm15 + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm11,%ymm10, %ymm10 + vaddsubps %ymm13,%ymm12, %ymm12 + vaddsubps %ymm15,%ymm14, %ymm14 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %ymm8 , %ymm8 + vaddps 8 * SIZE(CO1), %ymm12, %ymm12 + + vaddps (CO1, LDC), %ymm10, %ymm10 + vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 8 * SIZE(CO1) + + vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 8 * SIZE(CO1, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + +/***************************************************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPS_R( %xmm14,%xmm6,%xmm1 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + VFMADDPS_I( %xmm15,%xmm7,%xmm1 ) + addq $ 4, BI + addq $ 8, %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm15, %xmm15, 
%xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 + vshufps $ 0xb1, %xmm14, %xmm14, %xmm15 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm14, %xmm15,%xmm15 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + vmovaps %xmm13, %xmm12 + vmovaps %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm15, %xmm1, %xmm15 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + + vaddps (CO1, LDC), %xmm10, %xmm10 + vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 4 * SIZE(CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + vbroadcastss -6 * 
SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + addq $ 4, BI + addq $ 4, %rax +.endm + +.macro SAVE2x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + addq $ 4, BI + addq $ 2, %rax +.endm + +.macro SAVE1x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, 
%xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + + vmovsd (CO1, LDC), %xmm15 + vaddps %xmm15, %xmm10, %xmm10 + +#endif + + vmovsd %xmm8 , (CO1) + vmovsd %xmm10 , (CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + addq $ 2 , BI + addq $ 16, %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA_R, %ymm0 + vbroadcastss ALPHA_I, %ymm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm13,%ymm12, %ymm12 + + vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 + vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 + 
+#else + vaddsubps %ymm8, %ymm9 ,%ymm9 + vaddsubps %ymm12, %ymm13,%ymm13 + + vmovaps %ymm9, %ymm8 + vmovaps %ymm13, %ymm12 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + +#endif + + // multiply with ALPHA_R + vmulps %ymm8 , %ymm0, %ymm8 + vmulps %ymm12, %ymm0, %ymm12 + + // multiply with ALPHA_I + vmulps %ymm9 , %ymm1, %ymm9 + vmulps %ymm13, %ymm1, %ymm13 + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm13,%ymm12, %ymm12 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %ymm8 , %ymm8 + vaddps 8 * SIZE(CO1), %ymm12, %ymm12 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 8 * SIZE(CO1) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) + addq $ 2, BI + addq $ 8, %rax +.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm12, %xmm13,%xmm13 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm13, %xmm12 + + // swap high and low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + 
vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm13, %xmm1, %xmm13 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + +.endm + +/************************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + addq $ 2, BI + addq $ 4, %rax +.endm + +.macro SAVE2x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +.endm + +/************************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + addq $ 2, BI + addq $ 2, %rax +.endm + +.macro SAVE1x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || 
defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + +#endif + + vmovsd %xmm8 , (CO1) + +.endm + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $ STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA_R + vmovss %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 
// N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $ 3, I // i = (m >> 3) + je .L2_4_10 + + ALIGN_4 +/**********************************************************************************************************/ + +.L2_8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 8, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_8_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO 
+ negq BI + negq %rax + ALIGN_4 + +.L2_8_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_1 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_1 + + je .L2_8_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_1 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_1 + + je .L2_8_16 + + jmp .L2_8_12 + ALIGN_4 + +.L2_8_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_8_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_8_17: + + KERNEL8x2_SUB + + jl .L2_8_17 + ALIGN_4 + + +.L2_8_19: + + SAVE8x2 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 8, KK +#endif + + addq $ 16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_8_11 + ALIGN_4 + + +/**********************************************************************************************************/ + + + + +.L2_4_10: + testq $ 7, M + jz .L2_4_60 // to next 2 lines of N + + testq $ 4, M + jz .L2_4_20 + ALIGN_4 + + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif 
+ + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + 
leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L2_4_20: + + testq $ 2, M + jz .L2_4_40 + ALIGN_4 + +.L2_4_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_26 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_22: + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_4_26 + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_4_26 + + jmp .L2_4_22 + ALIGN_4 + 
+.L2_4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_27: + + KERNEL2x2_SUB + + jl .L2_4_27 + ALIGN_4 + + +.L2_4_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * 
SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_4_21 + ALIGN_4 + + + +/**************************************************************************/ +.L2_4_40: + testq $ 1, M + jz .L2_4_60 // to next 2 lines of N + + ALIGN_4 + +.L2_4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_4_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_4_46 + + jmp .L2_4_42 + ALIGN_4 + +.L2_4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 
1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_47: + + KERNEL1x2_SUB + + jl .L2_4_47 + ALIGN_4 + + +.L2_4_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_4_41 + ALIGN_4 + + + + +.L2_4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $ 2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $ 1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $ 3, I // i = (m >> 3) + je .L1_4_10 + + ALIGN_4 + +/**************************************************************************************************/ + +.L1_8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 8, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_8_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_8_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + je .L1_8_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + je .L1_8_16 + + jmp .L1_8_12 + ALIGN_4 + +.L1_8_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif 
+ + andq $ 7, %rax # if (k & 1) + je .L1_8_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_8_17: + + KERNEL8x1_SUB + + jl .L1_8_17 + ALIGN_4 + + +.L1_8_19: + + SAVE8x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 8, KK +#endif + + addq $ 16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_8_11 + ALIGN_4 + + + +/**************************************************************************************************/ +.L1_4_10: + + testq $ 7, M + jz .L999 + + testq $ 4, M + jz .L1_4_20 + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // 
Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L1_4_20: + + testq $ 2, M + 
jz .L1_4_40 + ALIGN_4 + +.L1_4_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_26 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_22: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_4_26 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_4_26 + + jmp .L1_4_22 + ALIGN_4 + +.L1_4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + 
leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_27: + + KERNEL2x1_SUB + + jl .L1_4_27 + ALIGN_4 + + +.L1_4_29: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L1_4_40: + testq $ 1, M + jz .L999 // to next 2 lines of N + + ALIGN_4 + +.L1_4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 
B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_4_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_4_46 + + jmp .L1_4_42 + ALIGN_4 + +.L1_4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_47: + + KERNEL1x1_SUB + + jl .L1_4_47 + ALIGN_4 + + +.L1_4_49: + + SAVE1x1 + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $ STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index dc3f688c69..95a99b8b97 100644 --- 
a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -25,10 +25,25 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +/* + * Avoid contraction of floating point operations, specifically fused + * multiply-add, because they can cause unexpected results in complex + * multiplication. + */ +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC optimize ("fp-contract=off") +#endif + +#if defined(__clang__) +#pragma clang fp contract(off) +#endif + #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#if defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#include "cscal_microk_skylakex-2.c" +#elif defined(HASWELL) || defined(ZEN) #include "cscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "cscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/cscal_microk_skylakex-2.c b/kernel/x86_64/cscal_microk_skylakex-2.c new file mode 100644 index 0000000000..8a622427bb --- /dev/null +++ b/kernel/x86_64/cscal_microk_skylakex-2.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#include + +#define HAVE_KERNEL_16 1 + +static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + /* _mm512_addsub_ps does not exist so we flip signs for odd elements of da_i */ + __m512 da_r = _mm512_set1_ps(alpha[0]); + __m512 da_i = _mm512_set1_ps(alpha[1]) * _mm512_set4_ps(1, -1, 1, -1); + for (; i < n2; i += 32) { + __m512 x0 = _mm512_loadu_ps(&x[i + 0]); + __m512 x1 = _mm512_loadu_ps(&x[i + 16]); + __m512 y0 = _mm512_permute_ps(x0, 0xb1); + __m512 y1 = _mm512_permute_ps(x1, 0xb1); + _mm512_storeu_ps(&x[i + 0], _mm512_add_ps(da_r * x0, da_i * y0)); + _mm512_storeu_ps(&x[i + 16], _mm512_add_ps(da_r * x1, da_i * y1)); + } +#else + __m256 da_r = _mm256_set1_ps(alpha[0]); + __m256 da_i = 
_mm256_set1_ps(alpha[1]); + for (; i < n2; i += 32) { + __m256 x0 = _mm256_loadu_ps(&x[i + 0]); + __m256 x1 = _mm256_loadu_ps(&x[i + 8]); + __m256 x2 = _mm256_loadu_ps(&x[i + 16]); + __m256 x3 = _mm256_loadu_ps(&x[i + 24]); + __m256 y0 = _mm256_permute_ps(x0, 0xb1); + __m256 y1 = _mm256_permute_ps(x1, 0xb1); + __m256 y2 = _mm256_permute_ps(x2, 0xb1); + __m256 y3 = _mm256_permute_ps(x3, 0xb1); + _mm256_storeu_ps(&x[i + 0], _mm256_addsub_ps(da_r * x0, da_i * y0)); + _mm256_storeu_ps(&x[i + 8], _mm256_addsub_ps(da_r * x1, da_i * y1)); + _mm256_storeu_ps(&x[i + 16], _mm256_addsub_ps(da_r * x2, da_i * y2)); + _mm256_storeu_ps(&x[i + 24], _mm256_addsub_ps(da_r * x3, da_i * y3)); + } +#endif +} + + +static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + __m512 da_i = _mm512_set1_ps(alpha[1]) * _mm512_set4_ps(1, -1, 1, -1); + for (; i < n2; i += 32) { + __m512 y0 = _mm512_permute_ps(_mm512_loadu_ps(&x[i + 0]), 0xb1); + __m512 y1 = _mm512_permute_ps(_mm512_loadu_ps(&x[i + 16]), 0xb1); + _mm512_storeu_ps(&x[i + 0], da_i * y0); + _mm512_storeu_ps(&x[i + 16], da_i * y1); + } +#else + __m256 da_i = _mm256_set1_ps(alpha[1]) * _mm256_set_ps(1, -1, 1, -1, 1, -1, 1, -1); + for (; i < n2; i += 32) { + __m256 y0 = _mm256_permute_ps(_mm256_loadu_ps(&x[i + 0]), 0xb1); + __m256 y1 = _mm256_permute_ps(_mm256_loadu_ps(&x[i + 8]), 0xb1); + __m256 y2 = _mm256_permute_ps(_mm256_loadu_ps(&x[i + 16]), 0xb1); + __m256 y3 = _mm256_permute_ps(_mm256_loadu_ps(&x[i + 24]), 0xb1); + _mm256_storeu_ps(&x[i + 0], da_i * y0); + _mm256_storeu_ps(&x[i + 8], da_i * y1); + _mm256_storeu_ps(&x[i + 16], da_i * y2); + _mm256_storeu_ps(&x[i + 24], da_i * y3); + } +#endif +} + + +static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + __m512 da_r = _mm512_set1_ps(alpha[0]); + for (; i < n2; i += 32) { + _mm512_storeu_ps(&x[i + 0], da_r * 
_mm512_loadu_ps(&x[i + 0])); + _mm512_storeu_ps(&x[i + 16], da_r * _mm512_loadu_ps(&x[i + 16])); + } +#else + __m256 da_r = _mm256_set1_ps(alpha[0]); + for (; i < n2; i += 32) { + _mm256_storeu_ps(&x[i + 0], da_r * _mm256_loadu_ps(&x[i + 0])); + _mm256_storeu_ps(&x[i + 8], da_r * _mm256_loadu_ps(&x[i + 8])); + _mm256_storeu_ps(&x[i + 16], da_r * _mm256_loadu_ps(&x[i + 16])); + _mm256_storeu_ps(&x[i + 24], da_r * _mm256_loadu_ps(&x[i + 24])); + } +#endif +} + + +static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + + /* question to self: Why is this not just memset() */ + +#ifdef __AVX512CD__ + __m512 zero = _mm512_setzero_ps(); + for (; i < n2; i += 32) { + _mm512_storeu_ps(&x[i], zero); + _mm512_storeu_ps(&x[i + 16], zero); + } +#else + __m256 zero = _mm256_setzero_ps(); + for (; i < n2; i += 32) { + _mm256_storeu_ps(&x[i + 0], zero); + _mm256_storeu_ps(&x[i + 8], zero); + _mm256_storeu_ps(&x[i + 16], zero); + _mm256_storeu_ps(&x[i + 24], zero); + } +#endif + +} + +#else +#include "cscal_microk_haswell-2.c" +#endif diff --git a/kernel/x86_64/dgemm_kernel_16x2_haswell.S b/kernel/x86_64/dgemm_kernel_16x2_haswell.S index 98b582c0d6..899c5f2419 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_haswell.S +++ b/kernel/x86_64/dgemm_kernel_16x2_haswell.S @@ -1,5215 +1,5215 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. 
Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. 
*/ -/*********************************************************************/ - -/********************************************************************* -* 2013/10/20 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK - -* -* -* 2013/10/20 Saar -* Parameter: -* DGEMM_DEFAULT_UNROLL_N 2 -* DGEMM_DEFAULT_UNROLL_M 16 -* DGEMM_DEFAULT_P 192 -* DGEMM_DEFAULT_Q 128 -* A_PR1 512 -* -* -* Performance without prefetch of B: -* 1 thread: 45.8 GFLOPS (MKL: 45) -* 2 threads: 80.0 GFLOPS (MKL: 91) -* 4 threads: 135.0 GFLOPS (MKL: 135) -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 512*8*4 -#define LB2_OFFSET 512*8*2 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define 
STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#if defined(BULLDOZER) - -.macro VFMADD231PD_ y0,y1,y2 - vfmaddpd \y0,\y1,\y2,\y0 -.endm - -.macro VFMADD231SD_ x0,x1,x2 - vfmaddsd \x0,\x1,\x2,\x0 -.endm - -#else - -.macro VFMADD231PD_ y0,y1,y2 - vfmadd231pd \y2,\y1,\y0 -.endm - -.macro VFMADD231SD_ x0,x1,x2 - vfmadd231sd \x2,\x1,\x0 -.endm - -#endif - - -#define A_PR1 512 -#define B_PR1 256 - -/******************************************************************************************* -* 3 lines of N -*******************************************************************************************/ - -.macro KERNEL16x3_SUBN - prefetcht0 A_PR1(AO) - vbroadcastsd -12 * SIZE(BO), %ymm1 - vmovaps -16 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -11 * SIZE(BO), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -10 * SIZE(BO), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovaps -12 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 A_PR1+64(AO) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovaps -8 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovaps -4 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - VFMADD231PD_ %ymm15,%ymm3,%ymm0 - addq $ 3*SIZE , BO - addq $ 16*SIZE, AO -.endm - - -.macro KERNEL8x3_SUBN - //prefetcht0 A_PR1(AO) - vbroadcastsd -12 * SIZE(BO), %ymm1 - vmovaps -16 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -11 * SIZE(BO), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -10 * SIZE(BO), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovaps -12 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - //prefetcht0 A_PR1+64(AO) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - prefetcht0 B_PR1(BO) - addq $ 3*SIZE , BO - addq $ 8*SIZE, AO -.endm - -.macro KERNEL4x3_SUBN - vbroadcastsd -12 
* SIZE(BO), %ymm1 - vmovaps -16 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -11 * SIZE(BO), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -10 * SIZE(BO), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - addq $ 3*SIZE , BO - addq $ 4*SIZE, AO -.endm - -.macro KERNEL2x3_SUBN - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -10 * SIZE(BO), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -15 * SIZE(AO), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 - addq $ 3*SIZE , BO - addq $ 2*SIZE, AO -.endm - -.macro KERNEL1x3_SUBN - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -10 * SIZE(BO), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - addq $ 3*SIZE , BO - addq $ 1*SIZE, AO -.endm - - - - - - -/******************************************************************************************/ - -.macro KERNEL16x3_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ 
%ymm15,%ymm3,%ymm0 -.endm - - - - -.macro KERNEL16x3_2 - prefetcht0 128+A_PR1(AO, %rax, SIZE) - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - prefetcht0 A_PR1+64(AO,%rax,SIZE) - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - prefetcht0 192+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm15,%ymm3,%ymm0 -.endm - -.macro KERNEL16x3_3 - prefetcht0 256+A_PR1(AO, %rax, SIZE) - vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 320+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 - vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm15,%ymm3,%ymm0 -.endm - -.macro KERNEL16x3_4 - prefetcht0 384+A_PR1(AO, %rax, SIZE) - vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 - 
VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 448+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - addq $12, BI - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - addq $64, %rax - VFMADD231PD_ %ymm15,%ymm3,%ymm0 -.endm - -.macro KERNEL16x3_SUB - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - VFMADD231PD_ %ymm15,%ymm3,%ymm0 - addq $3 , BI - addq $16, %rax -.endm - -.macro SAVE16x3 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm13, %ymm13 - - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm11, %ymm11 - vmulpd %ymm0 , %ymm14, %ymm14 - - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm9 , %ymm9 - vmulpd %ymm0 , %ymm12, %ymm12 - vmulpd %ymm0 , %ymm15, %ymm15 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 - vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 - - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 - vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 - vaddpd 12 * SIZE(CO1, LDC), 
%ymm14,%ymm14 - - vaddpd (CO1, LDC, 2), %ymm6,%ymm6 - vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 - vaddpd 8 * SIZE(CO1, LDC, 2), %ymm12,%ymm12 - vaddpd 12 * SIZE(CO1, LDC, 2), %ymm15,%ymm15 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - vmovups %ymm10, 8 * SIZE(CO1) - vmovups %ymm13,12 * SIZE(CO1) - - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm8 , 4 * SIZE(CO1, LDC) - vmovups %ymm11, 8 * SIZE(CO1, LDC) - vmovups %ymm14,12 * SIZE(CO1, LDC) - - vmovups %ymm6 , (CO1, LDC, 2) - vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) - vmovups %ymm12, 8 * SIZE(CO1, LDC, 2) - vmovups %ymm15,12 * SIZE(CO1, LDC, 2) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x3_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 -.endm - -.macro KERNEL8x3_2 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 -.endm - -.macro KERNEL8x3_3 - prefetcht0 128+A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 2 * SIZE(BO, BI, 
SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 -.endm - -.macro KERNEL8x3_4 - prefetcht0 192+A_PR1(AO, %rax, SIZE) - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - addq $12, BI - addq $32, %rax -.endm - -.macro KERNEL8x3_SUB - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - addq $3 , BI - addq $8 , %rax -.endm - -.macro SAVE8x3 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm8 , %ymm8 - - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm9 , %ymm9 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 - - vaddpd (CO1, LDC, 2), %ymm6,%ymm6 - vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm8 , 4 * SIZE(CO1, LDC) - - vmovups %ymm6 , (CO1, LDC, 2) - vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) - -.endm - - - 
-/*******************************************************************************************/ - -.macro KERNEL4x3_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 -.endm - -.macro KERNEL4x3_2 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 -.endm - -.macro KERNEL4x3_3 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 -.endm - -.macro KERNEL4x3_4 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - addq $12, BI - addq $16, %rax -.endm - -.macro KERNEL4x3_SUB - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - addq $3 , BI - addq $4 , %rax -.endm - -.macro SAVE4x3 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd (CO1, LDC), 
%ymm5,%ymm5 - vaddpd (CO1, LDC, 2), %ymm6,%ymm6 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (CO1, LDC, 2) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x3_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 -.endm - -.macro KERNEL2x3_2 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 -.endm - -.macro KERNEL2x3_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 -.endm - -.macro KERNEL2x3_4 - vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ 
%xmm12,%xmm3,%xmm0 - addq $12, BI - addq $8, %rax -.endm - -.macro KERNEL2x3_SUB - vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 - addq $3 , BI - addq $2 , %rax -.endm - -.macro SAVE2x3 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm10, %xmm10 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm12, %xmm12 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 - vaddsd (CO1, LDC), %xmm5,%xmm5 - vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 - vaddsd (CO1, LDC, 2), %xmm6,%xmm6 - vaddsd 1 * SIZE(CO1, LDC, 2), %xmm12,%xmm12 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm8 , 1 * SIZE(CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm10, 1 * SIZE(CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - vmovsd %xmm12, 1 * SIZE(CO1, LDC, 2) - -.endm - -/*******************************************************************************************/ - -.macro KERNEL1x3_1 - vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 -.endm - -.macro KERNEL1x3_2 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 -.endm - -.macro KERNEL1x3_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), 
%xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 -.endm - -.macro KERNEL1x3_4 - vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - addq $12, BI - addq $4, %rax -.endm - -.macro KERNEL1x3_SUB - vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - addq $3 , BI - addq $1 , %rax -.endm - -.macro SAVE1x3 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd (CO1, LDC), %xmm5,%xmm5 - vaddsd (CO1, LDC, 2), %xmm6,%xmm6 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -.macro KERNEL16x2_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - 
VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 -.endm - -.macro KERNEL16x2_2 - prefetcht0 128+A_PR1(AO, %rax, SIZE) - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 192+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 -.endm - -.macro KERNEL16x2_3 - prefetcht0 256+A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 320+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 -.endm - -.macro KERNEL16x2_4 - prefetcht0 384+A_PR1(AO, %rax, SIZE) - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 448+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 - 
VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - addq $8, BI - addq $64, %rax -.endm - -.macro KERNEL16x2_SUB - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - addq $2, BI - addq $16, %rax -.endm - -.macro SAVE16x2 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm13, %ymm13 - - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm11, %ymm11 - vmulpd %ymm0 , %ymm14, %ymm14 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 - vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 - - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 - vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 - vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - vmovups %ymm10, 8 * SIZE(CO1) - vmovups %ymm13,12 * SIZE(CO1) - - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm8 , 4 * SIZE(CO1, LDC) - vmovups %ymm11, 8 * SIZE(CO1, LDC) - vmovups %ymm14,12 * SIZE(CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x2_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -28 
* SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 -.endm - -.macro KERNEL8x2_2 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 -.endm - -.macro KERNEL8x2_3 - prefetcht0 128+A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 -.endm - -.macro KERNEL8x2_4 - prefetcht0 192+A_PR1(AO, %rax, SIZE) - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - addq $8, BI - addq $32, %rax -.endm - -.macro KERNEL8x2_SUB - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - addq $2, BI - addq $8 , %rax -.endm - -.macro SAVE8x2 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm8 , %ymm8 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 - -#endif - 
- vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm8 , 4 * SIZE(CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x2_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 -.endm - -.macro KERNEL4x2_2 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 -.endm - -.macro KERNEL4x2_3 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 -.endm - -.macro KERNEL4x2_4 - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - addq $8, BI - addq $16, %rax -.endm - -.macro KERNEL4x2_SUB - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - addq $2, BI - addq $4 , %rax -.endm - -.macro SAVE4x2 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd (CO1, LDC), %ymm5,%ymm5 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x2_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 - 
vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 -.endm - -.macro KERNEL2x2_2 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 -.endm - -.macro KERNEL2x2_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 -.endm - -.macro KERNEL2x2_4 - vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - addq $8, BI - addq $8, %rax -.endm - -.macro KERNEL2x2_SUB - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - addq $2, BI - addq $2, %rax -.endm - -.macro SAVE2x2 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm10, %xmm10 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 - vaddsd (CO1, LDC), %xmm5,%xmm5 - vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 - -#endif - - vmovsd %xmm4 
, (CO1) - vmovsd %xmm8 , 1 * SIZE(CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm10, 1 * SIZE(CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x2_1 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 -.endm - -.macro KERNEL1x2_2 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 -.endm - -.macro KERNEL1x2_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 -.endm - -.macro KERNEL1x2_4 - vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - addq $8, BI - addq $4, %rax -.endm - -.macro KERNEL1x2_SUB - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - addq $2, BI - addq $1, %rax -.endm - -.macro SAVE1x2 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd (CO1, LDC), %xmm5,%xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -.macro KERNEL16x1_1 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), 
%ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 -.endm - -.macro KERNEL16x1_2 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 -.endm - -.macro KERNEL16x1_3 - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 -.endm - -.macro KERNEL16x1_4 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - addq $4, BI - addq $64, %rax -.endm - -.macro KERNEL16x1_SUB - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - addq $1, BI - addq $16, %rax -.endm - -.macro SAVE16x1 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - 
vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm13, %ymm13 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 - vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - vmovups %ymm10, 8 * SIZE(CO1) - vmovups %ymm13,12 * SIZE(CO1) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x1_1 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 -.endm - -.macro KERNEL8x1_2 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 -.endm - -.macro KERNEL8x1_3 - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 -.endm - -.macro KERNEL8x1_4 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - addq $4, BI - addq $32, %rax -.endm - -.macro KERNEL8x1_SUB - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - addq $1, BI - addq $8 , %rax -.endm - -.macro SAVE8x1 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - -.endm - - - 
-/*******************************************************************************************/ - -.macro KERNEL4x1_1 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 -.endm - -.macro KERNEL4x1_2 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 -.endm - -.macro KERNEL4x1_3 - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 -.endm - -.macro KERNEL4x1_4 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - addq $4, BI - addq $16, %rax -.endm - -.macro KERNEL4x1_SUB - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - addq $1, BI - addq $4 , %rax -.endm - -.macro SAVE4x1 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - -#endif - - vmovups %ymm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x1_1 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 -.endm - -.macro KERNEL2x1_2 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 -.endm - -.macro KERNEL2x1_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 -.endm - -.macro KERNEL2x1_4 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 
- vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - addq $4, BI - addq $8, %rax -.endm - -.macro KERNEL2x1_SUB - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - addq $1, BI - addq $2 , %rax -.endm - -.macro SAVE2x1 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm8 , %xmm8 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm8 , 1 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x1_1 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 -.endm - -.macro KERNEL1x1_2 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 -.endm - -.macro KERNEL1x1_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 -.endm - -.macro KERNEL1x1_4 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - addq $ 4, BI - addq $ 4, %rax -.endm - -.macro KERNEL1x1_SUB - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - addq $ 1, BI - addq $ 1 , %rax -.endm - -.macro SAVE1x1 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - -#endif - - vmovsd %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - 
movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -.L6_01: - // copy to sub buffer - movq K, %rax - salq $1,%rax // K * 2 ; read 2 values - movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_01a_2 - ALIGN_4 - -.L6_01a_1: - - prefetcht0 512(BO1) - prefetcht0 512(BO2) - prefetchw 512(BO) - - - vmovups 0 * SIZE(BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm2 - vmovups 4 * SIZE(BO1), %xmm4 - vmovups 6 * SIZE(BO1), %xmm6 - vmovsd 0 * SIZE(BO2), %xmm1 - vmovsd 2 * SIZE(BO2), %xmm3 - vmovsd 4 * SIZE(BO2), %xmm5 - vmovsd 6 * SIZE(BO2), %xmm7 - vmovups %xmm0, 0*SIZE(BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovups %xmm2, 3*SIZE(BO) - vmovsd %xmm3, 5*SIZE(BO) - vmovups %xmm4, 6*SIZE(BO) - vmovsd %xmm5, 8*SIZE(BO) - vmovups %xmm6, 9*SIZE(BO) - vmovsd %xmm7,11*SIZE(BO) - addq $ 8*SIZE,BO1 - addq $ 8*SIZE,BO2 - addq $ 12*SIZE,BO - - vmovups 0 * SIZE(BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm2 - vmovups 
4 * SIZE(BO1), %xmm4 - vmovups 6 * SIZE(BO1), %xmm6 - vmovsd 0 * SIZE(BO2), %xmm1 - vmovsd 2 * SIZE(BO2), %xmm3 - vmovsd 4 * SIZE(BO2), %xmm5 - vmovsd 6 * SIZE(BO2), %xmm7 - vmovups %xmm0, 0*SIZE(BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovups %xmm2, 3*SIZE(BO) - vmovsd %xmm3, 5*SIZE(BO) - vmovups %xmm4, 6*SIZE(BO) - vmovsd %xmm5, 8*SIZE(BO) - vmovups %xmm6, 9*SIZE(BO) - vmovsd %xmm7,11*SIZE(BO) - addq $ 8*SIZE,BO1 - addq $ 8*SIZE,BO2 - addq $ 12*SIZE,BO - - decq %rax - jnz .L6_01a_1 - - - -.L6_01a_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_02c - ALIGN_4 - - -.L6_02b: - - vmovups 0 * SIZE(BO1), %xmm0 - vmovsd 0 * SIZE(BO2), %xmm2 - vmovups %xmm0, 0*SIZE(BO) - vmovsd %xmm2, 2*SIZE(BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO2 - addq $ 3*SIZE,BO - decq %rax - jnz .L6_02b - -.L6_02c: - - movq K, %rax - salq $1,%rax // K * 2 - leaq (B,%rax, SIZE), BO1 // next offset to BO1 - leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER2, BO // second buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_02c_2 - ALIGN_4 - -.L6_02c_1: - - prefetcht0 512(BO2) - prefetchw 512(BO) - - vmovups 0 * SIZE(BO2), %xmm0 - vmovups 2 * SIZE(BO2), %xmm2 - vmovups 4 * SIZE(BO2), %xmm4 - vmovups 6 * SIZE(BO2), %xmm6 - vmovsd 1 * SIZE(BO1), %xmm1 - vmovsd 3 * SIZE(BO1), %xmm3 - vmovsd 5 * SIZE(BO1), %xmm5 - vmovsd 7 * SIZE(BO1), %xmm7 - vmovsd %xmm1, 0*SIZE(BO) - vmovups %xmm0, 1*SIZE(BO) - vmovsd %xmm3, 3*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovsd %xmm5, 6*SIZE(BO) - vmovups %xmm4, 7*SIZE(BO) - vmovsd %xmm7, 9*SIZE(BO) - vmovups %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - - vmovups 0 * SIZE(BO2), %xmm0 - vmovups 2 * SIZE(BO2), %xmm2 - vmovups 4 * SIZE(BO2), %xmm4 - vmovups 6 * SIZE(BO2), %xmm6 - vmovsd 1 * SIZE(BO1), %xmm1 - vmovsd 3 * SIZE(BO1), %xmm3 - vmovsd 5 * SIZE(BO1), %xmm5 - vmovsd 7 * SIZE(BO1), %xmm7 - vmovsd %xmm1, 0*SIZE(BO) - vmovups %xmm0, 1*SIZE(BO) - vmovsd %xmm3, 3*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovsd %xmm5, 
6*SIZE(BO) - vmovups %xmm4, 7*SIZE(BO) - vmovsd %xmm7, 9*SIZE(BO) - vmovups %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_02c_1 - - -.L6_02c_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_03c - ALIGN_4 - -.L6_03b: - - vmovsd 1*SIZE(BO1), %xmm0 - vmovups 0*SIZE(BO2), %xmm1 - vmovsd %xmm0, 0*SIZE(BO) - vmovups %xmm1, 1*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_03b - - -.L6_03c: - - movq BO2, B // next offset of B - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - prefetcht0 (CO1) - prefetcht0 (CO1,LDC,1) - prefetcht0 (CO1,LDC,2) - prefetcht0 64(CO1) - prefetcht0 64(CO1,LDC,1) - prefetcht0 64(CO1,LDC,2) - - vzeroall - - movq K, %rax - - sarq $1, %rax // K / 8 - je .L6_16 - - ALIGN_5 - -.L6_12: -/* - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - prefetcht0 B_PR1+128(BO) -*/ - KERNEL16x3_SUBN - KERNEL16x3_SUBN -/* - KERNEL16x3_SUBN - KERNEL16x3_SUBN - - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN -*/ - dec %rax - jne .L6_12 - -.L6_16: - movq K, %rax - - andq $1, %rax # if (k & 1) - je .L6_19 - - ALIGN_4 - -.L6_17: - - KERNEL16x3_SUBN - - dec %rax - jne .L6_17 - ALIGN_4 - - -.L6_19: - - SAVE16x3 - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $15, M - jz .L7_10 // to next 3 lines of N - - testq $8, M - jz .L6_21pre - ALIGN_4 - -/**************************************************************************/ - -.L6_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, 
BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L6_20_6 - - ALIGN_4 - -.L6_20_2: - - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - dec %rax - jne .L6_20_2 - ALIGN_4 - -.L6_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_20_9 - - - ALIGN_4 - -.L6_20_7: - - KERNEL8x3_SUBN - - dec %rax - jne .L6_20_7 - ALIGN_4 - - -.L6_20_9: - - SAVE8x3 - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L6_21pre: - - testq $4, M - jz .L6_30 - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L6_26 - - ALIGN_4 - -.L6_22: - - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - dec %rax - jne .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - ALIGN_4 - -.L6_27: - - KERNEL4x3_SUBN - - dec %rax - jne .L6_27 - ALIGN_4 - - -.L6_29: - - SAVE4x3 - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L6_36 - ALIGN_4 - -.L6_32: - - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - dec %rax - jne .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - ALIGN_4 - -.L6_37: - - KERNEL2x3_SUBN - - dec %rax - jne .L6_37 - ALIGN_4 - - -.L6_39: - - SAVE2x3 - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L7_10 // to next 3 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3,%rax - je .L6_46 - - 
ALIGN_4 - -.L6_42: - - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - - dec %rax - jne .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - ALIGN_4 - -.L6_47: - - KERNEL1x3_SUBN - - dec %rax - jne .L6_47 - ALIGN_4 - - -.L6_49: - - SAVE1x3 - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - -/***************************************************************************************************************/ - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L7_20 - - ALIGN_4 - -.L7_11: - leaq BUFFER2, BO // second buffer to BO - addq $12 * SIZE, BO - - prefetcht0 (CO1) - prefetcht0 (CO1,LDC,1) - prefetcht0 (CO1,LDC,2) - prefetcht0 64(CO1) - prefetcht0 64(CO1,LDC,1) - prefetcht0 64(CO1,LDC,2) - - vzeroall - - movq K, %rax - - sarq $3, %rax // K / 8 - je .L7_16 - ALIGN_5 - -.L7_12: -/* - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - prefetcht0 B_PR1+128(BO) -*/ - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN - - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN - dec %rax - jne .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - ALIGN_5 - -.L7_17: - - KERNEL16x3_SUBN - - dec %rax - jne .L7_17 - - -.L7_19: - - SAVE16x3 - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L7_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_20: - // Test rest of M - - testq $15, M - jz .L7_60 // to next 3 lines of N - - testq $8, M - jz .L7_21pre - ALIGN_4 - -/**************************************************************************/ - -.L7_20_1: - leaq BUFFER2, BO // first buffer to BO - addq 
$12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L7_20_6 - - ALIGN_4 - -.L7_20_2: - - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - - dec %rax - jne .L7_20_2 - ALIGN_4 - -.L7_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_20_9 - - ALIGN_4 - -.L7_20_7: - - KERNEL8x3_SUBN - - dec %rax - jne .L7_20_7 - ALIGN_4 - -.L7_20_9: - - SAVE8x3 - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L7_21pre: - - testq $4, M - jz .L7_30 - ALIGN_4 - -.L7_21: - leaq BUFFER2, BO // second buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L7_26 - - ALIGN_4 - -.L7_22: - - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - - dec %rax - jne .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - ALIGN_4 - -.L7_27: - - KERNEL4x3_SUBN - - dec %rax - jne .L7_27 - ALIGN_4 - - -.L7_29: - - SAVE4x3 - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER2, BO // second buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L7_36 - - ALIGN_4 - -.L7_32: - - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - - dec %rax - jne .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - ALIGN_4 - -.L7_37: - - KERNEL2x3_SUBN - - dec %rax - jne .L7_37 - ALIGN_4 - - -.L7_39: - - SAVE2x3 - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L7_40: - testq $1, M - jz .L7_60 // to next 3 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER2, BO // second buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, 
%rax - je .L7_46 - - ALIGN_4 - -.L7_42: - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - - dec %rax - jne .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - ALIGN_4 - -.L7_47: - - KERNEL1x3_SUBN - - dec %rax - jne .L7_47 - ALIGN_4 - - -.L7_49: - - SAVE1x3 - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L7_60: - - decq J // j -- - jg .L6_01 - - -.L2_0: - cmpq $0, Nmod6 // N % 6 == 0 - je .L999 - -/************************************************************************************************ -* Loop for Nmod6 / 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - sarq $1, J // j = j / 2 - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 2*SIZE(BO1), %xmm1 - vmovups 4*SIZE(BO1), %xmm2 - vmovups 6*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 2*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovups %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - 
salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI,8) - 
KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 
- -.L2_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - je .L2_36 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - je .L2_46 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - 
ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - -.L2_60: - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - - addq $16 
* SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - je .L1_20_6 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - 
KERNEL4x1_4 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - je .L1_36 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1 - 
KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - je .L1_46 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - 
movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 2*SIZE(BO1), %xmm1 - vmovups 4*SIZE(BO1), %xmm2 - vmovups 6*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 2*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovups %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 
* SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI 
= BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - je .L2_20_6 - - 
prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef 
LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - 
movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - je .L2_36 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && 
defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - je .L2_46 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq 
KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - 
addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) 
&& !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - je .L1_20_6 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * 
SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - je .L1_36 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - 
KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - je .L1_46 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - 
KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - -#endif +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +/********************************************************************* +* 2013/10/20 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK + +* +* +* 2013/10/20 Saar +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 2 +* DGEMM_DEFAULT_UNROLL_M 16 +* DGEMM_DEFAULT_P 192 +* DGEMM_DEFAULT_Q 128 +* A_PR1 512 +* +* +* Performance without prefetch of B: +* 1 thread: 45.8 GFLOPS (MKL: 45) +* 2 threads: 80.0 GFLOPS (MKL: 91) +* 4 threads: 135.0 GFLOPS (MKL: 135) +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define 
STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +.macro VFMADD231PD_ y0,y1,y2 + vfmaddpd \y0,\y1,\y2,\y0 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmaddsd \x0,\x1,\x2,\x0 +.endm + +#else + +.macro VFMADD231PD_ y0,y1,y2 + vfmadd231pd \y2,\y1,\y0 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmadd231sd \x2,\x1,\x0 +.endm + +#endif + + +#define A_PR1 512 +#define B_PR1 256 + +/******************************************************************************************* +* 3 lines of N +*******************************************************************************************/ + +.macro KERNEL16x3_SUBN + prefetcht0 A_PR1(AO) + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovaps -12 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 A_PR1+64(AO) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovaps -8 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovaps -4 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 + addq $ 3*SIZE , BO + addq $ 16*SIZE, AO +.endm + + +.macro KERNEL8x3_SUBN + //prefetcht0 A_PR1(AO) + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovaps -12 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + //prefetcht0 A_PR1+64(AO) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + prefetcht0 B_PR1(BO) + addq $ 3*SIZE , BO + addq $ 8*SIZE, AO +.endm + +.macro KERNEL4x3_SUBN + vbroadcastsd -12 
* SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $ 3*SIZE , BO + addq $ 4*SIZE, AO +.endm + +.macro KERNEL2x3_SUBN + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -10 * SIZE(BO), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -15 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $ 3*SIZE , BO + addq $ 2*SIZE, AO +.endm + +.macro KERNEL1x3_SUBN + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -10 * SIZE(BO), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $ 3*SIZE , BO + addq $ 1*SIZE, AO +.endm + + + + + + +/******************************************************************************************/ + +.macro KERNEL16x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ 
%ymm15,%ymm3,%ymm0 +.endm + + + + +.macro KERNEL16x3_2 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + prefetcht0 A_PR1+64(AO,%rax,SIZE) + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + +.macro KERNEL16x3_3 + prefetcht0 256+A_PR1(AO, %rax, SIZE) + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 320+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + +.macro KERNEL16x3_4 + prefetcht0 384+A_PR1(AO, %rax, SIZE) + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + 
VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 448+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + addq $12, BI + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $64, %rax + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + +.macro KERNEL16x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 + addq $3 , BI + addq $16, %rax +.endm + +.macro SAVE16x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm11, %ymm11 + vmulpd %ymm0 , %ymm14, %ymm14 + + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm12, %ymm12 + vmulpd %ymm0 , %ymm15, %ymm15 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 + vaddpd 12 * SIZE(CO1, LDC), 
%ymm14,%ymm14 + + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 + vaddpd 8 * SIZE(CO1, LDC, 2), %ymm12,%ymm12 + vaddpd 12 * SIZE(CO1, LDC, 2), %ymm15,%ymm15 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + vmovups %ymm11, 8 * SIZE(CO1, LDC) + vmovups %ymm14,12 * SIZE(CO1, LDC) + + vmovups %ymm6 , (CO1, LDC, 2) + vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %ymm12, 8 * SIZE(CO1, LDC, 2) + vmovups %ymm15,12 * SIZE(CO1, LDC, 2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_2 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_3 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, 
SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_4 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $12, BI + addq $32, %rax +.endm + +.macro KERNEL8x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $3 , BI + addq $8 , %rax +.endm + +.macro SAVE8x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm9 , %ymm9 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + + vmovups %ymm6 , (CO1, LDC, 2) + vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) + +.endm + + + 
+/*******************************************************************************************/ + +.macro KERNEL4x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_2 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_3 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_4 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $12, BI + addq $16, %rax +.endm + +.macro KERNEL4x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $3 , BI + addq $4 , %rax +.endm + +.macro SAVE4x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd (CO1, LDC), 
%ymm5,%ymm5 + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ 
%xmm12,%xmm3,%xmm0 + addq $12, BI + addq $8, %rax +.endm + +.macro KERNEL2x3_SUB + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $3 , BI + addq $2 , %rax +.endm + +.macro SAVE2x3 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm12, %xmm12 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + vaddsd 1 * SIZE(CO1, LDC, 2), %xmm12,%xmm12 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm10, 1 * SIZE(CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + vmovsd %xmm12, 1 * SIZE(CO1, LDC, 2) + +.endm + +/*******************************************************************************************/ + +.macro KERNEL1x3_1 + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), 
%xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $12, BI + addq $4, %rax +.endm + +.macro KERNEL1x3_SUB + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $3 , BI + addq $1 , %rax +.endm + +.macro SAVE1x3 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + 
VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_2 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_3 + prefetcht0 256+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 320+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_4 + prefetcht0 384+A_PR1(AO, %rax, SIZE) + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 448+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + 
VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $8, BI + addq $64, %rax +.endm + +.macro KERNEL16x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $2, BI + addq $16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm11, %ymm11 + vmulpd %ymm0 , %ymm14, %ymm14 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 + vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + vmovups %ymm11, 8 * SIZE(CO1, LDC) + vmovups %ymm14,12 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 
// NOTE(review): this region is a unified-diff hunk that was flattened onto a
// few very long lines; it is x86-64 AT&T assembly (GAS macros), not YAML.
// The text below restores one statement per line with all instruction tokens
// unchanged; only comments were added and the diff '+' separators dropped.
//
// Conventions visible in this chunk:
//   AO, BO     - pointers into the packed A / packed B buffers
//   %rax, BI   - running element indices into A and B (scaled by SIZE,
//                which is sizeof(double) here given the vmovsd/vbroadcastsd use)
//   CO1, LDC   - C output pointer and leading dimension
//   ymm4/5/7/8/10/13, xmm4/5/8/10 - accumulators (they are the registers
//                scaled by ALPHA and combined with C in the SAVE* macros)
//   VFMADD231PD_ / VFMADD231SD_ - fused-multiply-add macros defined earlier
//                in this file; presumably first operand is the accumulator
//                (231 form: dst += src2*src3) -- confirm against the macro
//                definitions above this chunk.
//   A_PR1      - prefetch distance constant defined earlier in the file.

// Tail of KERNEL8x2_1 (the macro's opening lines lie before this chunk;
// reproduced verbatim).
* SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm7,%ymm1,%ymm0
        VFMADD231PD_    %ymm8,%ymm2,%ymm0
.endm

// KERNEL8x2_{2,3,4}: steps 2..4 of a 4-way unrolled K-loop body for the
// 8x2 tile: two 4-double loads of A, two broadcast B values per step.
// Accumulators: ymm4/ymm7 (B column 0 via ymm1), ymm5/ymm8 (B column 1
// via ymm2).  Only step 4 advances the indices (BI += 8, %rax += 32).
.macro KERNEL8x2_2
        prefetcht0      64+A_PR1(AO, %rax, SIZE)
        vbroadcastsd    -2 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -24 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
        vbroadcastsd    -1 * SIZE(BO, BI, SIZE), %ymm2
        VFMADD231PD_    %ymm5,%ymm2,%ymm0
        vmovups         -20 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm7,%ymm1,%ymm0
        VFMADD231PD_    %ymm8,%ymm2,%ymm0
.endm

.macro KERNEL8x2_3
        prefetcht0      128+A_PR1(AO, %rax, SIZE)
        vbroadcastsd    0 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -16 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
        vbroadcastsd    1 * SIZE(BO, BI, SIZE), %ymm2
        VFMADD231PD_    %ymm5,%ymm2,%ymm0
        vmovups         -12 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm7,%ymm1,%ymm0
        VFMADD231PD_    %ymm8,%ymm2,%ymm0
.endm

.macro KERNEL8x2_4
        prefetcht0      192+A_PR1(AO, %rax, SIZE)
        vbroadcastsd    2 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -8 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
        vbroadcastsd    3 * SIZE(BO, BI, SIZE), %ymm2
        VFMADD231PD_    %ymm5,%ymm2,%ymm0
        vmovups         -4 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm7,%ymm1,%ymm0
        VFMADD231PD_    %ymm8,%ymm2,%ymm0
        // advance indices for the whole 4-step group
        addq    $8, BI
        addq    $32, %rax
.endm

// KERNEL8x2_SUB: single (non-unrolled) K step for the 8x2 tile, used for
// the K%unroll remainder; advances BI by 2 and %rax by 8 per step.
.macro KERNEL8x2_SUB
        vbroadcastsd    -4 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -32 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
        vbroadcastsd    -3 * SIZE(BO, BI, SIZE), %ymm2
        VFMADD231PD_    %ymm5,%ymm2,%ymm0
        vmovups         -28 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm7,%ymm1,%ymm0
        VFMADD231PD_    %ymm8,%ymm2,%ymm0
        addq    $2, BI
        addq    $8 , %rax
.endm

// SAVE8x2: writes the 8x2 tile:  C = alpha*acc  (plus the old C values
// when not built as a TRMM kernel).
.macro SAVE8x2

        vbroadcastsd    ALPHA, %ymm0

        vmulpd  %ymm0 , %ymm4 , %ymm4
        vmulpd  %ymm0 , %ymm7 , %ymm7

        vmulpd  %ymm0 , %ymm5 , %ymm5
        vmulpd  %ymm0 , %ymm8 , %ymm8

#if !defined(TRMMKERNEL)

        vaddpd  (CO1), %ymm4,%ymm4
        vaddpd  4 * SIZE(CO1), %ymm7,%ymm7

        vaddpd  (CO1, LDC), %ymm5,%ymm5
        vaddpd  4 * SIZE(CO1, LDC), %ymm8,%ymm8

#endif

        vmovups %ymm4 , (CO1)
        vmovups %ymm7 , 4 * SIZE(CO1)

        vmovups %ymm5 , (CO1, LDC)
        vmovups %ymm8 , 4 * SIZE(CO1, LDC)

.endm

/*******************************************************************************************/

// KERNEL4x2_*: same structure as KERNEL8x2_* but for a 4x2 tile (one ymm
// load of A per step).  Accumulators: ymm4 (column 0), ymm5 (column 1).
.macro KERNEL4x2_1
        prefetcht0      A_PR1(AO, %rax, SIZE)
        vbroadcastsd    -4 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -32 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
        vbroadcastsd    -3 * SIZE(BO, BI, SIZE), %ymm2
        VFMADD231PD_    %ymm5,%ymm2,%ymm0
.endm

.macro KERNEL4x2_2
        vbroadcastsd    -2 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -28 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
        vbroadcastsd    -1 * SIZE(BO, BI, SIZE), %ymm2
        VFMADD231PD_    %ymm5,%ymm2,%ymm0
.endm

.macro KERNEL4x2_3
        prefetcht0      64+A_PR1(AO, %rax, SIZE)
        vbroadcastsd    0 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -24 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
        vbroadcastsd    1 * SIZE(BO, BI, SIZE), %ymm2
        VFMADD231PD_    %ymm5,%ymm2,%ymm0
.endm

.macro KERNEL4x2_4
        vbroadcastsd    2 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -20 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
        vbroadcastsd    3 * SIZE(BO, BI, SIZE), %ymm2
        VFMADD231PD_    %ymm5,%ymm2,%ymm0
        addq    $8, BI
        addq    $16, %rax
.endm

.macro KERNEL4x2_SUB
        vbroadcastsd    -4 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -32 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
        vbroadcastsd    -3 * SIZE(BO, BI, SIZE), %ymm2
        VFMADD231PD_    %ymm5,%ymm2,%ymm0
        addq    $2, BI
        addq    $4 , %rax
.endm

// SAVE4x2: store the 4x2 tile (alpha-scale, optional C accumulate).
.macro SAVE4x2

        vbroadcastsd    ALPHA, %ymm0

        vmulpd  %ymm0 , %ymm4 , %ymm4
        vmulpd  %ymm0 , %ymm5 , %ymm5

#if !defined(TRMMKERNEL)

        vaddpd  (CO1), %ymm4,%ymm4
        vaddpd  (CO1, LDC), %ymm5,%ymm5

#endif

        vmovups %ymm4 , (CO1)
        vmovups %ymm5 , (CO1, LDC)

.endm

/*******************************************************************************************/

// KERNEL2x2_*: scalar (vmovsd) 2x2 tile.  Accumulators: xmm4/xmm8 for
// B column 0 (via xmm1), xmm5/xmm10 for B column 1 (via xmm2).
.macro KERNEL2x2_1
        prefetcht0      A_PR1(AO, %rax, SIZE)
        vmovsd  -4 * SIZE(BO, BI, SIZE), %xmm1
        vmovsd  -32 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm4,%xmm1,%xmm0
        vmovsd  -3 * SIZE(BO, BI, SIZE), %xmm2
        VFMADD231SD_    %xmm5,%xmm2,%xmm0
        vmovsd  -31 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm8,%xmm1,%xmm0
        VFMADD231SD_    %xmm10,%xmm2,%xmm0
.endm

.macro KERNEL2x2_2
        vmovsd  -2 * SIZE(BO, BI, SIZE), %xmm1
        vmovsd  -30 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm4,%xmm1,%xmm0
        vmovsd  -1 * SIZE(BO, BI, SIZE), %xmm2
        VFMADD231SD_    %xmm5,%xmm2,%xmm0
        vmovsd  -29 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm8,%xmm1,%xmm0
        VFMADD231SD_    %xmm10,%xmm2,%xmm0
.endm

.macro KERNEL2x2_3
        vmovsd  0 * SIZE(BO, BI, SIZE), %xmm1
        vmovsd  -28 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm4,%xmm1,%xmm0
        vmovsd  1 * SIZE(BO, BI, SIZE), %xmm2
        VFMADD231SD_    %xmm5,%xmm2,%xmm0
        vmovsd  -27 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm8,%xmm1,%xmm0
        VFMADD231SD_    %xmm10,%xmm2,%xmm0
.endm

.macro KERNEL2x2_4
        vmovsd  2 * SIZE(BO, BI, SIZE), %xmm1
        vmovsd  -26 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm4,%xmm1,%xmm0
        vmovsd  3 * SIZE(BO, BI, SIZE), %xmm2
        VFMADD231SD_    %xmm5,%xmm2,%xmm0
        vmovsd  -25 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm8,%xmm1,%xmm0
        VFMADD231SD_    %xmm10,%xmm2,%xmm0
        addq    $8, BI
        addq    $8, %rax
.endm

.macro KERNEL2x2_SUB
        vmovsd  -4 * SIZE(BO, BI, SIZE), %xmm1
        vmovsd  -32 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm4,%xmm1,%xmm0
        vmovsd  -3 * SIZE(BO, BI, SIZE), %xmm2
        VFMADD231SD_    %xmm5,%xmm2,%xmm0
        vmovsd  -31 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm8,%xmm1,%xmm0
        VFMADD231SD_    %xmm10,%xmm2,%xmm0
        addq    $2, BI
        addq    $2, %rax
.endm

// SAVE2x2: store the 2x2 tile (alpha-scale, optional C accumulate).
.macro SAVE2x2

        vmovsd  ALPHA, %xmm0

        vmulsd  %xmm0 , %xmm4 , %xmm4
        vmulsd  %xmm0 , %xmm8 , %xmm8
        vmulsd  %xmm0 , %xmm5 , %xmm5
        vmulsd  %xmm0 , %xmm10, %xmm10

#if !defined(TRMMKERNEL)

        vaddsd  (CO1), %xmm4,%xmm4
        vaddsd  1 * SIZE(CO1), %xmm8,%xmm8
        vaddsd  (CO1, LDC), %xmm5,%xmm5
        vaddsd  1 * SIZE(CO1, LDC), %xmm10,%xmm10

#endif

        vmovsd  %xmm4 , (CO1)
        vmovsd  %xmm8 , 1 * SIZE(CO1)
        vmovsd  %xmm5 , (CO1, LDC)
        vmovsd  %xmm10, 1 * SIZE(CO1, LDC)

.endm

/*******************************************************************************************/

// KERNEL1x2_*: 1x2 tile, one A element per step.  Accumulators: xmm4, xmm5.
.macro KERNEL1x2_1
        vmovsd  -4 * SIZE(BO, BI, SIZE), %xmm1
        vmovsd  -32 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm4,%xmm1,%xmm0
        vmovsd  -3 * SIZE(BO, BI, SIZE), %xmm2
        VFMADD231SD_    %xmm5,%xmm2,%xmm0
.endm

.macro KERNEL1x2_2
        vmovsd  -2 * SIZE(BO, BI, SIZE), %xmm1
        vmovsd  -31 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm4,%xmm1,%xmm0
        vmovsd  -1 * SIZE(BO, BI, SIZE), %xmm2
        VFMADD231SD_    %xmm5,%xmm2,%xmm0
.endm

.macro KERNEL1x2_3
        vmovsd  0 * SIZE(BO, BI, SIZE), %xmm1
        vmovsd  -30 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm4,%xmm1,%xmm0
        vmovsd  1 * SIZE(BO, BI, SIZE), %xmm2
        VFMADD231SD_    %xmm5,%xmm2,%xmm0
.endm

.macro KERNEL1x2_4
        vmovsd  2 * SIZE(BO, BI, SIZE), %xmm1
        vmovsd  -29 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm4,%xmm1,%xmm0
        vmovsd  3 * SIZE(BO, BI, SIZE), %xmm2
        VFMADD231SD_    %xmm5,%xmm2,%xmm0
        addq    $8, BI
        addq    $4, %rax
.endm

.macro KERNEL1x2_SUB
        vmovsd  -4 * SIZE(BO, BI, SIZE), %xmm1
        vmovsd  -32 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm4,%xmm1,%xmm0
        vmovsd  -3 * SIZE(BO, BI, SIZE), %xmm2
        VFMADD231SD_    %xmm5,%xmm2,%xmm0
        addq    $2, BI
        addq    $1, %rax
.endm

// SAVE1x2: store the 1x2 tile (alpha-scale, optional C accumulate).
.macro SAVE1x2

        vmovsd  ALPHA, %xmm0

        vmulsd  %xmm0 , %xmm4 , %xmm4
        vmulsd  %xmm0 , %xmm5 , %xmm5

#if !defined(TRMMKERNEL)

        vaddsd  (CO1), %xmm4,%xmm4
        vaddsd  (CO1, LDC), %xmm5,%xmm5

#endif

        vmovsd  %xmm4 , (CO1)
        vmovsd  %xmm5 , (CO1, LDC)

.endm

/*******************************************************************************************/

/*******************************************************************************************
* 1 line of N
*******************************************************************************************/

// KERNEL16x1_*: 16x1 tile -- one broadcast B value, four 4-double loads
// of A per step.  Accumulators: ymm4, ymm7, ymm10, ymm13.
.macro KERNEL16x1_1
        vbroadcastsd    -2 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -32 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
        vmovups         -28 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm7,%ymm1,%ymm0
        vmovups         -24 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm10,%ymm1,%ymm0
        vmovups         -20 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm13,%ymm1,%ymm0
.endm

.macro KERNEL16x1_2
        vbroadcastsd    -1 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -16 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
        vmovups         -12 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm7,%ymm1,%ymm0
        vmovups         -8 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm10,%ymm1,%ymm0
        vmovups         -4 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm13,%ymm1,%ymm0
.endm

.macro KERNEL16x1_3
        vbroadcastsd    0 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         0 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
        vmovups         4 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm7,%ymm1,%ymm0
        vmovups         8 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm10,%ymm1,%ymm0
        vmovups         12 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm13,%ymm1,%ymm0
.endm

.macro KERNEL16x1_4
        vbroadcastsd    1 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         16 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
        vmovups         20 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm7,%ymm1,%ymm0
        vmovups         24 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm10,%ymm1,%ymm0
        vmovups         28 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm13,%ymm1,%ymm0
        addq    $4, BI
        addq    $64, %rax
.endm

.macro KERNEL16x1_SUB
        vbroadcastsd    -2 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -32 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
        vmovups         -28 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm7,%ymm1,%ymm0
        vmovups         -24 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm10,%ymm1,%ymm0
        vmovups         -20 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm13,%ymm1,%ymm0
        addq    $1, BI
        addq    $16, %rax
.endm

// SAVE16x1: store the 16x1 tile (alpha-scale, optional C accumulate).
.macro SAVE16x1

        vbroadcastsd    ALPHA, %ymm0

        vmulpd  %ymm0 , %ymm4 , %ymm4
        vmulpd  %ymm0 , %ymm7 , %ymm7
        vmulpd  %ymm0 , %ymm10, %ymm10
        vmulpd  %ymm0 , %ymm13, %ymm13

#if !defined(TRMMKERNEL)

        vaddpd  (CO1), %ymm4,%ymm4
        vaddpd  4 * SIZE(CO1), %ymm7,%ymm7
        vaddpd  8 * SIZE(CO1), %ymm10,%ymm10
        vaddpd  12 * SIZE(CO1), %ymm13,%ymm13

#endif

        vmovups %ymm4 , (CO1)
        vmovups %ymm7 , 4 * SIZE(CO1)
        vmovups %ymm10, 8 * SIZE(CO1)
        vmovups %ymm13,12 * SIZE(CO1)

.endm

/*******************************************************************************************/

// KERNEL8x1_*: 8x1 tile -- one broadcast B value, two ymm loads of A.
// Accumulators: ymm4, ymm7.
.macro KERNEL8x1_1
        vbroadcastsd    -2 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -32 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
        vmovups         -28 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm7,%ymm1,%ymm0
.endm

.macro KERNEL8x1_2
        vbroadcastsd    -1 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -24 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
        vmovups         -20 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm7,%ymm1,%ymm0
.endm

.macro KERNEL8x1_3
        vbroadcastsd    0 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -16 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
        vmovups         -12 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm7,%ymm1,%ymm0
.endm

.macro KERNEL8x1_4
        vbroadcastsd    1 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -8 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
        vmovups         -4 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm7,%ymm1,%ymm0
        addq    $4, BI
        addq    $32, %rax
.endm

.macro KERNEL8x1_SUB
        vbroadcastsd    -2 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -32 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
        vmovups         -28 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm7,%ymm1,%ymm0
        addq    $1, BI
        addq    $8 , %rax
.endm

// SAVE8x1: store the 8x1 tile (alpha-scale, optional C accumulate).
.macro SAVE8x1

        vbroadcastsd    ALPHA, %ymm0

        vmulpd  %ymm0 , %ymm4 , %ymm4
        vmulpd  %ymm0 , %ymm7 , %ymm7

#if !defined(TRMMKERNEL)

        vaddpd  (CO1), %ymm4,%ymm4
        vaddpd  4 * SIZE(CO1), %ymm7,%ymm7

#endif

        vmovups %ymm4 , (CO1)
        vmovups %ymm7 , 4 * SIZE(CO1)

.endm

/*******************************************************************************************/

// NOTE(review): flattened diff hunk restored to one statement per line;
// instruction tokens unchanged, only comments added.
//
// KERNEL4x1_*: 4x1 tile -- one broadcast B value, one ymm load of A.
// Accumulator: ymm4.  Step 4 / SUB advance the B/A indices.
.macro KERNEL4x1_1
        vbroadcastsd    -2 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -32 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
.endm

.macro KERNEL4x1_2
        vbroadcastsd    -1 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -28 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
.endm

.macro KERNEL4x1_3
        vbroadcastsd    0 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -24 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
.endm

.macro KERNEL4x1_4
        vbroadcastsd    1 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -20 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
        addq    $4, BI
        addq    $16, %rax
.endm

.macro KERNEL4x1_SUB
        vbroadcastsd    -2 * SIZE(BO, BI, SIZE), %ymm1
        vmovups         -32 * SIZE(AO, %rax, SIZE), %ymm0
        VFMADD231PD_    %ymm4,%ymm1,%ymm0
        addq    $1, BI
        addq    $4 , %rax
.endm

// SAVE4x1: store the 4x1 tile (alpha-scale, optional C accumulate).
.macro SAVE4x1

        vbroadcastsd    ALPHA, %ymm0

        vmulpd  %ymm0 , %ymm4 , %ymm4

#if !defined(TRMMKERNEL)

        vaddpd  (CO1), %ymm4,%ymm4

#endif

        vmovups %ymm4 , (CO1)

.endm

/*******************************************************************************************/

// KERNEL2x1_*: scalar 2x1 tile.  Accumulators: xmm4, xmm8.
.macro KERNEL2x1_1
        vmovsd  -2 * SIZE(BO, BI, SIZE), %xmm1
        vmovsd  -32 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm4,%xmm1,%xmm0
        vmovsd  -31 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm8,%xmm1,%xmm0
.endm

.macro KERNEL2x1_2
        vmovsd  -1 * SIZE(BO, BI, SIZE), %xmm1
        vmovsd  -30 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm4,%xmm1,%xmm0
        vmovsd  -29 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm8,%xmm1,%xmm0
.endm

.macro KERNEL2x1_3
        vmovsd  0 * SIZE(BO, BI, SIZE), %xmm1
        vmovsd  -28 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm4,%xmm1,%xmm0
        vmovsd  -27 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm8,%xmm1,%xmm0
.endm

.macro KERNEL2x1_4
        vmovsd  1 * SIZE(BO, BI, SIZE), %xmm1
        vmovsd  -26 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm4,%xmm1,%xmm0
        vmovsd  -25 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm8,%xmm1,%xmm0
        addq    $4, BI
        addq    $8, %rax
.endm

.macro KERNEL2x1_SUB
        vmovsd  -2 * SIZE(BO, BI, SIZE), %xmm1
        vmovsd  -32 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm4,%xmm1,%xmm0
        vmovsd  -31 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm8,%xmm1,%xmm0
        addq    $1, BI
        addq    $2 , %rax
.endm

// SAVE2x1: store the 2x1 tile (alpha-scale, optional C accumulate).
.macro SAVE2x1

        vmovsd  ALPHA, %xmm0

        vmulsd  %xmm0 , %xmm4 , %xmm4
        vmulsd  %xmm0 , %xmm8 , %xmm8

#if !defined(TRMMKERNEL)

        vaddsd  (CO1), %xmm4,%xmm4
        vaddsd  1 * SIZE(CO1), %xmm8,%xmm8

#endif

        vmovsd  %xmm4 , (CO1)
        vmovsd  %xmm8 , 1 * SIZE(CO1)

.endm

/*******************************************************************************************/

// KERNEL1x1_*: 1x1 tile.  Accumulator: xmm4.
.macro KERNEL1x1_1
        vmovsd  -2 * SIZE(BO, BI, SIZE), %xmm1
        vmovsd  -32 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm4,%xmm1,%xmm0
.endm

.macro KERNEL1x1_2
        vmovsd  -1 * SIZE(BO, BI, SIZE), %xmm1
        vmovsd  -31 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm4,%xmm1,%xmm0
.endm

.macro KERNEL1x1_3
        vmovsd  0 * SIZE(BO, BI, SIZE), %xmm1
        vmovsd  -30 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm4,%xmm1,%xmm0
.endm

.macro KERNEL1x1_4
        vmovsd  1 * SIZE(BO, BI, SIZE), %xmm1
        vmovsd  -29 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm4,%xmm1,%xmm0
        addq    $ 4, BI
        addq    $ 4, %rax
.endm

.macro KERNEL1x1_SUB
        vmovsd  -2 * SIZE(BO, BI, SIZE), %xmm1
        vmovsd  -32 * SIZE(AO, %rax, SIZE), %xmm0
        VFMADD231SD_    %xmm4,%xmm1,%xmm0
        addq    $ 1, BI
        addq    $ 1 , %rax
.endm

// SAVE1x1: store the single element (alpha-scale, optional C accumulate).
.macro SAVE1x1

        vmovsd  ALPHA, %xmm0

        vmulsd  %xmm0 , %xmm4 , %xmm4

#if !defined(TRMMKERNEL)

        vaddsd  (CO1), %xmm4,%xmm4

#endif

        vmovsd  %xmm4 , (CO1)

.endm

/*******************************************************************************************/

// Entry of the non-TRMM GEMM driver routine: spill callee-saved GPRs.
// PROLOGUE / PROFCODE / STACKSIZE are macros from common headers not
// visible in this chunk; the routine body continues below.
#if !defined(TRMMKERNEL)


        PROLOGUE
        PROFCODE

        subq    $STACKSIZE, %rsp
        movq    %rbx, (%rsp)
        movq    %rbp, 8(%rsp)
        movq    %r12, 16(%rsp)
        movq    %r13, 24(%rsp)
        movq    %r14, 32(%rsp)

movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 4 * SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 
4 * SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovups 0 * SIZE(BO1), %xmm0 + vmovsd 0 * SIZE(BO2), %xmm2 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm2, 2*SIZE(BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO2 + addq $ 3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 
6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + +.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovsd 1*SIZE(BO1), %xmm0 + vmovups 0*SIZE(BO2), %xmm1 + vmovsd %xmm0, 0*SIZE(BO) + vmovups %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + prefetcht0 (CO1) + prefetcht0 (CO1,LDC,1) + prefetcht0 (CO1,LDC,2) + prefetcht0 64(CO1) + prefetcht0 64(CO1,LDC,1) + prefetcht0 64(CO1,LDC,2) + + vzeroall + + movq K, %rax + + sarq $1, %rax // K / 8 + je .L6_16 + + ALIGN_5 + +.L6_12: +/* + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) +*/ + KERNEL16x3_SUBN + KERNEL16x3_SUBN +/* + KERNEL16x3_SUBN + KERNEL16x3_SUBN + + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN +*/ + dec %rax + jne .L6_12 + +.L6_16: + movq K, %rax + + andq $1, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUBN + + dec %rax + jne .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, 
BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_20_6 + + ALIGN_4 + +.L6_20_2: + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + dec %rax + jne .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUBN + + dec %rax + jne .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_26 + + ALIGN_4 + +.L6_22: + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + dec %rax + jne .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUBN + + dec %rax + jne .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_36 + ALIGN_4 + +.L6_32: + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + dec %rax + jne .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUBN + + dec %rax + jne .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x3 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3,%rax + je .L6_46 + + 
ALIGN_4 + +.L6_42: + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + dec %rax + jne .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUBN + + dec %rax + jne .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + prefetcht0 (CO1) + prefetcht0 (CO1,LDC,1) + prefetcht0 (CO1,LDC,2) + prefetcht0 64(CO1) + prefetcht0 64(CO1,LDC,1) + prefetcht0 64(CO1,LDC,2) + + vzeroall + + movq K, %rax + + sarq $3, %rax // K / 8 + je .L7_16 + ALIGN_5 + +.L7_12: +/* + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) +*/ + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + dec %rax + jne .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + ALIGN_5 + +.L7_17: + + KERNEL16x3_SUBN + + dec %rax + jne .L7_17 + + +.L7_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // first buffer to BO + addq 
$12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_20_6 + + ALIGN_4 + +.L7_20_2: + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + dec %rax + jne .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUBN + + dec %rax + jne .L7_20_7 + ALIGN_4 + +.L7_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_26 + + ALIGN_4 + +.L7_22: + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + dec %rax + jne .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUBN + + dec %rax + jne .L7_27 + ALIGN_4 + + +.L7_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_36 + + ALIGN_4 + +.L7_32: + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + dec %rax + jne .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUBN + + dec %rax + jne .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x3 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, 
%rax + je .L7_46 + + ALIGN_4 + +.L7_42: + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + dec %rax + jne .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUBN + + dec %rax + jne .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm1 + vmovups 4*SIZE(BO1), %xmm2 + vmovups 6*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovups %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + 
salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI,8) + 
KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 
+ +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + 
ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + + addq $16 
* SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + 
KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1 + 
KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + 
movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm1 + vmovups 4*SIZE(BO1), %xmm2 + vmovups 6*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovups %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 
* SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI 
= BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + 
prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef 
LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + 
movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && 
defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq 
KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + 
addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) 
&& !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * 
SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + 
KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + 
KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/dgemm_kernel_4x4_haswell.S b/kernel/x86_64/dgemm_kernel_4x4_haswell.S index 0a2ca7ae37..29501df8e3 100644 --- a/kernel/x86_64/dgemm_kernel_4x4_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x4_haswell.S @@ -1,3494 +1,3494 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. 
Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************************/ - - -/********************************************************************* -* 2013/10/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK - -* -* -* 2013/10/27 Saar -* Parameter: -* DGEMM_DEFAULT_UNROLL_N 4 -* DGEMM_DEFAULT_UNROLL_M 4 -* DGEMM_DEFAULT_P 512 -* DGEMM_DEFAULT_Q 256 -* A_PR1 512 -* B_PR1 512 -* -* -* Performance at 9216x9216x9216: -* 1 thread: 53.3 GFLOPS (MKL: 54) -* 2 threads: 100.0 GFLOPS (MKL: 97) -* 3 threads: 147.0 GFLOPS (MKL: 133) -* 4 threads: 184.0 GFLOPS (MKL: 170) -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 -#define BO3 %rbp - -#ifndef WINDOWS_ABI - 
-#define STACKSIZE 96 -#define L_BUFFER_SIZE 256*8*12+4096 - -#else - -#define STACKSIZE 256 -#define L_BUFFER_SIZE 128*8*12+512 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - - -#define Ndiv12 24(%rsp) -#define Nmod12 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $ 0, 4096 * 4(%rsp);\ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $ 0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#define A_PR1 512 -#define B_PR1 512 - -/******************************************************************************************* -* Macro definitions -*******************************************************************************************/ - -.macro INIT4x12 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - vxorpd %ymm8 , %ymm8 , %ymm8 - vxorpd %ymm9 , %ymm9 , %ymm9 - vxorpd %ymm10, %ymm10, %ymm10 - vxorpd %ymm11, %ymm11, %ymm11 - vxorpd %ymm12, %ymm12, %ymm12 - vxorpd %ymm13, %ymm13, %ymm13 - vxorpd %ymm14, %ymm14, %ymm14 - vxorpd %ymm15, %ymm15, %ymm15 - -.endm - -.macro KERNEL4x12_I - prefetcht0 A_PR1(AO) - vmovups -12 * SIZE(BO), %ymm1 - prefetcht0 B_PR1(BO) - vmovups -16 * SIZE(AO), %ymm0 - prefetcht0 B_PR1+64(BO) - vmovups -8 * SIZE(BO), %ymm2 - prefetcht0 B_PR1+128(BO) - vmovups -4 
* SIZE(BO), %ymm3 - vmulpd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+192(BO) - vmulpd %ymm0 ,%ymm2 , %ymm8 - vmulpd %ymm0 ,%ymm3 , %ymm12 - prefetcht0 B_PR1+256(BO) - vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm5 - vmulpd %ymm0 ,%ymm2 , %ymm9 - vmulpd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm6 - vmulpd %ymm0 ,%ymm2 , %ymm10 - - addq $ 12*SIZE, BO - vmulpd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vmulpd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - vmulpd %ymm0 ,%ymm3 , %ymm15 - vmovups -4 * SIZE(BO), %ymm3 - -.endm - -.macro KERNEL4x12_M1 - prefetcht0 A_PR1(AO) - vmovups -16 * SIZE(AO), %ymm0 - prefetcht0 B_PR1(BO) - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+64(BO) - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - prefetcht0 B_PR1+128(BO) - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - vmovups -4 * SIZE(BO), %ymm3 - -.endm - -.macro KERNEL4x12_M2 - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups 0 * SIZE(BO), %ymm1 
- vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups 4 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - vmovups 8 * SIZE(BO), %ymm3 - addq $ 24*SIZE, BO -.endm - - -.macro KERNEL4x12_E - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - addq $ 12*SIZE, BO -.endm - -.macro KERNEL4x12_SUB - vmovups -12 * SIZE(BO), %ymm1 - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vmovups -4 * SIZE(BO), %ymm3 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - addq $ 12*SIZE, BO - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - addq $ 4*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - -.endm - - -.macro SAVE4x12 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm7 , %ymm7 - - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm9 , %ymm9 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm11, %ymm11 - - vmulpd %ymm0 , %ymm12, %ymm12 - vmulpd %ymm0 , %ymm13, %ymm13 - vmulpd %ymm0 , %ymm14, %ymm14 - vmulpd %ymm0 , %ymm15, %ymm15 - - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , 
%ymm7, %ymm7 - - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - prefetcht0 32(CO1) - prefetcht0 32(CO1,LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - - vpermpd $ 0xb1 , %ymm9 , %ymm9 - vpermpd $ 0xb1 , %ymm11, %ymm11 - - vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 - vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 - vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 - vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) - - vpermpd $ 0xb1 , %ymm13, %ymm13 - vpermpd $ 0xb1 , %ymm15, %ymm15 - - vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 - vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 - vblendpd 
$ 0x0a, %ymm15, %ymm14, %ymm2 - vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ - -.macro INIT2x12 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, %xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 - -.endm - -.macro KERNEL2x12_SUB - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -12 * SIZE(BO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm2 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -9 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -8 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vmovddup -7 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm7 - vmovddup -6 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm8 - vmovddup -5 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm9 - vmovddup -4 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm10 - vmovddup -3 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm11 
- vmovddup -2 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm12 - vmovddup -1 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm13 - addq $ 12*SIZE, BO - vfmadd231pd %xmm0 ,%xmm2 , %xmm14 - addq $ 2*SIZE, AO - vfmadd231pd %xmm0 ,%xmm3 , %xmm15 - -.endm - -.macro SAVE2x12 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - vmulpd %xmm0 , %xmm8 , %xmm8 - vmulpd %xmm0 , %xmm9 , %xmm9 - vmulpd %xmm0 , %xmm10, %xmm10 - vmulpd %xmm0 , %xmm11, %xmm11 - - vmulpd %xmm0 , %xmm12, %xmm12 - vmulpd %xmm0 , %xmm13, %xmm13 - vmulpd %xmm0 , %xmm14, %xmm14 - vmulpd %xmm0 , %xmm15, %xmm15 - - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm8 , %xmm4 - vaddpd (%rax, LDC), %xmm9 , %xmm5 - vaddpd (%rbp), %xmm10, %xmm6 - vaddpd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm12, %xmm4 - vaddpd (%rax, LDC), %xmm13, %xmm5 - vaddpd (%rbp), %xmm14, %xmm6 - vaddpd (%rbp, LDC), %xmm15, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ - -.macro INIT1x12 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - 
vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, %xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 - -.endm - -.macro KERNEL1x12_SUB - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -11 * SIZE(BO), %xmm2 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -9 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -8 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - vmovsd -7 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm7 - vmovsd -6 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm8 - vmovsd -5 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm9 - vmovsd -4 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm10 - vmovsd -3 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm11 - vmovsd -2 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm12 - vmovsd -1 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm13 - addq $ 12*SIZE, BO - vfmadd231sd %xmm0 ,%xmm2 , %xmm14 - addq $ 1*SIZE, AO - vfmadd231sd %xmm0 ,%xmm3 , %xmm15 - -.endm - -.macro SAVE1x12 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm9 , %xmm9 - vmulsd %xmm0 , %xmm10, %xmm10 - vmulsd %xmm0 , %xmm11, %xmm11 - - vmulsd %xmm0 , %xmm12, %xmm12 - vmulsd %xmm0 , %xmm13, %xmm13 - vmulsd %xmm0 , %xmm14, %xmm14 - vmulsd %xmm0 , %xmm15, %xmm15 - - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm8 , %xmm4 - vaddsd (%rax, 
LDC), %xmm9 , %xmm5 - vaddsd (%rbp), %xmm10, %xmm6 - vaddsd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm12, %xmm4 - vaddsd (%rax, LDC), %xmm13, %xmm5 - vaddsd (%rbp), %xmm14, %xmm6 - vaddsd (%rbp, LDC), %xmm15, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - addq $ 1*SIZE, CO1 -.endm - - - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x4 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - -.endm - -.macro KERNEL4x4_I - prefetcht0 A_PR1(AO) - vmovups -12 * SIZE(BO), %ymm1 - vmovups -16 * SIZE(AO), %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm5 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm6 - - addq $ 4*SIZE, BO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - -.endm - -.macro KERNEL4x4_M1 - prefetcht0 A_PR1(AO) - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - -.endm - -.macro KERNEL4x4_M2 - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -8 * 
SIZE(BO), %ymm1 - addq $ 8*SIZE, BO -.endm - - -.macro KERNEL4x4_E - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - addq $ 4*SIZE, BO -.endm - -.macro KERNEL4x4_SUB - vmovups -12 * SIZE(BO), %ymm1 - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - addq $ 4*SIZE, BO - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - addq $ 4*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - -.endm - -.macro SAVE4x4 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 - - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x4 - - 
vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL2x4_SUB - vmovddup -12 * SIZE(BO), %xmm1 - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -11 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -9 * SIZE(BO), %xmm8 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - addq $ 4*SIZE, BO - vfmadd231pd %xmm0 ,%xmm8 , %xmm7 - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x4 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - leaq (CO1, LDC, 2), %rax - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - addq $ 2*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x4 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL1x4_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -9 * SIZE(BO), %xmm8 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - addq $ 4*SIZE, BO - vfmadd231sd %xmm0 ,%xmm8 , %xmm7 - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x4 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - leaq (CO1, LDC, 2), %rax - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd 
(CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - addq $ 1*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL4x2_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vmovups -14 * SIZE(AO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - vfmadd231pd %xmm1 ,%xmm2 , %xmm5 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vfmadd231pd %xmm1 ,%xmm3 , %xmm7 - addq $ 2*SIZE, BO - addq $ 4*SIZE, AO - -.endm - - -.macro SAVE4x2 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 - vaddpd (CO1, LDC), %xmm6, %xmm6 - vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , 2 * SIZE(CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm7 , 2 * SIZE(CO1, LDC) - - addq $ 4*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm6 , %xmm6 , %xmm6 - -.endm - - -.macro KERNEL2x2_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -11 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - addq $ 2*SIZE, BO - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x2 - - vmovddup 
ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm6 , %xmm6 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm6, %xmm6 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - - addq $ 2*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - -.endm - - -.macro KERNEL1x2_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - addq $ 2*SIZE, BO - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x2 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - - addq $ 1*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x1 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - -.endm - - -.macro KERNEL4x1 - - vbroadcastsd -12 * SIZE(BO), %ymm0 - vbroadcastsd -11 * SIZE(BO), %ymm1 - vbroadcastsd -10 * SIZE(BO), %ymm2 - vbroadcastsd -9 * SIZE(BO), %ymm3 - - vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 - vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 - - vbroadcastsd -8 * SIZE(BO), %ymm0 - vbroadcastsd -7 * SIZE(BO), %ymm1 - - vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6 - vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 - - vbroadcastsd -6 * SIZE(BO), %ymm2 - vbroadcastsd -5 * SIZE(BO), %ymm3 - - vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 - vfmadd231pd 4 * SIZE(AO) 
,%ymm1 , %ymm5 - vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 - vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 - - addq $ 8 *SIZE, BO - addq $ 32*SIZE, AO - -.endm - - -.macro KERNEL4x1_SUB - vbroadcastsd -12 * SIZE(BO), %ymm2 - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm2 , %ymm4 - addq $ 1*SIZE, BO - addq $ 4*SIZE, AO - -.endm - - -.macro SAVE4x1 - - vbroadcastsd ALPHA, %ymm0 - - vaddpd %ymm4,%ymm5, %ymm4 - vaddpd %ymm6,%ymm7, %ymm6 - vaddpd %ymm4,%ymm6, %ymm4 - - vmulpd %ymm0 , %ymm4 , %ymm4 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %ymm4, %ymm4 - -#endif - - vmovups %ymm4 , (CO1) - - addq $ 4*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x1 - - vxorpd %xmm4 , %xmm4 , %xmm4 - -.endm - - -.macro KERNEL2x1_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - addq $ 1*SIZE, BO - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x1 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - -#endif - - vmovups %xmm4 , (CO1) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x1 - - vxorpd %xmm4 , %xmm4 , %xmm4 - -.endm - - -.macro KERNEL1x1_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - addq $ 1*SIZE, BO - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x1 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - -#endif - - vmovsd %xmm4 , (CO1) - - addq $ 1*SIZE, CO1 -.endm - - -/*******************************************************************************************/ - -#if 
!defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovups %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $12, %rdi - divq %rdi // N / 12 - movq %rax, Ndiv12 // N / 12 - movq %rdx, Nmod12 // N % 12 - - - movq Ndiv12, J - cmpq $ 0, J - je .L4_0 - ALIGN_4 - -.L12_01: - // copy to sub buffer - movq K, %rax - salq $2,%rax // K * 4 ; read 2 values - movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - leaq (BO2,%rax, SIZE), BO3 // next offset to BO2 - - - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $1 , %rax // K / 2 - jz .L12_01a_2 - ALIGN_4 - -.L12_01a_1: - - prefetcht0 512(BO1) - prefetcht0 512(BO2) - prefetcht0 512(BO3) - prefetchw 512(BO) - - - vmovups 0 * SIZE(BO1), %ymm1 - vmovups 4 * SIZE(BO1), %ymm5 - vmovups 0 * SIZE(BO2), %ymm2 - vmovups 4 * SIZE(BO2), %ymm6 - vmovups 0 * SIZE(BO3), %ymm3 - vmovups 4 * SIZE(BO3), %ymm7 - - vmovups %ymm1, 0 * SIZE(BO) - vmovups %ymm2, 4 * SIZE(BO) - vmovups %ymm3, 8 * SIZE(BO) - - vmovups %ymm5, 12 * SIZE(BO) - 
vmovups %ymm6, 16 * SIZE(BO) - vmovups %ymm7, 20 * SIZE(BO) - - addq $ 8 * SIZE ,BO1 - addq $ 8 * SIZE ,BO2 - addq $ 8 * SIZE ,BO3 - addq $ 24 *SIZE ,BO - - decq %rax - jnz .L12_01a_1 - - - -.L12_01a_2: - - movq K, %rax - andq $1, %rax // K % 2 - jz .L12_03c - ALIGN_4 - - -.L12_02b: - - vmovups 0 * SIZE(BO1), %ymm1 - vmovups 0 * SIZE(BO2), %ymm2 - vmovups 0 * SIZE(BO3), %ymm3 - vmovups %ymm1, 0 * SIZE(BO) - vmovups %ymm2, 4 * SIZE(BO) - vmovups %ymm3, 8 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 4*SIZE,BO3 - addq $ 12*SIZE,BO - decq %rax - jnz .L12_02b - -.L12_03c: - - movq BO3, B // next offset of B - -.L12_10: - movq C, CO1 - leaq (C, LDC, 8), C - leaq (C, LDC, 4), C // c += 12 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L12_20 - - ALIGN_4 - -.L12_11: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - - jl .L12_13 - - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - subq $2, %rax - je .L12_12a - - ALIGN_5 -.L12_12: - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - dec %rax - jne .L12_12 - -.L12_12a: - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L12_16 - - -.L12_13: - - test $1, %rax - jz .L12_14 - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L12_16 - - -.L12_14: - - INIT4x12 - - -.L12_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_19 - - ALIGN_4 - -.L12_17: - - KERNEL4x12_SUB - - dec %rax - jne .L12_17 - ALIGN_4 - - -.L12_19: - - SAVE4x12 - - decq I # i -- - jne .L12_11 - ALIGN_4 - 
-/************************************************************************** -* Rest of M -***************************************************************************/ -.L12_20: - // Test rest of M - - testq $3, M - jz .L12_100 // to next 16 lines of N - - -.L12_30: - testq $2, M - jz .L12_40 - - ALIGN_4 - -.L12_31: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x12 - - movq K, %rax - - sarq $3, %rax - je .L12_36 - ALIGN_4 - -.L12_32: - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - dec %rax - jne .L12_32 - ALIGN_4 - -.L12_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_39 - - ALIGN_4 - -.L12_37: - - KERNEL2x12_SUB - - dec %rax - jne .L12_37 - ALIGN_4 - - -.L12_39: - - SAVE2x12 - - ALIGN_4 - -.L12_40: - testq $1, M - jz .L12_100 // to next 3 lines of N - - ALIGN_4 - -.L12_41: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x12 - - movq K, %rax - - sarq $3,%rax - je .L12_46 - - ALIGN_4 - -.L12_42: - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - - dec %rax - jne .L12_42 - ALIGN_4 - -.L12_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_49 - - ALIGN_4 - -.L12_47: - - KERNEL1x12_SUB - - dec %rax - jne .L12_47 - ALIGN_4 - - -.L12_49: - - SAVE1x12 - - ALIGN_4 - -.L12_100: - - decq J // j -- - jg .L12_01 - - -.L4_0: - - cmpq $ 0, Nmod12 // N % 12 == 0 - je .L999 - - movq Nmod12, J - sarq $2, J // j = j / 4 - je .L2_0 - -.L4_10: - movq C, CO1 - leaq (C, LDC, 4), C // c += 4 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L4_20 - - ALIGN_4 - -.L4_11: - movq B, BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L4_13 - - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - 
KERNEL4x4_M1 - KERNEL4x4_M2 - - subq $2, %rax - je .L4_12a - - ALIGN_5 - -.L4_12: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - dec %rax - jne .L4_12 - -.L4_12a: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_13: - - test $1, %rax - jz .L4_14 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_14: - - INIT4x4 - - -.L4_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_19 - - ALIGN_4 - -.L4_17: - - KERNEL4x4_SUB - - dec %rax - jne .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE4x4 - - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $3, M - jz .L4_100 // to next 16 lines of N - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x4 - - movq K, %rax - - sarq $3, %rax - je .L4_36 - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - dec %rax - jne .L4_32 - ALIGN_4 - -.L4_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_39 - - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - dec %rax - jne .L4_37 - - -.L4_39: - - SAVE2x4 - -.L4_40: - testq $1, M - jz .L4_100 // to next 3 lines of N - - ALIGN_4 - -.L4_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x4 - - movq K, %rax - - sarq $3,%rax - je .L4_46 - - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - dec %rax - jne .L4_42 - ALIGN_4 - -.L4_46: - movq K, %rax - 
- andq $7, %rax # if (k & 1) - je .L4_49 - - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - dec %rax - jne .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - - ALIGN_4 - -.L4_100: - - movq K, %rax - salq $2, %rax // * 4 - leaq (B , %rax, SIZE), B - decq J // j -- - jg .L4_10 - - - - -/***************************************************************************************************************/ - -.L2_0: - - movq Nmod12, J - testq $2, J - je .L1_0 - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L2_20 - - ALIGN_4 - -.L2_11: - movq B, BO - addq $12 * SIZE, BO - - INIT4x2 - - movq K, %rax - sarq $3, %rax // K / 8 - - je .L2_16 - - ALIGN_5 - -.L2_12: - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - dec %rax - jne .L2_12 - - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB - - dec %rax - jne .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE4x2 - - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $3, M - jz .L2_100 // to next 16 lines of N - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x2 - - movq K, %rax - - sarq $3, %rax - je .L2_36 - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - dec %rax - jne .L2_32 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - dec %rax - jne .L2_37 - - -.L2_39: - - SAVE2x2 - -.L2_40: - testq $1, M - jz .L2_100 // to next 3 lines of N - -.L2_41: - movq B, BO // first 
buffer to BO - addq $12 * SIZE, BO - - INIT1x2 - - movq K, %rax - - sarq $3,%rax - je .L2_46 - - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - dec %rax - jne .L2_42 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - dec %rax - jne .L2_47 - -.L2_49: - - SAVE1x2 - -.L2_100: - - movq K, %rax - salq $1, %rax // * 2 - leaq (B , %rax, SIZE), B - -/***************************************************************************************************************/ - -.L1_0: - - movq Nmod12, J - testq $1, J - je .L999 - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L1_20 - - ALIGN_4 - -.L1_11: - movq B, BO - addq $12 * SIZE, BO - - INIT4x1 - - movq K, %rax - - sarq $3, %rax // K / 8 - je .L1_16 - - ALIGN_5 - -.L1_12: - - KERNEL4x1 - - dec %rax - jne .L1_12 - - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB - - dec %rax - jne .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE4x1 - - decq I # i -- - jg .L1_11 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $3, M - jz .L1_100 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x1 - - movq K, %rax - - sarq $3, %rax - je .L1_36 - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - - dec %rax - jne .L1_32 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - dec %rax - jne .L1_37 - -.L1_39: - - SAVE2x1 - 
-.L1_40: - testq $1, M - jz .L1_100 // to next 3 lines of N - - -.L1_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x1 - - movq K, %rax - - sarq $3,%rax - je .L1_46 - - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - dec %rax - jne .L1_42 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - dec %rax - jne .L1_47 - - -.L1_49: - - SAVE1x1 - -.L1_100: - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - 
vmovsd OLD_OFFSET, %xmm12 -#endif - vmovups %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $4, %rdi - divq %rdi // N / 4 - movq %rax, Ndiv12 // N / 4 - movq %rdx, Nmod12 // N % 4 - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - - - movq Ndiv12, J - cmpq $ 0, J - je .L2_0 - ALIGN_4 - -.L4_10: - movq C, CO1 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L4_20 - - ALIGN_4 - -.L4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L4_13 - - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - subq $2, %rax - je .L4_12a - - 
ALIGN_5 - -.L4_12: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - dec %rax - jne .L4_12 - -.L4_12a: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_13: - - test $1, %rax - jz .L4_14 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_14: - - INIT4x4 - - -.L4_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_19 - - ALIGN_4 - -.L4_17: - - KERNEL4x4_SUB - - dec %rax - jne .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $3, M - jz .L4_100 // to next 16 lines of N - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - 
movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x4 - - sarq $3, %rax - je .L4_36 - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - dec %rax - jne .L4_32 - ALIGN_4 - -.L4_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_39 - - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - dec %rax - jne .L4_37 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L4_40: - testq $1, M - jz .L4_100 // to next 3 lines of N - - ALIGN_4 - -.L4_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x4 - - sarq $3,%rax - je .L4_46 - - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - dec %rax - jne .L4_42 - 
ALIGN_4 - -.L4_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_49 - - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - dec %rax - jne .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - -.L4_100: - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK // number of values in B -#endif - - - movq K, %rax - salq $2, %rax // * 4 - leaq (B , %rax, SIZE), B - decq J // j -- - jg .L4_10 - - - - -/***************************************************************************************************************/ - -.L2_0: - - movq Nmod12, J - testq $2, J - je .L1_0 - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT4x2 - - sarq $3, %rax // K / 8 - 
- je .L2_16 - - ALIGN_5 - -.L2_12: - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - dec %rax - jne .L2_12 - - -.L2_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB - - dec %rax - jne .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $3, M - jz .L2_100 // to next 16 lines of N - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x2 - - sarq $3, %rax - je .L2_36 - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - 
KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - dec %rax - jne .L2_32 - -.L2_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - dec %rax - jne .L2_37 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L2_40: - testq $1, M - jz .L2_100 // to next 3 lines of N - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x2 - - sarq $3,%rax - je .L2_46 - - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - dec %rax - jne .L2_42 - -.L2_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - dec %rax - jne .L2_47 - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * 
SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - - -.L2_100: - - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK // number of values in B -#endif - - movq K, %rax - salq $1, %rax // * 2 - leaq (B , %rax, SIZE), B - -/***************************************************************************************************************/ - -.L1_0: - - movq Nmod12, J - testq $1, J - je .L999 - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L1_20 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT4x1 - - sarq $3, %rax // K / 8 - je .L1_16 - - ALIGN_5 - -.L1_12: - - KERNEL4x1 - - dec %rax - jne .L1_12 - - -.L1_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB - - dec %rax - jne .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, 
%rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - - decq I # i -- - jg .L1_11 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $3, M - jz .L1_100 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x1 - - sarq $3, %rax - je .L1_36 - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - - dec %rax - jne .L1_32 - -.L1_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - dec %rax - jne .L1_37 - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && 
defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L1_40: - testq $1, M - jz .L1_100 // to next 3 lines of N - - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x1 - - sarq $3,%rax - je .L1_46 - - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - dec %rax - jne .L1_42 - -.L1_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - dec %rax - jne .L1_47 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - - - -.L1_100: - - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $1, KK // number of values in B -#endif - - - -.L999: - - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - 
vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - -#endif +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + + +/********************************************************************* +* 2013/10/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK + +* +* +* 2013/10/27 Saar +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 4 +* DGEMM_DEFAULT_UNROLL_M 4 +* DGEMM_DEFAULT_P 512 +* DGEMM_DEFAULT_Q 256 +* A_PR1 512 +* B_PR1 512 +* +* +* Performance at 9216x9216x9216: +* 1 thread: 53.3 GFLOPS (MKL: 54) +* 2 threads: 100.0 GFLOPS (MKL: 97) +* 3 threads: 147.0 GFLOPS (MKL: 133) +* 4 threads: 184.0 GFLOPS (MKL: 170) +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 +#define BO3 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 +#define L_BUFFER_SIZE 256*8*12+4096 + +#else + +#define STACKSIZE 256 +#define L_BUFFER_SIZE 128*8*12+512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + + +#define Ndiv12 24(%rsp) +#define Nmod12 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); 
+#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $ 0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#define A_PR1 512 +#define B_PR1 512 + +/******************************************************************************************* +* Macro definitions +*******************************************************************************************/ + +.macro INIT4x12 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + vxorpd %ymm8 , %ymm8 , %ymm8 + vxorpd %ymm9 , %ymm9 , %ymm9 + vxorpd %ymm10, %ymm10, %ymm10 + vxorpd %ymm11, %ymm11, %ymm11 + vxorpd %ymm12, %ymm12, %ymm12 + vxorpd %ymm13, %ymm13, %ymm13 + vxorpd %ymm14, %ymm14, %ymm14 + vxorpd %ymm15, %ymm15, %ymm15 + +.endm + +.macro KERNEL4x12_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 + prefetcht0 B_PR1(BO) + vmovups -16 * SIZE(AO), %ymm0 + prefetcht0 B_PR1+64(BO) + vmovups -8 * SIZE(BO), %ymm2 + prefetcht0 B_PR1+128(BO) + vmovups -4 * SIZE(BO), %ymm3 + vmulpd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+192(BO) + vmulpd %ymm0 ,%ymm2 , %ymm8 + vmulpd %ymm0 ,%ymm3 , %ymm12 + prefetcht0 B_PR1+256(BO) + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm5 + vmulpd %ymm0 ,%ymm2 , %ymm9 + vmulpd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm6 + vmulpd %ymm0 ,%ymm2 , %ymm10 + + addq $ 12*SIZE, BO + vmulpd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vmulpd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vmulpd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M1 + prefetcht0 A_PR1(AO) + vmovups -16 * SIZE(AO), %ymm0 + prefetcht0 B_PR1(BO) + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+64(BO) + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + prefetcht0 B_PR1+128(BO) + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + 
vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M2 + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups 0 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups 4 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups 8 * SIZE(BO), %ymm3 + addq $ 24*SIZE, BO +.endm + + +.macro KERNEL4x12_E + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + addq $ 12*SIZE, BO +.endm + +.macro KERNEL4x12_SUB + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vmovups -8 * SIZE(BO), %ymm2 + 
vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vmovups -4 * SIZE(BO), %ymm3 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + addq $ 12*SIZE, BO + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + addq $ 4*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + +.endm + + +.macro SAVE4x12 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm11, %ymm11 + + vmulpd %ymm0 , %ymm12, %ymm12 + vmulpd %ymm0 , %ymm13, %ymm13 + vmulpd %ymm0 , %ymm14, %ymm14 + vmulpd %ymm0 , %ymm15, %ymm15 + + vpermpd $ 0xb1 , %ymm5, %ymm5 + vpermpd $ 0xb1 , %ymm7, %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + prefetcht0 32(CO1) + prefetcht0 32(CO1,LDC) + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + + vpermpd $ 0xb1 , %ymm9 , %ymm9 + vpermpd $ 0xb1 , %ymm11, %ymm11 + + 
vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 + vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 + vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + prefetcht0 32(%rbp) + prefetcht0 32(%rbp,LDC) + + vpermpd $ 0xb1 , %ymm13, %ymm13 + vpermpd $ 0xb1 , %ymm15, %ymm15 + + vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 + vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 + vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 + vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + prefetcht0 32(%rbp) + prefetcht0 32(%rbp,LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ + +.macro 
INIT2x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL2x12_SUB + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -12 * SIZE(BO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm2 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -9 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -8 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vmovddup -7 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm7 + vmovddup -6 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm8 + vmovddup -5 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm9 + vmovddup -4 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm10 + vmovddup -3 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm11 + vmovddup -2 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm12 + vmovddup -1 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231pd %xmm0 ,%xmm2 , %xmm14 + addq $ 2*SIZE, AO + vfmadd231pd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE2x12 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + vmulpd %xmm0 , %xmm8 , %xmm8 + vmulpd %xmm0 , %xmm9 , %xmm9 + vmulpd %xmm0 , %xmm10, %xmm10 + vmulpd %xmm0 , %xmm11, %xmm11 + + vmulpd %xmm0 , %xmm12, %xmm12 + vmulpd %xmm0 , %xmm13, %xmm13 + vmulpd %xmm0 , %xmm14, %xmm14 + vmulpd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, 
LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm8 , %xmm4 + vaddpd (%rax, LDC), %xmm9 , %xmm5 + vaddpd (%rbp), %xmm10, %xmm6 + vaddpd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm12, %xmm4 + vaddpd (%rax, LDC), %xmm13, %xmm5 + vaddpd (%rbp), %xmm14, %xmm6 + vaddpd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ + +.macro INIT1x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL1x12_SUB + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -11 * SIZE(BO), %xmm2 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -9 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -8 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + vmovsd -7 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm7 + vmovsd -6 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm8 + vmovsd -5 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm9 + vmovsd -4 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm10 + vmovsd -3 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm11 + vmovsd -2 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm12 + 
vmovsd -1 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231sd %xmm0 ,%xmm2 , %xmm14 + addq $ 1*SIZE, AO + vfmadd231sd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE1x12 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm9 , %xmm9 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm11, %xmm11 + + vmulsd %xmm0 , %xmm12, %xmm12 + vmulsd %xmm0 , %xmm13, %xmm13 + vmulsd %xmm0 , %xmm14, %xmm14 + vmulsd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm8 , %xmm4 + vaddsd (%rax, LDC), %xmm9 , %xmm5 + vaddsd (%rbp), %xmm10, %xmm6 + vaddsd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm12, %xmm4 + vaddsd (%rax, LDC), %xmm13, %xmm5 + vaddsd (%rbp), %xmm14, %xmm6 + vaddsd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + addq $ 1*SIZE, CO1 +.endm + + + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x4 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + +.macro 
KERNEL4x4_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm5 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm6 + + addq $ 4*SIZE, BO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M1 + prefetcht0 A_PR1(AO) + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M2 + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $ 8*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -8 * SIZE(BO), %ymm1 + addq $ 8*SIZE, BO +.endm + + +.macro KERNEL4x4_E + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $ 8*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + addq $ 4*SIZE, BO +.endm + +.macro KERNEL4x4_SUB + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + addq $ 4*SIZE, BO + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + addq $ 4*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + +.endm + +.macro SAVE4x4 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + + 
vpermpd $ 0xb1 , %ymm5, %ymm5 + vpermpd $ 0xb1 , %ymm7, %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL2x4_SUB + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -9 * SIZE(BO), %xmm8 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $ 4*SIZE, BO + vfmadd231pd %xmm0 ,%xmm8 , %xmm7 + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x4 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + 
vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + addq $ 2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL1x4_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -9 * SIZE(BO), %xmm8 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + addq $ 4*SIZE, BO + vfmadd231sd %xmm0 ,%xmm8 , %xmm7 + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x4 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + addq $ 1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL4x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovups -14 * SIZE(AO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm1 ,%xmm2 , %xmm5 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vfmadd231pd %xmm1 ,%xmm3 , %xmm7 + addq $ 2*SIZE, BO + addq $ 4*SIZE, AO + +.endm + + +.macro 
SAVE4x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 + vaddpd (CO1, LDC), %xmm6, %xmm6 + vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , 2 * SIZE(CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm7 , 2 * SIZE(CO1, LDC) + + addq $ 4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm6 , %xmm6 , %xmm6 + +.endm + + +.macro KERNEL2x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $ 2*SIZE, BO + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm6 , %xmm6 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm6, %xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + + addq $ 2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + +.endm + + +.macro KERNEL1x2_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + addq $ 2*SIZE, BO + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + + +#if !defined(TRMMKERNEL) + + vaddsd 
(CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + addq $ 1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x1 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + + +.macro KERNEL4x1 + + vbroadcastsd -12 * SIZE(BO), %ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm1 + vbroadcastsd -10 * SIZE(BO), %ymm2 + vbroadcastsd -9 * SIZE(BO), %ymm3 + + vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 + + vbroadcastsd -8 * SIZE(BO), %ymm0 + vbroadcastsd -7 * SIZE(BO), %ymm1 + + vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 + + vbroadcastsd -6 * SIZE(BO), %ymm2 + vbroadcastsd -5 * SIZE(BO), %ymm3 + + vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 + vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 + + addq $ 8 *SIZE, BO + addq $ 32*SIZE, AO + +.endm + + +.macro KERNEL4x1_SUB + vbroadcastsd -12 * SIZE(BO), %ymm2 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm2 , %ymm4 + addq $ 1*SIZE, BO + addq $ 4*SIZE, AO + +.endm + + +.macro SAVE4x1 + + vbroadcastsd ALPHA, %ymm0 + + vaddpd %ymm4,%ymm5, %ymm4 + vaddpd %ymm6,%ymm7, %ymm6 + vaddpd %ymm4,%ymm6, %ymm4 + + vmulpd %ymm0 , %ymm4 , %ymm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %ymm4, %ymm4 + +#endif + + vmovups %ymm4 , (CO1) + + addq $ 4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL2x1_SUB + vmovddup -12 * SIZE(BO), %xmm2 
+ vmovups -16 * SIZE(AO), %xmm0 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + addq $ 1*SIZE, BO + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x1 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL1x1_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + addq $ 1*SIZE, BO + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + + addq $ 1*SIZE, CO1 +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + 
cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $12, %rdi + divq %rdi // N / 12 + movq %rax, Ndiv12 // N / 12 + movq %rdx, Nmod12 // N % 12 + + + movq Ndiv12, J + cmpq $ 0, J + je .L4_0 + ALIGN_4 + +.L12_01: + // copy to sub buffer + movq K, %rax + salq $2,%rax // K * 4 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq (BO2,%rax, SIZE), BO3 // next offset to BO2 + + + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $1 , %rax // K / 2 + jz .L12_01a_2 + ALIGN_4 + +.L12_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetcht0 512(BO3) + prefetchw 512(BO) + + + vmovups 0 * SIZE(BO1), %ymm1 + vmovups 4 * SIZE(BO1), %ymm5 + vmovups 0 * SIZE(BO2), %ymm2 + vmovups 4 * SIZE(BO2), %ymm6 + vmovups 0 * SIZE(BO3), %ymm3 + vmovups 4 * SIZE(BO3), %ymm7 + + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + + vmovups %ymm5, 12 * SIZE(BO) + vmovups %ymm6, 16 * SIZE(BO) + vmovups %ymm7, 20 * SIZE(BO) + + addq $ 8 * SIZE ,BO1 + addq $ 8 * SIZE ,BO2 + addq $ 8 * SIZE ,BO3 + addq $ 24 *SIZE ,BO + + decq %rax + jnz .L12_01a_1 + + + +.L12_01a_2: + + movq K, %rax + andq $1, %rax // K % 2 + jz .L12_03c + ALIGN_4 + + +.L12_02b: + + vmovups 0 * SIZE(BO1), %ymm1 + vmovups 0 * SIZE(BO2), %ymm2 + vmovups 0 * SIZE(BO3), %ymm3 + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 4*SIZE,BO3 + addq $ 12*SIZE,BO + decq %rax + jnz .L12_02b + +.L12_03c: + + movq BO3, B // next offset of B + +.L12_10: + movq C, CO1 + leaq (C, LDC, 8), C + leaq (C, LDC, 4), C // c += 12 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L12_20 + + ALIGN_4 + +.L12_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K 
/ 8 + cmpq $2, %rax + + jl .L12_13 + + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + subq $2, %rax + je .L12_12a + + ALIGN_5 +.L12_12: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + dec %rax + jne .L12_12 + +.L12_12a: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_13: + + test $1, %rax + jz .L12_14 + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_14: + + INIT4x12 + + +.L12_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_19 + + ALIGN_4 + +.L12_17: + + KERNEL4x12_SUB + + dec %rax + jne .L12_17 + ALIGN_4 + + +.L12_19: + + SAVE4x12 + + decq I # i -- + jne .L12_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L12_20: + // Test rest of M + + testq $3, M + jz .L12_100 // to next 16 lines of N + + +.L12_30: + testq $2, M + jz .L12_40 + + ALIGN_4 + +.L12_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x12 + + movq K, %rax + + sarq $3, %rax + je .L12_36 + ALIGN_4 + +.L12_32: + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + dec %rax + jne .L12_32 + ALIGN_4 + +.L12_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_39 + + ALIGN_4 + +.L12_37: + + KERNEL2x12_SUB + + dec %rax + jne .L12_37 + ALIGN_4 + + +.L12_39: + + SAVE2x12 + + ALIGN_4 + +.L12_40: + testq $1, M + jz .L12_100 // to next 3 lines of N + + ALIGN_4 + +.L12_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x12 + + movq K, %rax + + 
sarq $3,%rax + je .L12_46 + + ALIGN_4 + +.L12_42: + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + + dec %rax + jne .L12_42 + ALIGN_4 + +.L12_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_49 + + ALIGN_4 + +.L12_47: + + KERNEL1x12_SUB + + dec %rax + jne .L12_47 + ALIGN_4 + + +.L12_49: + + SAVE1x12 + + ALIGN_4 + +.L12_100: + + decq J // j -- + jg .L12_01 + + +.L4_0: + + cmpq $ 0, Nmod12 // N % 12 == 0 + je .L999 + + movq Nmod12, J + sarq $2, J // j = j / 4 + je .L2_0 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + movq B, BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz 
.L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x4 + + movq K, %rax + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x4 + + movq K, %rax + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + + ALIGN_4 + +.L4_100: + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + decq J // j -- + jg .L4_10 + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x2 + + movq K, %rax + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + 
KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x2 + + movq K, %rax + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x2 + + movq K, %rax + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +.L2_100: + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x1 + + movq K, %rax + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + 
dec %rax + jne .L1_12 + + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x1 + + movq K, %rax + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x1 + + movq K, %rax + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +.L1_100: + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), 
%xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv12 // N / 4 + movq %rdx, Nmod12 // N % 4 + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + + + movq Ndiv12, J + cmpq $ 0, J + je .L2_0 + ALIGN_4 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + 
je .L4_20 + + ALIGN_4 + +.L4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) 
&& defined(LEFT) + addq $4, KK // number of values in A +#endif + + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x4 + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + 
ALIGN_4 + +.L4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x4 + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + +.L4_100: + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK // number of values in B +#endif + + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + decq J // j -- + jg .L4_10 + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if 
defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x2 + + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz 
.L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x2 + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of 
values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x2 + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + +.L2_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK // number of values in B +#endif + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO 
+#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x1 + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + dec %rax + jne .L1_12 + + +.L1_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x1 + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x1 + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + 
KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + + +.L1_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK // number of values in B +#endif + + + +.L999: + + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 19e32ef2c7..adaa28bbc4 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1,5153 +1,5153 @@ -/********************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. 
-2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 -#define BO3 %rbp - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 -#define L_BUFFER_SIZE 256*8*12+4096 - -#else - -#define STACKSIZE 256 -#define L_BUFFER_SIZE 128*8*12+512 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - - -#define Ndiv12 24(%rsp) -#define Nmod12 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $ 0, 4096 * 4(%rsp);\ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $ 0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#define A_PR1 512 -#define B_PR1 160 -#define BROADCASTKERNEL - -/******************************************************************************************* -* Macro definitions -*******************************************************************************************/ - -.macro INIT4x12 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , 
%ymm7 - vxorpd %ymm8 , %ymm8 , %ymm8 - vxorpd %ymm9 , %ymm9 , %ymm9 - vxorpd %ymm10, %ymm10, %ymm10 - vxorpd %ymm11, %ymm11, %ymm11 - vxorpd %ymm12, %ymm12, %ymm12 - vxorpd %ymm13, %ymm13, %ymm13 - vxorpd %ymm14, %ymm14, %ymm14 - vxorpd %ymm15, %ymm15, %ymm15 - -.endm - -.macro KERNEL4x12_I - prefetcht0 A_PR1(AO) - vmovups -12 * SIZE(BO), %ymm1 - prefetcht0 B_PR1(BO) -# if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -# else - vmovups -16 * SIZE(AO), %ymm0 -# endif - prefetcht0 B_PR1+64(BO) - vmovups -8 * SIZE(BO), %ymm2 - prefetcht0 B_PR1+128(BO) - vmovups -4 * SIZE(BO), %ymm3 - vmulpd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+192(BO) - vmulpd %ymm0 ,%ymm2 , %ymm8 - vmulpd %ymm0 ,%ymm3 , %ymm12 - prefetcht0 B_PR1+256(BO) -# if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vmulpd %ymm0 ,%ymm1 , %ymm5 - vmulpd %ymm0 ,%ymm2 , %ymm9 - vmulpd %ymm0 ,%ymm3 , %ymm13 -# if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -# else - vpermpd $ 0x1b, %ymm0 , %ymm0 -# endif - vmulpd %ymm0 ,%ymm1 , %ymm6 - vmulpd %ymm0 ,%ymm2 , %ymm10 - - addq $ 12*SIZE, BO - vmulpd %ymm0 ,%ymm3 , %ymm14 -# if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vmulpd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vmulpd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - vmulpd %ymm0 ,%ymm3 , %ymm15 - vmovups -4 * SIZE(BO), %ymm3 - -.endm - -.macro KERNEL4x12_M1 - prefetcht0 A_PR1(AO) -# if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -# else - vmovups -16 * SIZE(AO), %ymm0 -# endif - prefetcht0 B_PR1(BO) - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+64(BO) - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - prefetcht0 B_PR1+128(BO) - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 -# if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - 
vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 -# if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -# else - vpermpd $ 0x1b, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 -# if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - vmovups -4 * SIZE(BO), %ymm3 - -.endm - -.macro KERNEL4x12_M2 -# if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -# else - vmovups -12 * SIZE(AO), %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 -# if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 -# if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -# else - vpermpd $ 0x1b, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 -# if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups 0 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups 4 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - vmovups 8 * SIZE(BO), %ymm3 - addq $ 24*SIZE, BO -.endm - - -.macro KERNEL4x12_E -# if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -# else - vmovups -12 * SIZE(AO), %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 -# if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -# else 
- vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 -# if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -# else - vpermpd $ 0x1b, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 -# if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - addq $ 12*SIZE, BO -.endm - -.macro KERNEL4x12_SUB - vmovups -12 * SIZE(BO), %ymm1 -# if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -# else - vmovups -16 * SIZE(AO), %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vmovups -4 * SIZE(BO), %ymm3 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 -# if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - addq $ 12*SIZE, BO - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 -# if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -# else - vpermpd $ 0x1b, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - addq $ 4*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 -# if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - -.endm - - -.macro SAVE4x12 - - prefetcht0 BUFFER1 - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm7 , %ymm7 - prefetcht0 64 + BUFFER1 - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm9 , %ymm9 - 
vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm11, %ymm11 -#if B_PR1 > 32 - prefetcht0 128 + BUFFER1 -#endif - vmulpd %ymm0 , %ymm12, %ymm12 - vmulpd %ymm0 , %ymm13, %ymm13 - vmulpd %ymm0 , %ymm14, %ymm14 - vmulpd %ymm0 , %ymm15, %ymm15 -#if B_PR1 > 96 - prefetcht0 192 + BUFFER1 -#endif - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 - vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 - vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 - vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 -#else - vpermilpd $ 0x05 , %ymm5, %ymm5 - vpermilpd $ 0x05 , %ymm7, %ymm7 -#endif - -#if B_PR1 > 160 - prefetcht0 256 + BUFFER1 -#endif - -#if defined BROADCASTKERNEL - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 -#endif - -#if B_PR1 > 224 - prefetcht0 320 + BUFFER1 -#endif - -#ifndef BROADCASTKERNEL - vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 -#endif - -#if B_PR1 > 288 - prefetcht0 384 + BUFFER1 -#endif - -#ifndef BROADCASTKERNEL - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#endif - -#if B_PR1 > 352 - prefetcht0 448 + BUFFER1 -#endif - leaq (CO1, LDC, 2), %rax - -#if B_PR1 > 416 - prefetcht0 512 + BUFFER1 -#endif - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - prefetcht1 56(CO1) - prefetcht1 56(CO1,LDC) - prefetcht1 56(%rax) - prefetcht1 56(%rax,LDC) - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 - vperm2f128 $ 0x20 , %ymm11, 
%ymm9 , %ymm1 - vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 - vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - vpermilpd $ 0x05 , %ymm9, %ymm9 - vpermilpd $ 0x05 , %ymm11, %ymm11 - - vblendpd $ 0x0a, %ymm9, %ymm8, %ymm0 - vblendpd $ 0x05, %ymm9, %ymm8, %ymm1 - vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 - vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - - vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#endif - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht1 56(%rax) - prefetcht1 56(%rax,LDC) - prefetcht1 56(%rbp) - prefetcht1 56(%rbp,LDC) - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm14, %ymm12 , %ymm0 - vperm2f128 $ 0x20 , %ymm15, %ymm13 , %ymm1 - vperm2f128 $ 0x31 , %ymm14, %ymm12 , %ymm2 - vperm2f128 $ 0x31 , %ymm15, %ymm13 , %ymm3 - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - vpermilpd $ 0x05 , %ymm13, %ymm13 - vpermilpd $ 0x05 , %ymm15, %ymm15 - - vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 - vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 - vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 - vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 - - vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 
-#endif - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht1 56(%rax) - prefetcht1 56(%rax,LDC) - prefetcht1 56(%rbp) - prefetcht1 56(%rbp,LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ - -.macro INIT2x12 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, %xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 - -.endm - -.macro KERNEL2x12_SUB - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -12 * SIZE(BO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm2 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -9 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -8 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vmovddup -7 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm7 - vmovddup -6 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm8 - vmovddup -5 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm9 - vmovddup -4 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm10 - vmovddup -3 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm11 - vmovddup -2 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm12 - vmovddup -1 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm13 - addq $ 12*SIZE, BO - vfmadd231pd %xmm0 ,%xmm2 , %xmm14 - addq $ 2*SIZE, AO - vfmadd231pd %xmm0 ,%xmm3 , %xmm15 - -.endm - -.macro SAVE2x12 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , 
%xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - vmulpd %xmm0 , %xmm8 , %xmm8 - vmulpd %xmm0 , %xmm9 , %xmm9 - vmulpd %xmm0 , %xmm10, %xmm10 - vmulpd %xmm0 , %xmm11, %xmm11 - - vmulpd %xmm0 , %xmm12, %xmm12 - vmulpd %xmm0 , %xmm13, %xmm13 - vmulpd %xmm0 , %xmm14, %xmm14 - vmulpd %xmm0 , %xmm15, %xmm15 - - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm8 , %xmm4 - vaddpd (%rax, LDC), %xmm9 , %xmm5 - vaddpd (%rbp), %xmm10, %xmm6 - vaddpd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm12, %xmm4 - vaddpd (%rax, LDC), %xmm13, %xmm5 - vaddpd (%rbp), %xmm14, %xmm6 - vaddpd (%rbp, LDC), %xmm15, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ - -.macro INIT1x12 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, %xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 - -.endm - -.macro KERNEL1x12_SUB - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -11 * SIZE(BO), %xmm2 - vmovsd 
-10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -9 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -8 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - vmovsd -7 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm7 - vmovsd -6 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm8 - vmovsd -5 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm9 - vmovsd -4 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm10 - vmovsd -3 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm11 - vmovsd -2 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm12 - vmovsd -1 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm13 - addq $ 12*SIZE, BO - vfmadd231sd %xmm0 ,%xmm2 , %xmm14 - addq $ 1*SIZE, AO - vfmadd231sd %xmm0 ,%xmm3 , %xmm15 - -.endm - -.macro SAVE1x12 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm9 , %xmm9 - vmulsd %xmm0 , %xmm10, %xmm10 - vmulsd %xmm0 , %xmm11, %xmm11 - - vmulsd %xmm0 , %xmm12, %xmm12 - vmulsd %xmm0 , %xmm13, %xmm13 - vmulsd %xmm0 , %xmm14, %xmm14 - vmulsd %xmm0 , %xmm15, %xmm15 - - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm8 , %xmm4 - vaddsd (%rax, LDC), %xmm9 , %xmm5 - vaddsd (%rbp), %xmm10, %xmm6 - vaddsd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm12, %xmm4 - vaddsd (%rax, LDC), %xmm13, 
%xmm5 - vaddsd (%rbp), %xmm14, %xmm6 - vaddsd (%rbp, LDC), %xmm15, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - addq $ 1*SIZE, CO1 -.endm - - - - -/******************************************************************************************/ - - -.macro INIT4x8 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - vxorpd %ymm8 , %ymm8 , %ymm8 - vxorpd %ymm9 , %ymm9 , %ymm9 - vxorpd %ymm10, %ymm10, %ymm10 - vxorpd %ymm11, %ymm11, %ymm11 - -.endm - -.macro KERNEL4x8_I - vmovups -12 * SIZE(BO), %ymm1 -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - vmovups -8 * SIZE(BO), %ymm2 - vmulpd %ymm0 ,%ymm1 , %ymm4 - vmulpd %ymm0 ,%ymm2 , %ymm8 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm5 - vmulpd %ymm0 ,%ymm2 , %ymm9 -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm6 - vmulpd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, BO -#if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vmulpd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - -.endm - -.macro KERNEL4x8_M1 - prefetcht0 A_PR1(AO) -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - prefetcht0 B_PR1(BO) - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+64(BO) - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 -#if defined BROADCASTKERNEL - vbroadcastsd -14 * 
SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 -#if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - -.endm - -.macro KERNEL4x8_M2 -#if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -#else - vmovups -12 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 -#if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 -#if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -4 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups 0 * SIZE(BO), %ymm2 - addq $ 16*SIZE, BO -.endm - - -.macro KERNEL4x8_E -#if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -#else - vmovups -12 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 -#if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 -#if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 
,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - addq $ 8*SIZE, BO -.endm - -.macro KERNEL4x8_SUB - vmovups -12 * SIZE(BO), %ymm1 -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - addq $ 8*SIZE, BO -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - addq $ 4*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - -.endm - - -.macro SAVE4x8 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm7 , %ymm7 - - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm9 , %ymm9 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm11, %ymm11 - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 - vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 - vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 - vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - vpermilpd $ 0x05 , %ymm5, %ymm5 - vpermilpd $ 0x05 , %ymm7, %ymm7 - - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - - vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - 
vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#endif - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - prefetcht0 56(CO1) - prefetcht0 56(CO1,LDC) - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 - vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 - vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 - vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - vpermilpd $ 0x05 , %ymm9 , %ymm9 - vpermilpd $ 0x05 , %ymm11, %ymm11 - - vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 - vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 - vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 - vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - - vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#endif - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) - prefetcht0 56(%rbp) - prefetcht0 56(%rbp,LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ - -.macro INIT2x8 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , 
%xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - -.endm - -.macro KERNEL2x8_SUB - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -12 * SIZE(BO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm2 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -9 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -8 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vmovddup -7 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm7 - vmovddup -6 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm8 - vmovddup -5 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm9 - vfmadd231pd %xmm0 ,%xmm1 , %xmm10 - vfmadd231pd %xmm0 ,%xmm2 , %xmm11 - addq $ 8*SIZE, BO - addq $ 2*SIZE, AO - -.endm - -.macro SAVE2x8 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - vmulpd %xmm0 , %xmm8 , %xmm8 - vmulpd %xmm0 , %xmm9 , %xmm9 - vmulpd %xmm0 , %xmm10, %xmm10 - vmulpd %xmm0 , %xmm11, %xmm11 - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm8 , %xmm4 - vaddpd (%rax, LDC), %xmm9 , %xmm5 - vaddpd (%rbp), %xmm10, %xmm6 - vaddpd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ - -.macro INIT1x8 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , 
%xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - -.endm - -.macro KERNEL1x8_SUB - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -11 * SIZE(BO), %xmm2 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -9 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -8 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - vmovsd -7 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm7 - vmovsd -6 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm8 - vmovsd -5 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm9 - vfmadd231sd %xmm0 ,%xmm1 , %xmm10 - vfmadd231sd %xmm0 ,%xmm2 , %xmm11 - addq $ 8*SIZE, BO - addq $ 1*SIZE, AO - -.endm - -.macro SAVE1x8 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm9 , %xmm9 - vmulsd %xmm0 , %xmm10, %xmm10 - vmulsd %xmm0 , %xmm11, %xmm11 - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm8 , %xmm4 - vaddsd (%rax, LDC), %xmm9 , %xmm5 - vaddsd (%rbp), %xmm10, %xmm6 - vaddsd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - addq $ 1*SIZE, CO1 -.endm - - - - - -/******************************************************************************************/ - -.macro INIT4x4 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 
- vxorpd %ymm7 , %ymm7 , %ymm7 - -.endm - -.macro KERNEL4x4_I - prefetcht0 A_PR1(AO) - vmovups -12 * SIZE(BO), %ymm1 -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm4 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm5 -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm6 - - addq $ 4*SIZE, BO -#if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - -.endm - -.macro KERNEL4x4_M1 - prefetcht0 A_PR1(AO) -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 -#if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - -.endm - -.macro KERNEL4x4_M2 -#if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -#else - vmovups -12 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 -#if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 -#if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - addq $ 8*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 
-#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -8 * SIZE(BO), %ymm1 - addq $ 8*SIZE, BO -.endm - - -.macro KERNEL4x4_E -#if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -#else - vmovups -12 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 -#if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 -#if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - addq $ 8*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - addq $ 4*SIZE, BO -.endm - -.macro KERNEL4x4_SUB - vmovups -12 * SIZE(BO), %ymm1 -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - addq $ 4*SIZE, BO -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - addq $ 4*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - -.endm - -.macro SAVE4x4 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 - vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 - vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 - vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd 
%ymm3, %ymm2, %ymm7 -#else - vpermilpd $ 0x05 , %ymm5, %ymm5 - vpermilpd $ 0x05 , %ymm7, %ymm7 - - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - - vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#endif - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x4 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL2x4_SUB - vmovddup -12 * SIZE(BO), %xmm1 - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -11 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -9 * SIZE(BO), %xmm8 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - addq $ 4*SIZE, BO - vfmadd231pd %xmm0 ,%xmm8 , %xmm7 - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x4 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - leaq (CO1, LDC, 2), %rax - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - 
vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - addq $ 2*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x4 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL1x4_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -9 * SIZE(BO), %xmm8 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - addq $ 4*SIZE, BO - vfmadd231sd %xmm0 ,%xmm8 , %xmm7 - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x4 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - leaq (CO1, LDC, 2), %rax - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - addq $ 1*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL4x2_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vmovups -14 * SIZE(AO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - vfmadd231pd %xmm1 ,%xmm2 , %xmm5 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vfmadd231pd %xmm1 ,%xmm3 , %xmm7 - addq $ 2*SIZE, BO - addq $ 4*SIZE, AO - -.endm - - -.macro 
SAVE4x2 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 - vaddpd (CO1, LDC), %xmm6, %xmm6 - vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , 2 * SIZE(CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm7 , 2 * SIZE(CO1, LDC) - - addq $ 4*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm6 , %xmm6 , %xmm6 - -.endm - - -.macro KERNEL2x2_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -11 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - addq $ 2*SIZE, BO - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x2 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm6 , %xmm6 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm6, %xmm6 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - - addq $ 2*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - -.endm - - -.macro KERNEL1x2_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - addq $ 2*SIZE, BO - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x2 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - - -#if !defined(TRMMKERNEL) - - vaddsd 
(CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - - addq $ 1*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x1 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - -.endm - - -.macro KERNEL4x1 - - vbroadcastsd -12 * SIZE(BO), %ymm0 - vbroadcastsd -11 * SIZE(BO), %ymm1 - vbroadcastsd -10 * SIZE(BO), %ymm2 - vbroadcastsd -9 * SIZE(BO), %ymm3 - - vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 - vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 - - vbroadcastsd -8 * SIZE(BO), %ymm0 - vbroadcastsd -7 * SIZE(BO), %ymm1 - - vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6 - vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 - - vbroadcastsd -6 * SIZE(BO), %ymm2 - vbroadcastsd -5 * SIZE(BO), %ymm3 - - vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 - vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 - vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 - vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 - - addq $ 8 *SIZE, BO - addq $ 32*SIZE, AO - -.endm - - -.macro KERNEL4x1_SUB - vbroadcastsd -12 * SIZE(BO), %ymm2 - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm2 , %ymm4 - addq $ 1*SIZE, BO - addq $ 4*SIZE, AO - -.endm - - -.macro SAVE4x1 - - vbroadcastsd ALPHA, %ymm0 - - vaddpd %ymm4,%ymm5, %ymm4 - vaddpd %ymm6,%ymm7, %ymm6 - vaddpd %ymm4,%ymm6, %ymm4 - - vmulpd %ymm0 , %ymm4 , %ymm4 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %ymm4, %ymm4 - -#endif - - vmovups %ymm4 , (CO1) - - addq $ 4*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x1 - - vxorpd %xmm4 , %xmm4 , %xmm4 - -.endm - - -.macro KERNEL2x1_SUB - vmovddup -12 * SIZE(BO), %xmm2 
- vmovups -16 * SIZE(AO), %xmm0 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - addq $ 1*SIZE, BO - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x1 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - -#endif - - vmovups %xmm4 , (CO1) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x1 - - vxorpd %xmm4 , %xmm4 , %xmm4 - -.endm - - -.macro KERNEL1x1_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - addq $ 1*SIZE, BO - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x1 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - -#endif - - vmovsd %xmm4 , (CO1) - - addq $ 1*SIZE, CO1 -.endm - - -.macro PREFETCHT0_C - prefetcht0 (CO1) - prefetcht0 24(CO1) - prefetcht0 (CO1,LDC,4) - prefetcht0 24(CO1,LDC,4) - prefetcht0 (CO1,LDC,8) - prefetcht0 24(CO1,LDC,8) -.endm -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovups %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq 
%rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $24, %rdi - divq %rdi // N / 24 - movq %rax, Ndiv12 // N / 24 - movq %rdx, Nmod12 // N % 24 - - - movq Ndiv12, J - cmpq $ 0, J - je .L8_0 - ALIGN_4 - -.L12_01: - // copy to sub buffer - movq K, %rax - salq $3,%rax // K * 8 ; read 8 values from BO1 - movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - movq BO2 , B - - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - - ALIGN_4 - -.L12_02b: - - vmovups 0 * SIZE(BO1), %ymm1 - vmovups 4 * SIZE(BO1), %ymm2 - vmovups 0 * SIZE(BO2), %ymm3 - vmovups %ymm1, 0 * SIZE(BO) - vmovups %ymm2, 4 * SIZE(BO) - vmovups %ymm3, 8 * SIZE(BO) - addq $ 8*SIZE,BO1 - addq $ 8*SIZE,BO2 - addq $ 12*SIZE,BO - decq %rax - jnz .L12_02b - -.L12_03c: - - -.L12_10: - movq C, CO1 - leaq (C, LDC, 8), C - leaq (C, LDC, 4), C // c += 12 * ldc - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L12_20 - - ALIGN_4 - -.L12_11: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - - jl .L12_13 - - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - subq $2, %rax - je .L12_12a - - ALIGN_5 -.L12_12: - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - dec %rax - jne .L12_12 - -.L12_12a: - prefetcht0 ALPHA - PREFETCHT0_C - addq LDC,CO1 - KERNEL4x12_M1 - PREFETCHT0_C - leaq (CO1,LDC,2),CO1 - KERNEL4x12_M2 - PREFETCHT0_C - subq LDC,CO1 - KERNEL4x12_M1 - PREFETCHT0_C - subq LDC,CO1 - subq LDC,CO1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - 
KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L12_16 - - -.L12_13: - - test $1, %rax - jz .L12_14 - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L12_16 - - -.L12_14: - - INIT4x12 - - -.L12_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_19 - - ALIGN_4 - -.L12_17: - - KERNEL4x12_SUB - - dec %rax - jne .L12_17 - ALIGN_4 - - -.L12_19: - - SAVE4x12 - - /* here for the prefetch of next b source block */ - /* the increment should be proportional to GEMM_Q/GEMM_P */ - - salq $3, K -#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ - prefetcht2 32(B) - prefetcht2 32(B, K, 8) - addq $64, B /* increment */ -#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ - prefetcht2 32(B) - prefetcht2 32(B, K, 8) - prefetcht2 96(B) - prefetcht2 96(B, K, 8) - addq $128, B /* increment */ -#endif - sarq $3, K - - decq I # i -- - jne .L12_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ - - /* recover the original value of pointer B after prefetch */ - movq M, I - sarq $2, I -#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ - salq $6, I -#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ - salq $7, I -#endif - subq I, B - -.L12_20: - // Test rest of M - - testq $3, M - jz .L12_100 // to next 16 lines of N - - -.L12_30: - testq $2, M - jz .L12_40 - - ALIGN_4 - -.L12_31: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x12 - - movq K, %rax - - sarq $3, %rax - je .L12_36 - ALIGN_4 - -.L12_32: - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - dec %rax - jne .L12_32 - ALIGN_4 - -.L12_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_39 - - ALIGN_4 - -.L12_37: - - KERNEL2x12_SUB - - dec %rax - jne .L12_37 - ALIGN_4 - - -.L12_39: - - 
SAVE2x12 - - ALIGN_4 - -.L12_40: - testq $1, M - jz .L12_100 // to next 3 lines of N - - ALIGN_4 - -.L12_41: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x12 - - movq K, %rax - - sarq $3,%rax - je .L12_46 - - ALIGN_4 - -.L12_42: - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - - dec %rax - jne .L12_42 - ALIGN_4 - -.L12_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_49 - - ALIGN_4 - -.L12_47: - - KERNEL1x12_SUB - - dec %rax - jne .L12_47 - ALIGN_4 - - -.L12_49: - - SAVE1x12 - - ALIGN_4 - -.L12_100: - - - -/**************************************************************************************************/ - -.L13_01: - // copy to sub buffer - movq K, %rax - salq $3,%rax // K * 8 ; read 8 values - movq B, BO2 - leaq (B,%rax, SIZE), BO3 // next offset to BO2 - leaq (BO3,%rax, SIZE), B // next offset to B - - - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - - ALIGN_4 - - -.L13_02b: - - vmovups 4 * SIZE(BO2), %ymm1 - vmovups 0 * SIZE(BO3), %ymm2 - vmovups 4 * SIZE(BO3), %ymm3 - vmovups %ymm1, 0 * SIZE(BO) - vmovups %ymm2, 4 * SIZE(BO) - vmovups %ymm3, 8 * SIZE(BO) - addq $ 8*SIZE,BO2 - addq $ 8*SIZE,BO3 - addq $ 12*SIZE,BO - decq %rax - jnz .L13_02b - - - -.L13_10: - movq C, CO1 - leaq (C, LDC, 8), C - leaq (C, LDC, 4), C // c += 12 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L13_20 - - ALIGN_4 - -.L13_11: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - - jl .L13_13 - - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - subq $2, %rax - je .L13_12a - - ALIGN_5 -.L13_12: - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - dec %rax - 
jne .L13_12 - -.L13_12a: - prefetcht0 ALPHA - PREFETCHT0_C - addq LDC,CO1 - KERNEL4x12_M1 - PREFETCHT0_C - leaq (CO1,LDC,2),CO1 - KERNEL4x12_M2 - PREFETCHT0_C - subq LDC,CO1 - KERNEL4x12_M1 - PREFETCHT0_C - subq LDC,CO1 - subq LDC,CO1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L13_16 - -.L13_13: - - test $1, %rax - jz .L13_14 - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L13_16 - - -.L13_14: - - INIT4x12 - - -.L13_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L13_19 - - ALIGN_4 - -.L13_17: - - KERNEL4x12_SUB - - dec %rax - jne .L13_17 - ALIGN_4 - - -.L13_19: - - SAVE4x12 - - /* here for the prefetch of next b source block */ - /* the increment should be proportional to GEMM_Q/GEMM_P */ - - salq $3, K -#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ - prefetcht2 (B) - prefetcht2 (B, K, 8) - addq $64, B /* increment */ -#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ - prefetcht2 (B) - prefetcht2 (B, K, 8) - prefetcht2 64(B) - prefetcht2 64(B, K, 8) - addq $128, B /* increment */ -#endif - sarq $3, K - - decq I # i -- - jne .L13_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ - /* recover the original value of pointer B */ - movq M, I - sarq $2, I -#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ - salq $6, I -#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ - salq $7, I -#endif - subq I, B - -.L13_20: - // Test rest of M - - testq $3, M - jz .L13_100 // to next 16 lines of N - - -.L13_30: - testq $2, M - jz .L13_40 - - ALIGN_4 - -.L13_31: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x12 - - movq K, %rax - - sarq $3, %rax - je .L13_36 - ALIGN_4 - -.L13_32: - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - KERNEL2x12_SUB - 
KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - dec %rax - jne .L13_32 - ALIGN_4 - -.L13_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L13_39 - - ALIGN_4 - -.L13_37: - - KERNEL2x12_SUB - - dec %rax - jne .L13_37 - ALIGN_4 - - -.L13_39: - - SAVE2x12 - - ALIGN_4 - -.L13_40: - testq $1, M - jz .L13_100 // to next 3 lines of N - - ALIGN_4 - -.L13_41: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x12 - - movq K, %rax - - sarq $3,%rax - je .L13_46 - - ALIGN_4 - -.L13_42: - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - - dec %rax - jne .L13_42 - ALIGN_4 - -.L13_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L13_49 - - ALIGN_4 - -.L13_47: - - KERNEL1x12_SUB - - dec %rax - jne .L13_47 - ALIGN_4 - - -.L13_49: - - SAVE1x12 - - ALIGN_4 - -.L13_100: - - decq J // j -- - jg .L12_01 - - - - -/**************************************************************************************************/ - -.L8_0: - - cmpq $ 0, Nmod12 // N % 12 == 0 - je .L999 - - movq Nmod12, J - sarq $3, J // j = j / 8 - je .L4_0 - -.L8_10: - movq C, CO1 - leaq (C, LDC, 8), C // c += 4 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L8_20 - - ALIGN_4 - -.L8_11: - movq B, BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L8_13 - - - KERNEL4x8_I - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - subq $2, %rax - je .L8_12a - - ALIGN_5 - -.L8_12: - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - dec %rax - jne .L8_12 - -.L8_12a: - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_E - - jmp .L8_16 - - -.L8_13: - - test $1, %rax - jz .L8_14 - - KERNEL4x8_I - KERNEL4x8_M2 
- KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_E - - jmp .L8_16 - - -.L8_14: - - INIT4x8 - - -.L8_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L8_19 - - ALIGN_4 - -.L8_17: - - KERNEL4x8_SUB - - dec %rax - jne .L8_17 - ALIGN_4 - - -.L8_19: - - SAVE4x8 - - decq I # i -- - jg .L8_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L8_20: - // Test rest of M - - testq $3, M - jz .L8_100 // to next 16 lines of N - - -.L8_30: - testq $2, M - jz .L8_40 - - ALIGN_4 - -.L8_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x8 - - movq K, %rax - - sarq $3, %rax - je .L8_36 - ALIGN_4 - -.L8_32: - - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - - dec %rax - jne .L8_32 - ALIGN_4 - -.L8_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L8_39 - - ALIGN_4 - -.L8_37: - - KERNEL2x8_SUB - - dec %rax - jne .L8_37 - - -.L8_39: - - SAVE2x8 - -.L8_40: - testq $1, M - jz .L8_100 // to next 3 lines of N - - ALIGN_4 - -.L8_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x8 - - movq K, %rax - - sarq $3,%rax - je .L8_46 - - ALIGN_4 - -.L8_42: - - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - - dec %rax - jne .L8_42 - ALIGN_4 - -.L8_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L8_49 - - ALIGN_4 - -.L8_47: - - KERNEL1x8_SUB - - dec %rax - jne .L8_47 - ALIGN_4 - - -.L8_49: - - SAVE1x8 - - ALIGN_4 - -.L8_100: - - movq K, %rax - salq $3, %rax // * 8 - leaq (B , %rax, SIZE), B - decq J // j -- - jg .L8_10 - - - -/**************************************************************************************************/ - -.L4_0: - - cmpq $ 0, Nmod12 // N % 12 == 0 - je .L999 - - movq Nmod12, J - testq 
$4, J // j = j / 4 - je .L2_0 - -.L4_10: - movq C, CO1 - leaq (C, LDC, 4), C // c += 4 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L4_20 - - ALIGN_4 - -.L4_11: - movq B, BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L4_13 - - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - subq $2, %rax - je .L4_12a - - ALIGN_5 - -.L4_12: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - dec %rax - jne .L4_12 - -.L4_12a: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_13: - - test $1, %rax - jz .L4_14 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_14: - - INIT4x4 - - -.L4_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_19 - - ALIGN_4 - -.L4_17: - - KERNEL4x4_SUB - - dec %rax - jne .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE4x4 - - decq I # i -- - jg .L4_11 - - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $3, M - jz .L4_100 // to next 16 lines of N - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x4 - - movq K, %rax - - sarq $3, %rax - je .L4_36 - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - dec %rax - jne .L4_32 - ALIGN_4 - -.L4_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_39 - - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - dec %rax - jne .L4_37 - - -.L4_39: - 
- SAVE2x4 - -.L4_40: - testq $1, M - jz .L4_100 // to next 3 lines of N - - ALIGN_4 - -.L4_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x4 - - movq K, %rax - - sarq $3,%rax - je .L4_46 - - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - dec %rax - jne .L4_42 - ALIGN_4 - -.L4_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_49 - - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - dec %rax - jne .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - - ALIGN_4 - -.L4_100: - - movq K, %rax - salq $2, %rax // * 4 - leaq (B , %rax, SIZE), B - - - - -/***************************************************************************************************************/ - -.L2_0: - - movq Nmod12, J - testq $2, J - je .L1_0 - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L2_20 - - ALIGN_4 - -.L2_11: - movq B, BO - addq $12 * SIZE, BO - - INIT4x2 - - movq K, %rax - sarq $3, %rax // K / 8 - - je .L2_16 - - ALIGN_5 - -.L2_12: - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - dec %rax - jne .L2_12 - - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB - - dec %rax - jne .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE4x2 - - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $3, M - jz .L2_100 // to next 16 lines of N - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x2 - - movq K, %rax - - sarq $3, %rax - je .L2_36 - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB 
- KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - dec %rax - jne .L2_32 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - dec %rax - jne .L2_37 - - -.L2_39: - - SAVE2x2 - -.L2_40: - testq $1, M - jz .L2_100 // to next 3 lines of N - -.L2_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x2 - - movq K, %rax - - sarq $3,%rax - je .L2_46 - - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - dec %rax - jne .L2_42 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - dec %rax - jne .L2_47 - -.L2_49: - - SAVE1x2 - -.L2_100: - - movq K, %rax - salq $1, %rax // * 2 - leaq (B , %rax, SIZE), B - -/***************************************************************************************************************/ - -.L1_0: - - movq Nmod12, J - testq $1, J - je .L999 - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L1_20 - - ALIGN_4 - -.L1_11: - movq B, BO - addq $12 * SIZE, BO - - INIT4x1 - - movq K, %rax - - sarq $3, %rax // K / 8 - je .L1_16 - - ALIGN_5 - -.L1_12: - - KERNEL4x1 - - dec %rax - jne .L1_12 - - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB - - dec %rax - jne .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE4x1 - - decq I # i -- - jg .L1_11 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $3, M - jz .L1_100 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x1 
- - movq K, %rax - - sarq $3, %rax - je .L1_36 - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - - dec %rax - jne .L1_32 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - dec %rax - jne .L1_37 - -.L1_39: - - SAVE2x1 - -.L1_40: - testq $1, M - jz .L1_100 // to next 3 lines of N - - -.L1_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x1 - - movq K, %rax - - sarq $3,%rax - je .L1_46 - - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - dec %rax - jne .L1_42 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - dec %rax - jne .L1_47 - - -.L1_49: - - SAVE1x1 - -.L1_100: - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups 
%xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovups %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $8, %rdi - divq %rdi // N / 8 - movq %rax, Ndiv12 // N / 8 - movq %rdx, Nmod12 // N % 8 - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -/*************************************************************************************************/ -.L8_0: - movq Ndiv12, J - cmpq $ 0, J - je .L4_0 - ALIGN_4 - -.L8_10: - movq C, CO1 - leaq (C, LDC, 8), C // c += 8 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L8_20 - - ALIGN_4 - -.L8_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,8), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $8, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L8_13 - - - KERNEL4x8_I - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - subq $2, %rax - je .L8_12a - - ALIGN_5 - -.L8_12: - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - dec %rax - jne .L8_12 - -.L8_12a: - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_E - - jmp .L8_16 - - -.L8_13: - - test $1, %rax - jz .L8_14 - - KERNEL4x8_I - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_E - - jmp .L8_16 - - -.L8_14: - - INIT4x8 - - -.L8_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L8_19 - - ALIGN_4 - -.L8_17: - - KERNEL4x8_SUB - - dec %rax - jne .L8_17 - ALIGN_4 - - -.L8_19: - - SAVE4x8 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 8), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - decq I # i -- - jg .L8_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L8_20: - // Test rest of M - - testq $3, M - jz .L8_100 // to next 16 lines of N - - -.L8_30: - testq $2, M - jz .L8_40 - - ALIGN_4 - -.L8_31: - -#if !defined(TRMMKERNEL) || \ - 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,8), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $8, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x8 - - sarq $3, %rax - je .L8_36 - ALIGN_4 - -.L8_32: - - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - - dec %rax - jne .L8_32 - ALIGN_4 - -.L8_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L8_39 - - ALIGN_4 - -.L8_37: - - KERNEL2x8_SUB - - dec %rax - jne .L8_37 - - -.L8_39: - - SAVE2x8 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 8), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L8_40: - testq $1, M - jz .L8_100 // to next 3 lines of N - - ALIGN_4 - -.L8_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,8), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax 
-#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $8, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x8 - - sarq $3,%rax - je .L8_46 - - ALIGN_4 - -.L8_42: - - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - - dec %rax - jne .L8_42 - ALIGN_4 - -.L8_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L8_49 - - ALIGN_4 - -.L8_47: - - KERNEL1x8_SUB - - dec %rax - jne .L8_47 - ALIGN_4 - - -.L8_49: - - SAVE1x8 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 8), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - -.L8_100: - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $8, KK // number of values in B -#endif - - - decq J // j -- - jg .L8_10 - - - - - -/*************************************************************************************************/ -.L4_0: - movq Nmod12, J - testq $4, J - je .L2_0 - ALIGN_4 - -.L4_10: - movq C, CO1 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L4_20 - - ALIGN_4 - -.L4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of 
values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L4_13 - - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - subq $2, %rax - je .L4_12a - - ALIGN_5 - -.L4_12: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - dec %rax - jne .L4_12 - -.L4_12a: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_13: - - test $1, %rax - jz .L4_14 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_14: - - INIT4x4 - - -.L4_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_19 - - ALIGN_4 - -.L4_17: - - KERNEL4x4_SUB - - dec %rax - jne .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $3, M - jz .L4_100 // to next 16 lines of 
N - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x4 - - sarq $3, %rax - je .L4_36 - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - dec %rax - jne .L4_32 - ALIGN_4 - -.L4_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_39 - - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - dec %rax - jne .L4_37 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L4_40: - testq $1, M - jz .L4_100 // to next 3 lines of N - - ALIGN_4 - -.L4_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - 
leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x4 - - sarq $3,%rax - je .L4_46 - - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - dec %rax - jne .L4_42 - ALIGN_4 - -.L4_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_49 - - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - dec %rax - jne .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - -.L4_100: - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK // number of values in B -#endif - - - movq K, %rax - salq $2, %rax // * 4 - leaq (B , %rax, SIZE), B - - - - -/***************************************************************************************************************/ - -.L2_0: - - movq Nmod12, J - testq $2, J - je .L1_0 - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq 
$12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT4x2 - - sarq $3, %rax // K / 8 - - je .L2_16 - - ALIGN_5 - -.L2_12: - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - dec %rax - jne .L2_12 - - -.L2_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB - - dec %rax - jne .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $3, M - jz .L2_100 // to next 16 lines of N - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq 
(BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x2 - - sarq $3, %rax - je .L2_36 - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - dec %rax - jne .L2_32 - -.L2_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - dec %rax - jne .L2_37 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L2_40: - testq $1, M - jz .L2_100 // to next 3 lines of N - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, 
KKK -#endif - - INIT1x2 - - sarq $3,%rax - je .L2_46 - - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - dec %rax - jne .L2_42 - -.L2_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - dec %rax - jne .L2_47 - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - - -.L2_100: - - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK // number of values in B -#endif - - movq K, %rax - salq $1, %rax // * 2 - leaq (B , %rax, SIZE), B - -/***************************************************************************************************************/ - -.L1_0: - - movq Nmod12, J - testq $1, J - je .L999 - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L1_20 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax 
-#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT4x1 - - sarq $3, %rax // K / 8 - je .L1_16 - - ALIGN_5 - -.L1_12: - - KERNEL4x1 - - dec %rax - jne .L1_12 - - -.L1_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB - - dec %rax - jne .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - - decq I # i -- - jg .L1_11 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $3, M - jz .L1_100 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x1 - - sarq $3, %rax - je .L1_36 - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - 
KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - - dec %rax - jne .L1_32 - -.L1_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - dec %rax - jne .L1_37 - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L1_40: - testq $1, M - jz .L1_100 // to next 3 lines of N - - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x1 - - sarq $3,%rax - je .L1_46 - - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - dec %rax - jne .L1_42 - -.L1_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - dec %rax - jne .L1_47 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - 
salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - - - -.L1_100: - - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $1, KK // number of values in B -#endif - - - -.L999: - - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - -#endif +/********************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 +#define BO3 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 +#define L_BUFFER_SIZE 256*8*12+4096 + +#else + +#define STACKSIZE 256 +#define L_BUFFER_SIZE 128*8*12+512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + + +#define Ndiv12 24(%rsp) +#define Nmod12 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define 
STACK_TOUCH \ + movl $ 0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#define A_PR1 512 +#define B_PR1 160 +#define BROADCASTKERNEL + +/******************************************************************************************* +* Macro definitions +*******************************************************************************************/ + +.macro INIT4x12 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + vxorpd %ymm8 , %ymm8 , %ymm8 + vxorpd %ymm9 , %ymm9 , %ymm9 + vxorpd %ymm10, %ymm10, %ymm10 + vxorpd %ymm11, %ymm11, %ymm11 + vxorpd %ymm12, %ymm12, %ymm12 + vxorpd %ymm13, %ymm13, %ymm13 + vxorpd %ymm14, %ymm14, %ymm14 + vxorpd %ymm15, %ymm15, %ymm15 + +.endm + +.macro KERNEL4x12_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 + prefetcht0 B_PR1(BO) +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else + vmovups -16 * SIZE(AO), %ymm0 +# endif + prefetcht0 B_PR1+64(BO) + vmovups -8 * SIZE(BO), %ymm2 + prefetcht0 B_PR1+128(BO) + vmovups -4 * SIZE(BO), %ymm3 + vmulpd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+192(BO) + vmulpd %ymm0 ,%ymm2 , %ymm8 + vmulpd %ymm0 ,%ymm3 , %ymm12 + prefetcht0 B_PR1+256(BO) +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vmulpd %ymm0 ,%ymm1 , %ymm5 + vmulpd %ymm0 ,%ymm2 , %ymm9 + vmulpd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else + vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif + vmulpd %ymm0 ,%ymm1 , %ymm6 + vmulpd %ymm0 ,%ymm2 , %ymm10 + + addq $ 12*SIZE, BO + vmulpd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vmulpd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vmulpd %ymm0 ,%ymm3 , %ymm15 + vmovups 
-4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M1 + prefetcht0 A_PR1(AO) +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else + vmovups -16 * SIZE(AO), %ymm0 +# endif + prefetcht0 B_PR1(BO) + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+64(BO) + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + prefetcht0 B_PR1+128(BO) + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else + vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M2 +# if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +# else + vmovups -12 * SIZE(AO), %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +# else + vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd 
%ymm0 ,%ymm1 , %ymm7 + vmovups 0 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups 4 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups 8 * SIZE(BO), %ymm3 + addq $ 24*SIZE, BO +.endm + + +.macro KERNEL4x12_E +# if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +# else + vmovups -12 * SIZE(AO), %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +# else + vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + addq $ 12*SIZE, BO +.endm + +.macro KERNEL4x12_SUB + vmovups -12 * SIZE(BO), %ymm1 +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else + vmovups -16 * SIZE(AO), %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vmovups -4 * SIZE(BO), %ymm3 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + addq $ 12*SIZE, BO + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else + vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + addq $ 4*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 
, %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + +.endm + + +.macro SAVE4x12 + + prefetcht0 BUFFER1 + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm7 , %ymm7 + prefetcht0 64 + BUFFER1 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm11, %ymm11 +#if B_PR1 > 32 + prefetcht0 128 + BUFFER1 +#endif + vmulpd %ymm0 , %ymm12, %ymm12 + vmulpd %ymm0 , %ymm13, %ymm13 + vmulpd %ymm0 , %ymm14, %ymm14 + vmulpd %ymm0 , %ymm15, %ymm15 +#if B_PR1 > 96 + prefetcht0 192 + BUFFER1 +#endif + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 +#else + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 +#endif + +#if B_PR1 > 160 + prefetcht0 256 + BUFFER1 +#endif + +#if defined BROADCASTKERNEL + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 +#endif + +#if B_PR1 > 224 + prefetcht0 320 + BUFFER1 +#endif + +#ifndef BROADCASTKERNEL + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 +#endif + +#if B_PR1 > 288 + prefetcht0 384 + BUFFER1 +#endif + +#ifndef BROADCASTKERNEL + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + +#if B_PR1 > 352 + prefetcht0 448 + BUFFER1 +#endif + leaq (CO1, LDC, 2), %rax + 
+#if B_PR1 > 416 + prefetcht0 512 + BUFFER1 +#endif + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + prefetcht1 56(CO1) + prefetcht1 56(CO1,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 + vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 + vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 + vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm9, %ymm9 + vpermilpd $ 0x05 , %ymm11, %ymm11 + + vblendpd $ 0x0a, %ymm9, %ymm8, %ymm0 + vblendpd $ 0x05, %ymm9, %ymm8, %ymm1 + vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 + vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 + + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + prefetcht1 56(%rbp) + prefetcht1 56(%rbp,LDC) + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm14, %ymm12 , %ymm0 + vperm2f128 $ 0x20 , %ymm15, %ymm13 , %ymm1 + vperm2f128 $ 0x31 , %ymm14, %ymm12 , %ymm2 + vperm2f128 $ 0x31 , %ymm15, %ymm13 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, 
%ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm13, %ymm13 + vpermilpd $ 0x05 , %ymm15, %ymm15 + + vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 + vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 + vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 + vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 + + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + prefetcht1 56(%rbp) + prefetcht1 56(%rbp,LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ + +.macro INIT2x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL2x12_SUB + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -12 * SIZE(BO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm2 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -9 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -8 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vmovddup -7 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm7 + vmovddup -6 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm8 + vmovddup -5 * SIZE(BO), 
%xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm9 + vmovddup -4 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm10 + vmovddup -3 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm11 + vmovddup -2 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm12 + vmovddup -1 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231pd %xmm0 ,%xmm2 , %xmm14 + addq $ 2*SIZE, AO + vfmadd231pd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE2x12 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + vmulpd %xmm0 , %xmm8 , %xmm8 + vmulpd %xmm0 , %xmm9 , %xmm9 + vmulpd %xmm0 , %xmm10, %xmm10 + vmulpd %xmm0 , %xmm11, %xmm11 + + vmulpd %xmm0 , %xmm12, %xmm12 + vmulpd %xmm0 , %xmm13, %xmm13 + vmulpd %xmm0 , %xmm14, %xmm14 + vmulpd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm8 , %xmm4 + vaddpd (%rax, LDC), %xmm9 , %xmm5 + vaddpd (%rbp), %xmm10, %xmm6 + vaddpd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm12, %xmm4 + vaddpd (%rax, LDC), %xmm13, %xmm5 + vaddpd (%rbp), %xmm14, %xmm6 + vaddpd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ + 
+.macro INIT1x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL1x12_SUB + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -11 * SIZE(BO), %xmm2 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -9 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -8 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + vmovsd -7 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm7 + vmovsd -6 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm8 + vmovsd -5 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm9 + vmovsd -4 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm10 + vmovsd -3 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm11 + vmovsd -2 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm12 + vmovsd -1 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231sd %xmm0 ,%xmm2 , %xmm14 + addq $ 1*SIZE, AO + vfmadd231sd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE1x12 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm9 , %xmm9 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm11, %xmm11 + + vmulsd %xmm0 , %xmm12, %xmm12 + vmulsd %xmm0 , %xmm13, %xmm13 + vmulsd %xmm0 , %xmm14, %xmm14 + vmulsd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , 
(%rax) + vmovsd %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm8 , %xmm4 + vaddsd (%rax, LDC), %xmm9 , %xmm5 + vaddsd (%rbp), %xmm10, %xmm6 + vaddsd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm12, %xmm4 + vaddsd (%rax, LDC), %xmm13, %xmm5 + vaddsd (%rbp), %xmm14, %xmm6 + vaddsd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + addq $ 1*SIZE, CO1 +.endm + + + + +/******************************************************************************************/ + + +.macro INIT4x8 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + vxorpd %ymm8 , %ymm8 , %ymm8 + vxorpd %ymm9 , %ymm9 , %ymm9 + vxorpd %ymm10, %ymm10, %ymm10 + vxorpd %ymm11, %ymm11, %ymm11 + +.endm + +.macro KERNEL4x8_I + vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + vmovups -8 * SIZE(BO), %ymm2 + vmulpd %ymm0 ,%ymm1 , %ymm4 + vmulpd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm5 + vmulpd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm6 + vmulpd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vmulpd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), 
%ymm2 + +.endm + +.macro KERNEL4x8_M1 + prefetcht0 A_PR1(AO) +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + prefetcht0 B_PR1(BO) + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+64(BO) + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + +.endm + +.macro KERNEL4x8_M2 +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else + vmovups -12 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -4 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups 0 * SIZE(BO), %ymm2 + addq $ 16*SIZE, BO +.endm + + +.macro KERNEL4x8_E +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else + vmovups -12 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 
+#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + addq $ 8*SIZE, BO +.endm + +.macro KERNEL4x8_SUB + vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + addq $ 8*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + addq $ 4*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + +.endm + + +.macro SAVE4x8 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm11, %ymm11 + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 + vunpcklpd 
%ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + prefetcht0 56(CO1) + prefetcht0 56(CO1,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 + vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 + vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 + vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm9 , %ymm9 + vpermilpd $ 0x05 , %ymm11, %ymm11 + + vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 + vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 + vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 + + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, 
%ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + prefetcht0 56(%rbp) + prefetcht0 56(%rbp,LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ + +.macro INIT2x8 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + +.endm + +.macro KERNEL2x8_SUB + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -12 * SIZE(BO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm2 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -9 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -8 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vmovddup -7 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm7 + vmovddup -6 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm8 + vmovddup -5 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm9 + vfmadd231pd %xmm0 ,%xmm1 , %xmm10 + vfmadd231pd %xmm0 ,%xmm2 , %xmm11 + addq $ 8*SIZE, BO + addq $ 2*SIZE, AO + +.endm + +.macro SAVE2x8 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + vmulpd %xmm0 , %xmm8 , %xmm8 + vmulpd %xmm0 , %xmm9 , %xmm9 + vmulpd %xmm0 , %xmm10, %xmm10 + vmulpd %xmm0 , %xmm11, %xmm11 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 
2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm8 , %xmm4 + vaddpd (%rax, LDC), %xmm9 , %xmm5 + vaddpd (%rbp), %xmm10, %xmm6 + vaddpd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ + +.macro INIT1x8 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + +.endm + +.macro KERNEL1x8_SUB + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -11 * SIZE(BO), %xmm2 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -9 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -8 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + vmovsd -7 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm7 + vmovsd -6 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm8 + vmovsd -5 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm9 + vfmadd231sd %xmm0 ,%xmm1 , %xmm10 + vfmadd231sd %xmm0 ,%xmm2 , %xmm11 + addq $ 8*SIZE, BO + addq $ 1*SIZE, AO + +.endm + +.macro SAVE1x8 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm9 , %xmm9 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm11, %xmm11 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if 
!defined(TRMMKERNEL) + + vaddsd (%rax), %xmm8 , %xmm4 + vaddsd (%rax, LDC), %xmm9 , %xmm5 + vaddsd (%rbp), %xmm10, %xmm6 + vaddsd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + addq $ 1*SIZE, CO1 +.endm + + + + + +/******************************************************************************************/ + +.macro INIT4x4 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + +.macro KERNEL4x4_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm6 + + addq $ 4*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M1 + prefetcht0 A_PR1(AO) +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M2 +#if defined 
BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else + vmovups -12 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -8 * SIZE(BO), %ymm1 + addq $ 8*SIZE, BO +.endm + + +.macro KERNEL4x4_E +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else + vmovups -12 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + addq $ 4*SIZE, BO +.endm + +.macro KERNEL4x4_SUB + vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + addq $ 4*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + addq $ 4*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + 
vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + +.endm + +.macro SAVE4x4 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL2x4_SUB + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -9 * SIZE(BO), %xmm8 + vfmadd231pd %xmm0 
,%xmm3 , %xmm6 + addq $ 4*SIZE, BO + vfmadd231pd %xmm0 ,%xmm8 , %xmm7 + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x4 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + addq $ 2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL1x4_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -9 * SIZE(BO), %xmm8 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + addq $ 4*SIZE, BO + vfmadd231sd %xmm0 ,%xmm8 , %xmm7 + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x4 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + addq $ 1*SIZE, CO1 +.endm + + +/******************************************************************************************/ 
+/******************************************************************************************/ + +.macro INIT4x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL4x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovups -14 * SIZE(AO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm1 ,%xmm2 , %xmm5 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vfmadd231pd %xmm1 ,%xmm3 , %xmm7 + addq $ 2*SIZE, BO + addq $ 4*SIZE, AO + +.endm + + +.macro SAVE4x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 + vaddpd (CO1, LDC), %xmm6, %xmm6 + vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , 2 * SIZE(CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm7 , 2 * SIZE(CO1, LDC) + + addq $ 4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm6 , %xmm6 , %xmm6 + +.endm + + +.macro KERNEL2x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $ 2*SIZE, BO + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm6 , %xmm6 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm6, %xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + + addq $ 2*SIZE, CO1 +.endm + 
+/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + +.endm + + +.macro KERNEL1x2_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + addq $ 2*SIZE, BO + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + addq $ 1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x1 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + + +.macro KERNEL4x1 + + vbroadcastsd -12 * SIZE(BO), %ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm1 + vbroadcastsd -10 * SIZE(BO), %ymm2 + vbroadcastsd -9 * SIZE(BO), %ymm3 + + vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 + + vbroadcastsd -8 * SIZE(BO), %ymm0 + vbroadcastsd -7 * SIZE(BO), %ymm1 + + vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 + + vbroadcastsd -6 * SIZE(BO), %ymm2 + vbroadcastsd -5 * SIZE(BO), %ymm3 + + vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 + vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 + + addq $ 8 *SIZE, BO + addq $ 32*SIZE, AO + +.endm + + +.macro KERNEL4x1_SUB + vbroadcastsd -12 * SIZE(BO), %ymm2 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 
,%ymm2 , %ymm4 + addq $ 1*SIZE, BO + addq $ 4*SIZE, AO + +.endm + + +.macro SAVE4x1 + + vbroadcastsd ALPHA, %ymm0 + + vaddpd %ymm4,%ymm5, %ymm4 + vaddpd %ymm6,%ymm7, %ymm6 + vaddpd %ymm4,%ymm6, %ymm4 + + vmulpd %ymm0 , %ymm4 , %ymm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %ymm4, %ymm4 + +#endif + + vmovups %ymm4 , (CO1) + + addq $ 4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL2x1_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + addq $ 1*SIZE, BO + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x1 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL1x1_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + addq $ 1*SIZE, BO + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + + addq $ 1*SIZE, CO1 +.endm + + +.macro PREFETCHT0_C + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) +.endm +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq 
%rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $24, %rdi + divq %rdi // N / 24 + movq %rax, Ndiv12 // N / 24 + movq %rdx, Nmod12 // N % 24 + + + movq Ndiv12, J + cmpq $ 0, J + je .L8_0 + ALIGN_4 + +.L12_01: + // copy to sub buffer + movq K, %rax + salq $3,%rax // K * 8 ; read 8 values from BO1 + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + movq BO2 , B + + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + + ALIGN_4 + +.L12_02b: + + vmovups 0 * SIZE(BO1), %ymm1 + vmovups 4 * SIZE(BO1), %ymm2 + vmovups 0 * SIZE(BO2), %ymm3 + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + decq %rax + jnz .L12_02b + +.L12_03c: + + +.L12_10: + movq C, CO1 + leaq (C, LDC, 8), C + leaq (C, LDC, 4), C // c += 12 * ldc + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L12_20 + + ALIGN_4 + +.L12_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + movq K, 
%rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + + jl .L12_13 + + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + subq $2, %rax + je .L12_12a + + ALIGN_5 +.L12_12: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + dec %rax + jne .L12_12 + +.L12_12a: + prefetcht0 ALPHA + PREFETCHT0_C + addq LDC,CO1 + KERNEL4x12_M1 + PREFETCHT0_C + leaq (CO1,LDC,2),CO1 + KERNEL4x12_M2 + PREFETCHT0_C + subq LDC,CO1 + KERNEL4x12_M1 + PREFETCHT0_C + subq LDC,CO1 + subq LDC,CO1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_13: + + test $1, %rax + jz .L12_14 + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_14: + + INIT4x12 + + +.L12_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_19 + + ALIGN_4 + +.L12_17: + + KERNEL4x12_SUB + + dec %rax + jne .L12_17 + ALIGN_4 + + +.L12_19: + + SAVE4x12 + + /* here for the prefetch of next b source block */ + /* the increment should be proportional to GEMM_Q/GEMM_P */ + + salq $3, K +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + prefetcht2 32(B) + prefetcht2 32(B, K, 8) + addq $64, B /* increment */ +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ + prefetcht2 32(B) + prefetcht2 32(B, K, 8) + prefetcht2 96(B) + prefetcht2 96(B, K, 8) + addq $128, B /* increment */ +#endif + sarq $3, K + + decq I # i -- + jne .L12_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ + + /* recover the original value of pointer B after prefetch */ + movq M, I + sarq $2, I +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + salq $6, I +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ + salq $7, 
I +#endif + subq I, B + +.L12_20: + // Test rest of M + + testq $3, M + jz .L12_100 // to next 16 lines of N + + +.L12_30: + testq $2, M + jz .L12_40 + + ALIGN_4 + +.L12_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x12 + + movq K, %rax + + sarq $3, %rax + je .L12_36 + ALIGN_4 + +.L12_32: + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + dec %rax + jne .L12_32 + ALIGN_4 + +.L12_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_39 + + ALIGN_4 + +.L12_37: + + KERNEL2x12_SUB + + dec %rax + jne .L12_37 + ALIGN_4 + + +.L12_39: + + SAVE2x12 + + ALIGN_4 + +.L12_40: + testq $1, M + jz .L12_100 // to next 3 lines of N + + ALIGN_4 + +.L12_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x12 + + movq K, %rax + + sarq $3,%rax + je .L12_46 + + ALIGN_4 + +.L12_42: + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + + dec %rax + jne .L12_42 + ALIGN_4 + +.L12_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_49 + + ALIGN_4 + +.L12_47: + + KERNEL1x12_SUB + + dec %rax + jne .L12_47 + ALIGN_4 + + +.L12_49: + + SAVE1x12 + + ALIGN_4 + +.L12_100: + + + +/**************************************************************************************************/ + +.L13_01: + // copy to sub buffer + movq K, %rax + salq $3,%rax // K * 8 ; read 8 values + movq B, BO2 + leaq (B,%rax, SIZE), BO3 // next offset to BO2 + leaq (BO3,%rax, SIZE), B // next offset to B + + + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + + ALIGN_4 + + +.L13_02b: + + vmovups 4 * SIZE(BO2), %ymm1 + vmovups 0 * SIZE(BO3), %ymm2 + vmovups 4 * SIZE(BO3), %ymm3 + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + addq $ 8*SIZE,BO2 + addq $ 8*SIZE,BO3 + addq $ 12*SIZE,BO + decq %rax + jnz .L13_02b + + + +.L13_10: + movq C, 
CO1 + leaq (C, LDC, 8), C + leaq (C, LDC, 4), C // c += 12 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L13_20 + + ALIGN_4 + +.L13_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + + jl .L13_13 + + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + subq $2, %rax + je .L13_12a + + ALIGN_5 +.L13_12: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + dec %rax + jne .L13_12 + +.L13_12a: + prefetcht0 ALPHA + PREFETCHT0_C + addq LDC,CO1 + KERNEL4x12_M1 + PREFETCHT0_C + leaq (CO1,LDC,2),CO1 + KERNEL4x12_M2 + PREFETCHT0_C + subq LDC,CO1 + KERNEL4x12_M1 + PREFETCHT0_C + subq LDC,CO1 + subq LDC,CO1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L13_16 + +.L13_13: + + test $1, %rax + jz .L13_14 + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L13_16 + + +.L13_14: + + INIT4x12 + + +.L13_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L13_19 + + ALIGN_4 + +.L13_17: + + KERNEL4x12_SUB + + dec %rax + jne .L13_17 + ALIGN_4 + + +.L13_19: + + SAVE4x12 + + /* here for the prefetch of next b source block */ + /* the increment should be proportional to GEMM_Q/GEMM_P */ + + salq $3, K +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + prefetcht2 (B) + prefetcht2 (B, K, 8) + addq $64, B /* increment */ +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ + prefetcht2 (B) + prefetcht2 (B, K, 8) + prefetcht2 64(B) + prefetcht2 64(B, K, 8) + addq $128, B /* increment */ +#endif + sarq $3, K + + decq I # i -- + jne .L13_11 + ALIGN_4 + +/************************************************************************** +* Rest of M 
+***************************************************************************/ + /* recover the original value of pointer B */ + movq M, I + sarq $2, I +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + salq $6, I +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ + salq $7, I +#endif + subq I, B + +.L13_20: + // Test rest of M + + testq $3, M + jz .L13_100 // to next 16 lines of N + + +.L13_30: + testq $2, M + jz .L13_40 + + ALIGN_4 + +.L13_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x12 + + movq K, %rax + + sarq $3, %rax + je .L13_36 + ALIGN_4 + +.L13_32: + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + dec %rax + jne .L13_32 + ALIGN_4 + +.L13_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L13_39 + + ALIGN_4 + +.L13_37: + + KERNEL2x12_SUB + + dec %rax + jne .L13_37 + ALIGN_4 + + +.L13_39: + + SAVE2x12 + + ALIGN_4 + +.L13_40: + testq $1, M + jz .L13_100 // to next 3 lines of N + + ALIGN_4 + +.L13_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x12 + + movq K, %rax + + sarq $3,%rax + je .L13_46 + + ALIGN_4 + +.L13_42: + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + + dec %rax + jne .L13_42 + ALIGN_4 + +.L13_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L13_49 + + ALIGN_4 + +.L13_47: + + KERNEL1x12_SUB + + dec %rax + jne .L13_47 + ALIGN_4 + + +.L13_49: + + SAVE1x12 + + ALIGN_4 + +.L13_100: + + decq J // j -- + jg .L12_01 + + + + +/**************************************************************************************************/ + +.L8_0: + + cmpq $ 0, Nmod12 // N % 12 == 0 + je .L999 + + movq Nmod12, J + sarq $3, J // j = j / 8 + je .L4_0 + +.L8_10: + movq C, CO1 + leaq (C, LDC, 8), C // c += 4 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je 
.L8_20 + + ALIGN_4 + +.L8_11: + movq B, BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L8_13 + + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + subq $2, %rax + je .L8_12a + + ALIGN_5 + +.L8_12: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + dec %rax + jne .L8_12 + +.L8_12a: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_13: + + test $1, %rax + jz .L8_14 + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_14: + + INIT4x8 + + +.L8_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L8_19 + + ALIGN_4 + +.L8_17: + + KERNEL4x8_SUB + + dec %rax + jne .L8_17 + ALIGN_4 + + +.L8_19: + + SAVE4x8 + + decq I # i -- + jg .L8_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L8_20: + // Test rest of M + + testq $3, M + jz .L8_100 // to next 16 lines of N + + +.L8_30: + testq $2, M + jz .L8_40 + + ALIGN_4 + +.L8_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x8 + + movq K, %rax + + sarq $3, %rax + je .L8_36 + ALIGN_4 + +.L8_32: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + dec %rax + jne .L8_32 + ALIGN_4 + +.L8_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L8_39 + + ALIGN_4 + +.L8_37: + + KERNEL2x8_SUB + + dec %rax + jne .L8_37 + + +.L8_39: + + SAVE2x8 + +.L8_40: + testq $1, M + jz .L8_100 // to next 3 lines of N + + ALIGN_4 + +.L8_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x8 + + movq K, %rax + + sarq 
$3,%rax + je .L8_46 + + ALIGN_4 + +.L8_42: + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + dec %rax + jne .L8_42 + ALIGN_4 + +.L8_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L8_49 + + ALIGN_4 + +.L8_47: + + KERNEL1x8_SUB + + dec %rax + jne .L8_47 + ALIGN_4 + + +.L8_49: + + SAVE1x8 + + ALIGN_4 + +.L8_100: + + movq K, %rax + salq $3, %rax // * 8 + leaq (B , %rax, SIZE), B + decq J // j -- + jg .L8_10 + + + +/**************************************************************************************************/ + +.L4_0: + + cmpq $ 0, Nmod12 // N % 12 == 0 + je .L999 + + movq Nmod12, J + testq $4, J // j = j / 4 + je .L2_0 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + movq B, BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + + decq I # i -- + jg .L4_11 + + ALIGN_4 + 
+/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x4 + + movq K, %rax + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x4 + + movq K, %rax + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + + ALIGN_4 + +.L4_100: + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x2 + + movq K, %rax + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + 
KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x2 + + movq K, %rax + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x2 + + movq K, %rax + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +.L2_100: + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i 
= m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x1 + + movq K, %rax + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + dec %rax + jne .L1_12 + + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x1 + + movq K, %rax + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x1 + + movq K, %rax + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +.L1_100: + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + 
vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $8, %rdi + divq %rdi // N / 8 + movq %rax, Ndiv12 // N / 8 + movq %rdx, Nmod12 // N % 8 + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +/*************************************************************************************************/ +.L8_0: + movq 
Ndiv12, J + cmpq $ 0, J + je .L4_0 + ALIGN_4 + +.L8_10: + movq C, CO1 + leaq (C, LDC, 8), C // c += 8 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L8_20 + + ALIGN_4 + +.L8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,8), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $8, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L8_13 + + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + subq $2, %rax + je .L8_12a + + ALIGN_5 + +.L8_12: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + dec %rax + jne .L8_12 + +.L8_12a: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_13: + + test $1, %rax + jz .L8_14 + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_14: + + INIT4x8 + + +.L8_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L8_19 + + ALIGN_4 + +.L8_17: + + KERNEL4x8_SUB + + dec %rax + jne .L8_17 + ALIGN_4 + + +.L8_19: + + SAVE4x8 + +#if (defined(TRMMKERNEL) && 
defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 8), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + decq I # i -- + jg .L8_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L8_20: + // Test rest of M + + testq $3, M + jz .L8_100 // to next 16 lines of N + + +.L8_30: + testq $2, M + jz .L8_40 + + ALIGN_4 + +.L8_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,8), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $8, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x8 + + sarq $3, %rax + je .L8_36 + ALIGN_4 + +.L8_32: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + dec %rax + jne .L8_32 + ALIGN_4 + +.L8_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L8_39 + + ALIGN_4 + +.L8_37: + + KERNEL2x8_SUB + + dec %rax + jne .L8_37 + + +.L8_39: + + SAVE2x8 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax 
+ salq $3, %rax // rax + SIZE + leaq (BO, %rax, 8), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L8_40: + testq $1, M + jz .L8_100 // to next 3 lines of N + + ALIGN_4 + +.L8_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,8), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $8, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x8 + + sarq $3,%rax + je .L8_46 + + ALIGN_4 + +.L8_42: + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + dec %rax + jne .L8_42 + ALIGN_4 + +.L8_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L8_49 + + ALIGN_4 + +.L8_47: + + KERNEL1x8_SUB + + dec %rax + jne .L8_47 + ALIGN_4 + + +.L8_49: + + SAVE1x8 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 8), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + +.L8_100: + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $8, KK // number of values in B +#endif + + + decq J // j -- + jg .L8_10 + + + + + 
+/*************************************************************************************************/ +.L4_0: + movq Nmod12, J + testq $4, J + je .L2_0 + ALIGN_4 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: 
+ + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x4 + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x4 + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + +.L4_100: + +#if defined(TRMMKERNEL) && 
!defined(LEFT) + addq $4, KK // number of values in B +#endif + + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x2 + + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 4), AO // 
number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x2 + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L2_40: + testq 
$1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x2 + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + +.L2_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK // number of values in B +#endif + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && 
defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x1 + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + dec %rax + jne .L1_12 + + +.L1_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x1 + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) 
&& defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x1 + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + + +.L1_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK // number of values in B +#endif + + + +.L999: + + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c index 90a4c2b1de..a5daffb94b 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c @@ -1,670 +1,670 @@ -#include "common.h" -#include 
-#include - -//register usage: zmm3 for alpha, zmm0-zmm2 and zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators. - -/* row-major c_block */ -#define INNER_KERNEL_k1m1n8 \ - "prefetcht0 384(%1);"\ - "vmovupd (%1),%%zmm5; addq $64,%1;"\ - "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8;" - -#define INNER_KERNEL_k1m2n8 \ - INNER_KERNEL_k1m1n8\ - "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;" - -#define INNER_KERNEL_k1m1n16 \ - "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\ - "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\ - "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;" - -#define INNER_KERNEL_k1m2n16 \ - INNER_KERNEL_k1m1n16\ - "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;" - -#define INNER_KERNEL_k1m1n24 \ - "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\ - "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\ - "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;" - -#define INNER_KERNEL_k1m2n24 \ - INNER_KERNEL_k1m1n24\ - "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;" - -/* row-major z-partition c_block */ -#define INNER_KERNEL_k1m4n8 \ - "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5; addq $32,%0;"\ - "vmovddup (%1),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm8; vfmadd231pd %%zmm5,%%zmm6,%%zmm10;"\ - "vmovddup 8(%1),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm9; vfmadd231pd %%zmm5,%%zmm7,%%zmm11;" - -#define INNER_KERNEL_k1m4n16 \ - INNER_KERNEL_k1m4n8\ - "vmovddup (%1,%%r12,2),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm12; vfmadd231pd %%zmm5,%%zmm6,%%zmm14;"\ - "vmovddup 8(%1,%%r12,2),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm13; vfmadd231pd %%zmm5,%%zmm7,%%zmm15;" - -#define 
INNER_KERNEL_k1m4n24 \ - INNER_KERNEL_k1m4n16\ - "vmovddup (%1,%%r12,4),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm16; vfmadd231pd %%zmm5,%%zmm6,%%zmm18;"\ - "vmovddup 8(%1,%%r12,4),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm17; vfmadd231pd %%zmm5,%%zmm7,%%zmm19;" - -#define INNER_KERNEL_k1m8n8 \ - "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5;"\ - "vbroadcastf32x4 (%0,%%r12,1),%%zmm6; vbroadcastf32x4 16(%0,%%r12,1),%%zmm7; addq $32,%0;"\ - "prefetcht0 128(%1);"\ - "vmovddup (%1),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm8; vfmadd231pd %%zmm5,%%zmm2,%%zmm10;"\ - "vfmadd231pd %%zmm6,%%zmm2,%%zmm12; vfmadd231pd %%zmm7,%%zmm2,%%zmm14;"\ - "vmovddup 8(%1),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm9; vfmadd231pd %%zmm5,%%zmm1,%%zmm11;"\ - "vfmadd231pd %%zmm6,%%zmm1,%%zmm13; vfmadd231pd %%zmm7,%%zmm1,%%zmm15;" - -#define INNER_KERNEL_k1m8n16 \ - INNER_KERNEL_k1m8n8\ - "prefetcht0 128(%1,%%r12,2);"\ - "vmovddup (%1,%%r12,2),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm16; vfmadd231pd %%zmm5,%%zmm2,%%zmm18;"\ - "vfmadd231pd %%zmm6,%%zmm2,%%zmm20; vfmadd231pd %%zmm7,%%zmm2,%%zmm22;"\ - "vmovddup 8(%1,%%r12,2),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm17; vfmadd231pd %%zmm5,%%zmm1,%%zmm19;"\ - "vfmadd231pd %%zmm6,%%zmm1,%%zmm21; vfmadd231pd %%zmm7,%%zmm1,%%zmm23;" - -#define INNER_KERNEL_k1m8n24 \ - INNER_KERNEL_k1m8n16\ - "prefetcht0 128(%1,%%r12,4);"\ - "vmovddup (%1,%%r12,4),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm24; vfmadd231pd %%zmm5,%%zmm2,%%zmm26;"\ - "vfmadd231pd %%zmm6,%%zmm2,%%zmm28; vfmadd231pd %%zmm7,%%zmm2,%%zmm30;"\ - "vmovddup 8(%1,%%r12,4),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm25; vfmadd231pd %%zmm5,%%zmm1,%%zmm27;"\ - "vfmadd231pd %%zmm6,%%zmm1,%%zmm29; vfmadd231pd %%zmm7,%%zmm1,%%zmm31;" - -/* micro kernels */ -#define INNER_KERNELm1(nn) \ - "cmpq $1,%2;jb "#nn"3f;"\ - #nn"4:\n\t"\ - INNER_KERNEL_k1m1n##nn "addq $8,%0;"\ - "decq %2;cmpq $1,%2;jnb "#nn"4b;"\ - #nn"3:\n\t" - -#define INNER_KERNELm2(nn) \ - "cmpq $1,%2;jb "#nn"0f;"\ - #nn"1:\n\t"\ - 
INNER_KERNEL_k1m2n##nn "addq $16,%0;"\ - "decq %2;cmpq $1,%2;jnb "#nn"1b;"\ - #nn"0:\n\t" - -#define INNER_KERNELm4(nn) \ - "cmpq $1,%2;jb "#nn"00f;"\ - #nn"01:\n\t"\ - INNER_KERNEL_k1m4n##nn "addq $64,%1;"\ - "decq %2;cmpq $1,%2;jnb "#nn"01b;"\ - #nn"00:\n\t" - -/* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */ -#define INNER_KERNELm8(nn) \ - "movq %3,%10;cmpq $18,%2;jb "#nn"001f;"\ - #nn"008:\n\t"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - "prefetcht1 (%11); addq $32,%11;"\ - "subq $6,%2;cmpq $18,%2;jnb "#nn"008b;"\ - "movq %3,%10;"\ - #nn"001:\n\t"\ - "cmpq $1,%2;jb "#nn"000f;"\ - "prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - "decq %2;jmp "#nn"001b;"\ - ""#nn"000:\n\t" - -#define INNER_INIT_m1n8 \ - "vpxorq %%zmm8, %%zmm8, %%zmm8;" - -#define INNER_INIT_m2n8 \ - "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9;" - -#define INNER_INIT_m4n8 \ - "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;" - -#define INNER_INIT_m8n8 \ - INNER_INIT_m4n8\ - "vpxorq %%zmm12,%%zmm12,%%zmm12;vpxorq %%zmm13,%%zmm13,%%zmm13;vpxorq %%zmm14,%%zmm14,%%zmm14;vpxorq %%zmm15,%%zmm15,%%zmm15;" - -#define INNER_INIT_m1n16 INNER_INIT_m2n8 - -#define INNER_INIT_m2n16 INNER_INIT_m4n8 - -#define INNER_INIT_m4n16 INNER_INIT_m8n8 - -#define INNER_INIT_m8n16 \ - INNER_INIT_m8n8\ - "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"\ - "vpxorq %%zmm20,%%zmm20,%%zmm20;vpxorq %%zmm21,%%zmm21,%%zmm21;vpxorq 
%%zmm22,%%zmm22,%%zmm22;vpxorq %%zmm23,%%zmm23,%%zmm23;" - -#define INNER_INIT_m1n24 \ - "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;" - -#define INNER_INIT_m2n24 \ - INNER_INIT_m1n24\ - "vpxorq %%zmm11,%%zmm11,%%zmm11; vpxorq %%zmm12,%%zmm12,%%zmm12; vpxorq %%zmm13,%%zmm13,%%zmm13;" - -#define INNER_INIT_m4n24 \ - INNER_INIT_m4n16\ - "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;" - -#define INNER_INIT_m8n24 \ - INNER_INIT_m8n16\ - "vpxorq %%zmm24,%%zmm24,%%zmm24;vpxorq %%zmm25,%%zmm25,%%zmm25;vpxorq %%zmm26,%%zmm26,%%zmm26;vpxorq %%zmm27,%%zmm27,%%zmm27;"\ - "vpxorq %%zmm28,%%zmm28,%%zmm28;vpxorq %%zmm29,%%zmm29,%%zmm29;vpxorq %%zmm30,%%zmm30,%%zmm30;vpxorq %%zmm31,%%zmm31,%%zmm31;" - -#define INNER_SETINDEX \ - "vpinsrq $0,%4,%%xmm4,%%xmm4; vbroadcastsd %%xmm4,%%zmm4;"\ - "kxnorw %%k1,%%k1,%%k1; kshiftlw $1,%%k1,%%k1; vpxorq %%zmm6,%%zmm6,%%zmm6; vmovapd %%zmm4,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};" - -#define INNER_STORE_m1n8(c1,disp) \ - "kxnorw %%k1,%%k1,%%k1;"\ - "vgatherqpd "#disp"(%10,%%zmm6,1), %%zmm7 %{%%k1%};"\ - "vfmadd132pd %%zmm3,%%zmm7,"#c1";"\ - "kxnorw %%k1,%%k1,%%k1;"\ - "vscatterqpd "#c1", "#disp"(%10,%%zmm6,1) %{%%k1%};" - -#define INNER_SAVE_m1n8 \ - "movq %3,%10;"\ - INNER_SETINDEX\ - INNER_STORE_m1n8(%%zmm8,0) - -#define INNER_SAVE_m1n16 \ - INNER_SAVE_m1n8\ - "leaq (%10,%4,8),%10;"\ - INNER_STORE_m1n8(%%zmm9,0) - -#define INNER_SAVE_m1n24 \ - INNER_SAVE_m1n16\ - "leaq (%10,%4,8),%10;"\ - INNER_STORE_m1n8(%%zmm10,0) - -#define INNER_SAVE_m2n8 \ - "movq 
%3,%10;"\ - INNER_SETINDEX\ - INNER_STORE_m1n8(%%zmm8,0)\ - INNER_STORE_m1n8(%%zmm9,8) - -#define INNER_SAVE_m2n16 \ - "movq %3,%10;"\ - INNER_SETINDEX\ - INNER_STORE_m1n8(%%zmm8,0)\ - INNER_STORE_m1n8(%%zmm10,8)\ - "leaq (%10,%4,8),%10;"\ - INNER_STORE_m1n8(%%zmm9,0)\ - INNER_STORE_m1n8(%%zmm11,8) - -#define INNER_SAVE_m2n24 \ - "movq %3,%10;"\ - INNER_SETINDEX\ - INNER_STORE_m1n8(%%zmm8,0)\ - INNER_STORE_m1n8(%%zmm11,8)\ - "leaq (%10,%4,8),%10;"\ - INNER_STORE_m1n8(%%zmm9,0)\ - INNER_STORE_m1n8(%%zmm12,8)\ - "leaq (%10,%4,8),%10;"\ - INNER_STORE_m1n8(%%zmm10,0)\ - INNER_STORE_m1n8(%%zmm13,8) - -#define INNER_TRANS_4x8(c1,c2,c3,c4) \ - "vblendmpd "#c3","#c1",%%zmm4%{%6%}; vblendmpd "#c4","#c2",%%zmm6%{%6%};"\ - "vshuff64x2 $177,%%zmm4,%%zmm4,%%zmm4; vshuff64x2 $177,%%zmm6,%%zmm6,%%zmm6;"\ - "vblendmpd "#c1",%%zmm4,"#c1"%{%6%}; vblendmpd "#c2",%%zmm6,"#c2"%{%6%};"\ - "vblendmpd %%zmm4,"#c3","#c3"%{%6%}; vblendmpd %%zmm6,"#c4","#c4"%{%6%};"\ - -#define INNER_TRANS_f128_4x4(c1,c2,c3,c4) \ - "vshuff64x2 $68,"#c3","#c1",%%zmm4; vshuff64x2 $17,"#c4","#c2",%%zmm5;"\ - "vshuff64x2 $238,"#c3","#c1",%%zmm6; vshuff64x2 $187,"#c4","#c2",%%zmm7;"\ - "vblendmpd %%zmm5,%%zmm4,"#c2"%{%6%}; vshuff64x2 $177,"#c2","#c2","#c2"; vblendmpd %%zmm4,%%zmm5,"#c1"%{%6%};"\ - "vblendmpd %%zmm7,%%zmm6,"#c4"%{%6%}; vshuff64x2 $177,"#c4","#c4","#c4"; vblendmpd %%zmm6,%%zmm7,"#c3"%{%6%};" - -#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ - INNER_TRANS_f128_4x4(c1,c3,c5,c7) INNER_TRANS_f128_4x4(c2,c4,c6,c8) - -//%7 for k01(input) only when m=4 -#define INNER_STORE_4x8(c1,c2,c3,c4) \ - "vmovupd (%10),%%zmm4%{%5%};vmovupd -32(%10,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\ - "vmovupd "#c1",(%10)%{%5%}; vmovupd "#c1",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ - "vmovupd (%10),%%zmm5%{%5%};vmovupd -32(%10,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\ - "vmovupd "#c2",(%10)%{%5%}; vmovupd "#c2",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ - "vmovupd (%10),%%zmm6%{%5%};vmovupd 
-32(%10,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\ - "vmovupd "#c3",(%10)%{%5%}; vmovupd "#c3",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ - "vmovupd (%10),%%zmm7%{%5%};vmovupd -32(%10,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\ - "vmovupd "#c4",(%10)%{%5%}; vmovupd "#c4",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ - "leaq (%10,%4,4),%10;" - -#define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ - "vfmadd213pd (%10),%%zmm3,"#c1"; vmovupd "#c1",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%10,%4,1); leaq (%10,%4,2),%10;"\ - "vfmadd213pd (%10),%%zmm3,"#c3"; vmovupd "#c3",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%10,%4,1); leaq (%10,%4,2),%10;"\ - "vfmadd213pd (%10),%%zmm3,"#c5"; vmovupd "#c5",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%10,%4,1); leaq (%10,%4,2),%10;"\ - "vfmadd213pd (%10),%%zmm3,"#c7"; vmovupd "#c7",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%10,%4,1); leaq (%10,%4,2),%10;" - -#define INNER_SAVE_m4n8 \ - "movq %3,%10;"\ - INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\ - INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11) - -#define INNER_SAVE_m4n16 \ - INNER_SAVE_m4n8\ - INNER_TRANS_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ - INNER_STORE_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15) - -#define INNER_SAVE_m4n24 \ - INNER_SAVE_m4n16\ - INNER_TRANS_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)\ - INNER_STORE_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19) - -#define INNER_SAVE_m8n8 \ - "movq %3,%10;"\ - INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ - INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15) - -#define INNER_SAVE_m8n16 \ - INNER_SAVE_m8n8\ - INNER_TRANS_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)\ - INNER_STORE_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23) - -#define INNER_SAVE_m8n24 \ - INNER_SAVE_m8n16\ - INNER_TRANS_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)\ - 
INNER_STORE_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31) - -#define COMPUTE_n8 {\ - b_pref = packed_b_pointer + 8 * K;\ - __asm__ __volatile__(\ - "vbroadcastsd (%9),%%zmm3;"\ - "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ - "cmpq $8,%8; jb 42222f;"\ - "42221:\n\t"\ - INNER_INIT_m8n8\ - INNER_KERNELm8(8)\ - INNER_SAVE_m8n8\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ - "addq $64,%3;"\ - "subq $8,%8; cmpq $8,%8; jnb 42221b;"\ - "42222:\n\t"\ - "cmpq $4,%8; jb 42223f;"\ - INNER_INIT_m4n8\ - INNER_KERNELm4(8)\ - INNER_SAVE_m4n8\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $32,%3;"\ - "subq $4,%8;"\ - "42223:\n\t"\ - "cmpq $2,%8; jb 42224f;"\ - INNER_INIT_m2n8\ - INNER_KERNELm2(8)\ - INNER_SAVE_m2n8\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $16,%3;"\ - "subq $2,%8;"\ - "42224:\n\t"\ - "cmpq $1,%8; jb 42225f;"\ - INNER_INIT_m1n8\ - INNER_KERNELm1(8)\ - INNER_SAVE_m1n8\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $8,%3;"\ - "42225:\n\t"\ - "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ - "shlq $3,%4;addq %4,%3;shrq $3,%4;"\ - :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ - "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ - ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\ - a_block_pointer -= M * K;\ -} -#define COMPUTE_n16 {\ - b_pref = packed_b_pointer + 16 * K;\ - __asm__ __volatile__(\ - "vbroadcastsd (%9),%%zmm3;"\ - "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ - "cmpq $8,%8; jb 32222f;"\ - "32221:\n\t"\ - INNER_INIT_m8n16\ - INNER_KERNELm8(16)\ - INNER_SAVE_m8n16\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ - "addq $64,%3;"\ - "subq $8,%8; cmpq $8,%8; jnb 32221b;"\ - "32222:\n\t"\ - "cmpq $4,%8; jb 32223f;"\ - INNER_INIT_m4n16\ - 
INNER_KERNELm4(16)\ - INNER_SAVE_m4n16\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $32,%3;"\ - "subq $4,%8;"\ - "32223:\n\t"\ - "cmpq $2,%8; jb 32224f;"\ - INNER_INIT_m2n16\ - INNER_KERNELm2(16)\ - INNER_SAVE_m2n16\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $16,%3;"\ - "subq $2,%8;"\ - "32224:\n\t"\ - "cmpq $1,%8; jb 32225f;"\ - INNER_INIT_m1n16\ - INNER_KERNELm1(16)\ - INNER_SAVE_m1n16\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $8,%3;"\ - "32225:\n\t"\ - "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ - "shlq $4,%4;addq %4,%3;shrq $4,%4;"\ - "leaq (%1,%%r12,4),%1;"\ - :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ - "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ - ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ - "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\ - a_block_pointer -= M * K;\ -} -#define COMPUTE_n24 {\ - b_pref = packed_b_pointer + 24 * K;\ - __asm__ __volatile__(\ - "vbroadcastsd (%9),%%zmm3;"\ - "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ - "cmpq $8,%8; jb 22222f;"\ - "22221:\n\t"\ - INNER_INIT_m8n24\ - INNER_KERNELm8(24)\ - INNER_SAVE_m8n24\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ - "addq $64,%3;"\ - "subq $8,%8; cmpq $8,%8; jnb 22221b;"\ - "22222:\n\t"\ - "cmpq $4,%8; jb 22223f;"\ - INNER_INIT_m4n24\ - INNER_KERNELm4(24)\ - INNER_SAVE_m4n24\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $32,%3;"\ - "subq $4,%8;"\ - "22223:\n\t"\ - "cmpq $2,%8; jb 22224f;"\ - INNER_INIT_m2n24\ - INNER_KERNELm2(24)\ - INNER_SAVE_m2n24\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $16,%3;"\ - "subq $2,%8;"\ - "22224:\n\t"\ - "cmpq $1,%8; jb 22225f;"\ - INNER_INIT_m1n24\ - INNER_KERNELm1(24)\ - INNER_SAVE_m1n24\ - "movq %%r13,%2; subq %%r12,%1; subq 
%%r12,%1;"\ - "addq $8,%3;"\ - "22225:\n\t"\ - "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ - "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\ - "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\ - :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ - "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)::\ - "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18",\ - "zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ - a_block_pointer -= M * K;\ -} -static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=4,ocopy=8 -//perform C += A B - if(k==0 || m==0 || ndiv8==0) return; - int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double); - int64_t K = (int64_t)k; int64_t M = (int64_t)m; - double *a_block_pointer,*b_pref; - double *c_pointer = c,*c_store = c; - __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033; - BLASLONG ndiv8_count; - double *packed_b_pointer = packed_b; - a_block_pointer = packed_a; - for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){ - COMPUTE_n24 - } - for(;ndiv8_count>1;ndiv8_count-=2){ - COMPUTE_n16 - } - if(ndiv8_count>0){ - COMPUTE_n8 - } -} - -/* __m256d accumulators: yc1-yc4; temporary variables: ya1,yb1-yb2 */ -/* __m128d accumulators: xc1-xc2; temporary variables: xa1,xb1-xb2 */ -/* double accumulator: sc1; temporary variables: sa1,sb1 */ -/* column-major c_block */ -#define KERNEL_m4n4k1 {\ - ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ - yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ - yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ - yb1 = _mm256_broadcast_sd(b_block_pointer+2); yc3 = _mm256_fmadd_pd(ya1,yb1,yc3);\ - 
yb2 = _mm256_broadcast_sd(b_block_pointer+3); yc4 = _mm256_fmadd_pd(ya1,yb2,yc4);\ - b_block_pointer+=4;\ -} -#define KERNEL_m4n2k1 {\ - ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ - yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ - yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ - b_block_pointer+=2;\ -} -#define KERNEL_m4n1k1 {\ - ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ - yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ - b_block_pointer++;\ -} -#define INIT_m4n1 yc1=_mm256_setzero_pd(); -#define INIT_m4n2 yc2=INIT_m4n1 -#define INIT_m4n4 yc4=yc3=INIT_m4n2 -#define SAVE_m4n1 {\ - yb1 = _mm256_broadcast_sd(alpha);\ - ya1 = _mm256_loadu_pd(c_pointer);\ - yc1 = _mm256_fmadd_pd(yc1,yb1,ya1);\ - _mm256_storeu_pd(c_pointer,yc1);\ - c_pointer += 4;\ -} -#define SAVE_m4n2 {\ - ya1 = _mm256_broadcast_sd(alpha);\ - yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ - yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ - _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ - c_pointer += 4;\ -} -#define SAVE_m4n4 {\ - ya1 = _mm256_broadcast_sd(alpha);\ - yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ - yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ - _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ - c_pointer += LDC*2;\ - yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ - yc3 = _mm256_fmadd_pd(yc3,ya1,yb1); yc4 = _mm256_fmadd_pd(yc4,ya1,yb2);\ - _mm256_storeu_pd(c_pointer,yc3); _mm256_storeu_pd(c_pointer+LDC,yc4);\ - c_pointer += 4-LDC*2;\ -} -#define KERNEL_m2n2k1 {\ - xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ - xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ - xb2 = _mm_loaddup_pd(b_block_pointer+1); xc2 = _mm_fmadd_pd(xa1,xb2,xc2);\ - 
b_block_pointer += 2;\ -} -#define KERNEL_m2n1k1 {\ - xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ - xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ - b_block_pointer ++;\ -} -#define INIT_m2n1 xc1=_mm_setzero_pd(); -#define INIT_m2n2 xc2=INIT_m2n1 -#define SAVE_m2n1 {\ - xb1 = _mm_loaddup_pd(alpha);\ - xa1 = _mm_loadu_pd(c_pointer);\ - xc1 = _mm_fmadd_pd(xc1,xb1,xa1);\ - _mm_storeu_pd(c_pointer,xc1);\ - c_pointer += 2;\ -} -#define SAVE_m2n2 {\ - xa1 = _mm_loaddup_pd(alpha);\ - xb1 = _mm_loadu_pd(c_pointer); xb2 = _mm_loadu_pd(c_pointer+LDC);\ - xc1 = _mm_fmadd_pd(xc1,xa1,xb1); xc2 = _mm_fmadd_pd(xc2,xa1,xb2);\ - _mm_storeu_pd(c_pointer,xc1); _mm_storeu_pd(c_pointer+LDC,xc2);\ - c_pointer += 2;\ -} -#define KERNEL_m1n1k1 {\ - sa1 = *a_block_pointer; a_block_pointer++;\ - sb1 = *b_block_pointer; sc1 += sa1 * sb1;\ - b_block_pointer ++;\ -} -#define INIT_m1n1 sc1=0.0; -#define SAVE_m1n1 {\ - *c_pointer += sc1 * (*alpha);\ - c_pointer++;\ -} -/* row-major c_block */ -#define KERNEL_m2n4k1 {\ - yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ - ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ - ya1 = _mm256_broadcast_sd(a_block_pointer+1);yc2 = _mm256_fmadd_pd(ya1,yb1,yc2);\ - a_block_pointer += 2;\ -} -#define KERNEL_m1n4k1 {\ - yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ - ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ - a_block_pointer ++;\ -} -#define KERNEL_m1n2k1 {\ - xb1 = _mm_loadu_pd(b_block_pointer);b_block_pointer+=2;\ - xa1 = _mm_loaddup_pd(a_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ - a_block_pointer ++;\ -} -#define INIT_m1n2 INIT_m2n1 -#define INIT_m1n4 INIT_m4n1 -#define INIT_m2n4 INIT_m4n2 -#define SAVE_m2n4 {\ - ya1 = _mm256_broadcast_sd(alpha);\ - yc1 = _mm256_mul_pd(yc1,ya1);\ - yc2 = _mm256_mul_pd(yc2,ya1);\ - yb1 = _mm256_unpacklo_pd(yc1,yc2);\ - yb2 = _mm256_unpackhi_pd(yc1,yc2);\ - xb1 = 
_mm_add_pd(_mm_loadu_pd(c_pointer),_mm256_extractf128_pd(yb1,0));\ - xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+LDC),_mm256_extractf128_pd(yb2,0));\ - _mm_storeu_pd(c_pointer,xb1);\ - _mm_storeu_pd(c_pointer+LDC,xb2);\ - xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer+2*LDC),_mm256_extractf128_pd(yb1,1));\ - xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+3*LDC),_mm256_extractf128_pd(yb2,1));\ - _mm_storeu_pd(c_pointer+2*LDC,xb1);\ - _mm_storeu_pd(c_pointer+3*LDC,xb2);\ - c_pointer += 2;\ -} -#define SAVE_m1n2 {\ - xb1 = _mm_loaddup_pd(alpha);\ - xc1 = _mm_mul_pd(xc1,xb1);\ - *c_pointer += _mm_cvtsd_f64(xc1);\ - xa1 = _mm_unpackhi_pd(xc1,xc1);\ - c_pointer[LDC]+= _mm_cvtsd_f64(xa1);\ - c_pointer ++;\ -} -#define SAVE_m1n4 {\ - ya1 = _mm256_broadcast_sd(alpha);\ - yc1 = _mm256_mul_pd(yc1,ya1);\ - xb1 = _mm256_extractf128_pd(yc1,0);\ - *c_pointer += _mm_cvtsd_f64(xb1);\ - xb2 = _mm_unpackhi_pd(xb1,xb1);\ - c_pointer[LDC] += _mm_cvtsd_f64(xb2);\ - xb1 = _mm256_extractf128_pd(yc1,1);\ - c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\ - xb2 = _mm_unpackhi_pd(xb1,xb1);\ - c_pointer[LDC*3] += _mm_cvtsd_f64(xb2);\ - c_pointer ++;\ -} -static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8 -//perform C += A B , edge_n<8 must be satisfied. 
- if(k==0 || m==0 || edge_n==0 || (*alpha)==0.0) return; - double *a_block_pointer,*b_block_pointer,*b_base_pointer; - double *c_pointer = c; - __m256d yc1,yc2,yc3,yc4,ya1,yb1,yb2; - __m128d xc1,xc2,xa1,xb1,xb2; - double sc1,sa1,sb1; - BLASLONG m_count,n_count,k_count; - b_base_pointer = packed_b; -//now start calculation of the edge part - for(n_count=edge_n;n_count>3;n_count-=4){ - a_block_pointer = packed_a; - for(m_count=m;m_count>3;m_count-=4){ - b_block_pointer = b_base_pointer; - INIT_m4n4 - for(k_count=0;k_count1;m_count-=2){ - b_block_pointer = b_base_pointer; - INIT_m2n4 - for(k_count=0;k_count0){ - b_block_pointer = b_base_pointer; - INIT_m1n4 - for(k_count=0;k_count1;n_count-=2){ - a_block_pointer = packed_a; - for(m_count=m;m_count>3;m_count-=4){ - b_block_pointer = b_base_pointer; - INIT_m4n2 - for(k_count=0;k_count1;m_count-=2){ - b_block_pointer = b_base_pointer; - INIT_m2n2 - for(k_count=0;k_count0){ - b_block_pointer = b_base_pointer; - INIT_m1n2 - for(k_count=0;k_count0){ - a_block_pointer = packed_a; - for(m_count=m;m_count>3;m_count-=4){ - b_block_pointer = b_base_pointer; - INIT_m4n1 - for(k_count=0;k_count1;m_count-=2){ - b_block_pointer = b_base_pointer; - INIT_m2n1 - for(k_count=0;k_count0){ - b_block_pointer = b_base_pointer; - INIT_m1n1 - for(k_count=0;k_count0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA); - if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA); - return 0; -} +#include "common.h" +#include +#include + +//register usage: zmm3 for alpha, zmm0-zmm2 and zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators. 
+ +/* row-major c_block */ +#define INNER_KERNEL_k1m1n8 \ + "prefetcht0 384(%1);"\ + "vmovupd (%1),%%zmm5; addq $64,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8;" + +#define INNER_KERNEL_k1m2n8 \ + INNER_KERNEL_k1m1n8\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;" + +#define INNER_KERNEL_k1m1n16 \ + "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\ + "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;" + +#define INNER_KERNEL_k1m2n16 \ + INNER_KERNEL_k1m1n16\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;" + +#define INNER_KERNEL_k1m1n24 \ + "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\ + "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;" + +#define INNER_KERNEL_k1m2n24 \ + INNER_KERNEL_k1m1n24\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;" + +/* row-major z-partition c_block */ +#define INNER_KERNEL_k1m4n8 \ + "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5; addq $32,%0;"\ + "vmovddup (%1),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm8; vfmadd231pd %%zmm5,%%zmm6,%%zmm10;"\ + "vmovddup 8(%1),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm9; vfmadd231pd %%zmm5,%%zmm7,%%zmm11;" + +#define INNER_KERNEL_k1m4n16 \ + INNER_KERNEL_k1m4n8\ + "vmovddup (%1,%%r12,2),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm12; vfmadd231pd %%zmm5,%%zmm6,%%zmm14;"\ + "vmovddup 8(%1,%%r12,2),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm13; vfmadd231pd %%zmm5,%%zmm7,%%zmm15;" + +#define INNER_KERNEL_k1m4n24 \ + INNER_KERNEL_k1m4n16\ + "vmovddup (%1,%%r12,4),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm16; vfmadd231pd 
%%zmm5,%%zmm6,%%zmm18;"\ + "vmovddup 8(%1,%%r12,4),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm17; vfmadd231pd %%zmm5,%%zmm7,%%zmm19;" + +#define INNER_KERNEL_k1m8n8 \ + "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5;"\ + "vbroadcastf32x4 (%0,%%r12,1),%%zmm6; vbroadcastf32x4 16(%0,%%r12,1),%%zmm7; addq $32,%0;"\ + "prefetcht0 128(%1);"\ + "vmovddup (%1),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm8; vfmadd231pd %%zmm5,%%zmm2,%%zmm10;"\ + "vfmadd231pd %%zmm6,%%zmm2,%%zmm12; vfmadd231pd %%zmm7,%%zmm2,%%zmm14;"\ + "vmovddup 8(%1),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm9; vfmadd231pd %%zmm5,%%zmm1,%%zmm11;"\ + "vfmadd231pd %%zmm6,%%zmm1,%%zmm13; vfmadd231pd %%zmm7,%%zmm1,%%zmm15;" + +#define INNER_KERNEL_k1m8n16 \ + INNER_KERNEL_k1m8n8\ + "prefetcht0 128(%1,%%r12,2);"\ + "vmovddup (%1,%%r12,2),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm16; vfmadd231pd %%zmm5,%%zmm2,%%zmm18;"\ + "vfmadd231pd %%zmm6,%%zmm2,%%zmm20; vfmadd231pd %%zmm7,%%zmm2,%%zmm22;"\ + "vmovddup 8(%1,%%r12,2),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm17; vfmadd231pd %%zmm5,%%zmm1,%%zmm19;"\ + "vfmadd231pd %%zmm6,%%zmm1,%%zmm21; vfmadd231pd %%zmm7,%%zmm1,%%zmm23;" + +#define INNER_KERNEL_k1m8n24 \ + INNER_KERNEL_k1m8n16\ + "prefetcht0 128(%1,%%r12,4);"\ + "vmovddup (%1,%%r12,4),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm24; vfmadd231pd %%zmm5,%%zmm2,%%zmm26;"\ + "vfmadd231pd %%zmm6,%%zmm2,%%zmm28; vfmadd231pd %%zmm7,%%zmm2,%%zmm30;"\ + "vmovddup 8(%1,%%r12,4),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm25; vfmadd231pd %%zmm5,%%zmm1,%%zmm27;"\ + "vfmadd231pd %%zmm6,%%zmm1,%%zmm29; vfmadd231pd %%zmm7,%%zmm1,%%zmm31;" + +/* micro kernels */ +#define INNER_KERNELm1(nn) \ + "cmpq $1,%2;jb "#nn"3f;"\ + #nn"4:\n\t"\ + INNER_KERNEL_k1m1n##nn "addq $8,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"4b;"\ + #nn"3:\n\t" + +#define INNER_KERNELm2(nn) \ + "cmpq $1,%2;jb "#nn"0f;"\ + #nn"1:\n\t"\ + INNER_KERNEL_k1m2n##nn "addq $16,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"1b;"\ + #nn"0:\n\t" + +#define INNER_KERNELm4(nn) \ + "cmpq $1,%2;jb 
"#nn"00f;"\ + #nn"01:\n\t"\ + INNER_KERNEL_k1m4n##nn "addq $64,%1;"\ + "decq %2;cmpq $1,%2;jnb "#nn"01b;"\ + #nn"00:\n\t" + +/* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */ +#define INNER_KERNELm8(nn) \ + "movq %3,%10;cmpq $18,%2;jb "#nn"001f;"\ + #nn"008:\n\t"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + "prefetcht1 (%11); addq $32,%11;"\ + "subq $6,%2;cmpq $18,%2;jnb "#nn"008b;"\ + "movq %3,%10;"\ + #nn"001:\n\t"\ + "cmpq $1,%2;jb "#nn"000f;"\ + "prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + "decq %2;jmp "#nn"001b;"\ + ""#nn"000:\n\t" + +#define INNER_INIT_m1n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8;" + +#define INNER_INIT_m2n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9;" + +#define INNER_INIT_m4n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;" + +#define INNER_INIT_m8n8 \ + INNER_INIT_m4n8\ + "vpxorq %%zmm12,%%zmm12,%%zmm12;vpxorq %%zmm13,%%zmm13,%%zmm13;vpxorq %%zmm14,%%zmm14,%%zmm14;vpxorq %%zmm15,%%zmm15,%%zmm15;" + +#define INNER_INIT_m1n16 INNER_INIT_m2n8 + +#define INNER_INIT_m2n16 INNER_INIT_m4n8 + +#define INNER_INIT_m4n16 INNER_INIT_m8n8 + +#define INNER_INIT_m8n16 \ + INNER_INIT_m8n8\ + "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"\ + "vpxorq %%zmm20,%%zmm20,%%zmm20;vpxorq %%zmm21,%%zmm21,%%zmm21;vpxorq %%zmm22,%%zmm22,%%zmm22;vpxorq %%zmm23,%%zmm23,%%zmm23;" + +#define INNER_INIT_m1n24 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; 
vpxorq %%zmm10,%%zmm10,%%zmm10;" + +#define INNER_INIT_m2n24 \ + INNER_INIT_m1n24\ + "vpxorq %%zmm11,%%zmm11,%%zmm11; vpxorq %%zmm12,%%zmm12,%%zmm12; vpxorq %%zmm13,%%zmm13,%%zmm13;" + +#define INNER_INIT_m4n24 \ + INNER_INIT_m4n16\ + "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;" + +#define INNER_INIT_m8n24 \ + INNER_INIT_m8n16\ + "vpxorq %%zmm24,%%zmm24,%%zmm24;vpxorq %%zmm25,%%zmm25,%%zmm25;vpxorq %%zmm26,%%zmm26,%%zmm26;vpxorq %%zmm27,%%zmm27,%%zmm27;"\ + "vpxorq %%zmm28,%%zmm28,%%zmm28;vpxorq %%zmm29,%%zmm29,%%zmm29;vpxorq %%zmm30,%%zmm30,%%zmm30;vpxorq %%zmm31,%%zmm31,%%zmm31;" + +#define INNER_SETINDEX \ + "vpinsrq $0,%4,%%xmm4,%%xmm4; vbroadcastsd %%xmm4,%%zmm4;"\ + "kxnorw %%k1,%%k1,%%k1; kshiftlw $1,%%k1,%%k1; vpxorq %%zmm6,%%zmm6,%%zmm6; vmovapd %%zmm4,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};" + +#define INNER_STORE_m1n8(c1,disp) \ + "kxnorw %%k1,%%k1,%%k1;"\ + "vgatherqpd "#disp"(%10,%%zmm6,1), %%zmm7 %{%%k1%};"\ + "vfmadd132pd %%zmm3,%%zmm7,"#c1";"\ + "kxnorw %%k1,%%k1,%%k1;"\ + "vscatterqpd "#c1", "#disp"(%10,%%zmm6,1) %{%%k1%};" + +#define INNER_SAVE_m1n8 \ + "movq %3,%10;"\ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0) + +#define INNER_SAVE_m1n16 \ + INNER_SAVE_m1n8\ + "leaq (%10,%4,8),%10;"\ + INNER_STORE_m1n8(%%zmm9,0) + +#define INNER_SAVE_m1n24 \ + INNER_SAVE_m1n16\ + "leaq (%10,%4,8),%10;"\ + INNER_STORE_m1n8(%%zmm10,0) + +#define INNER_SAVE_m2n8 \ + "movq %3,%10;"\ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm9,8) + +#define INNER_SAVE_m2n16 \ + "movq %3,%10;"\ + INNER_SETINDEX\ 
+ INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm10,8)\ + "leaq (%10,%4,8),%10;"\ + INNER_STORE_m1n8(%%zmm9,0)\ + INNER_STORE_m1n8(%%zmm11,8) + +#define INNER_SAVE_m2n24 \ + "movq %3,%10;"\ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm11,8)\ + "leaq (%10,%4,8),%10;"\ + INNER_STORE_m1n8(%%zmm9,0)\ + INNER_STORE_m1n8(%%zmm12,8)\ + "leaq (%10,%4,8),%10;"\ + INNER_STORE_m1n8(%%zmm10,0)\ + INNER_STORE_m1n8(%%zmm13,8) + +#define INNER_TRANS_4x8(c1,c2,c3,c4) \ + "vblendmpd "#c3","#c1",%%zmm4%{%6%}; vblendmpd "#c4","#c2",%%zmm6%{%6%};"\ + "vshuff64x2 $177,%%zmm4,%%zmm4,%%zmm4; vshuff64x2 $177,%%zmm6,%%zmm6,%%zmm6;"\ + "vblendmpd "#c1",%%zmm4,"#c1"%{%6%}; vblendmpd "#c2",%%zmm6,"#c2"%{%6%};"\ + "vblendmpd %%zmm4,"#c3","#c3"%{%6%}; vblendmpd %%zmm6,"#c4","#c4"%{%6%};"\ + +#define INNER_TRANS_f128_4x4(c1,c2,c3,c4) \ + "vshuff64x2 $68,"#c3","#c1",%%zmm4; vshuff64x2 $17,"#c4","#c2",%%zmm5;"\ + "vshuff64x2 $238,"#c3","#c1",%%zmm6; vshuff64x2 $187,"#c4","#c2",%%zmm7;"\ + "vblendmpd %%zmm5,%%zmm4,"#c2"%{%6%}; vshuff64x2 $177,"#c2","#c2","#c2"; vblendmpd %%zmm4,%%zmm5,"#c1"%{%6%};"\ + "vblendmpd %%zmm7,%%zmm6,"#c4"%{%6%}; vshuff64x2 $177,"#c4","#c4","#c4"; vblendmpd %%zmm6,%%zmm7,"#c3"%{%6%};" + +#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ + INNER_TRANS_f128_4x4(c1,c3,c5,c7) INNER_TRANS_f128_4x4(c2,c4,c6,c8) + +//%7 for k01(input) only when m=4 +#define INNER_STORE_4x8(c1,c2,c3,c4) \ + "vmovupd (%10),%%zmm4%{%5%};vmovupd -32(%10,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\ + "vmovupd "#c1",(%10)%{%5%}; vmovupd "#c1",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ + "vmovupd (%10),%%zmm5%{%5%};vmovupd -32(%10,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\ + "vmovupd "#c2",(%10)%{%5%}; vmovupd "#c2",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ + "vmovupd (%10),%%zmm6%{%5%};vmovupd -32(%10,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\ + "vmovupd "#c3",(%10)%{%5%}; vmovupd "#c3",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ + 
"vmovupd (%10),%%zmm7%{%5%};vmovupd -32(%10,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\ + "vmovupd "#c4",(%10)%{%5%}; vmovupd "#c4",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ + "leaq (%10,%4,4),%10;" + +#define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ + "vfmadd213pd (%10),%%zmm3,"#c1"; vmovupd "#c1",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%10,%4,1); leaq (%10,%4,2),%10;"\ + "vfmadd213pd (%10),%%zmm3,"#c3"; vmovupd "#c3",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%10,%4,1); leaq (%10,%4,2),%10;"\ + "vfmadd213pd (%10),%%zmm3,"#c5"; vmovupd "#c5",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%10,%4,1); leaq (%10,%4,2),%10;"\ + "vfmadd213pd (%10),%%zmm3,"#c7"; vmovupd "#c7",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%10,%4,1); leaq (%10,%4,2),%10;" + +#define INNER_SAVE_m4n8 \ + "movq %3,%10;"\ + INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\ + INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11) + +#define INNER_SAVE_m4n16 \ + INNER_SAVE_m4n8\ + INNER_TRANS_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ + INNER_STORE_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15) + +#define INNER_SAVE_m4n24 \ + INNER_SAVE_m4n16\ + INNER_TRANS_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)\ + INNER_STORE_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19) + +#define INNER_SAVE_m8n8 \ + "movq %3,%10;"\ + INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ + INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15) + +#define INNER_SAVE_m8n16 \ + INNER_SAVE_m8n8\ + INNER_TRANS_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)\ + INNER_STORE_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23) + +#define INNER_SAVE_m8n24 \ + INNER_SAVE_m8n16\ + INNER_TRANS_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)\ + INNER_STORE_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31) + +#define COMPUTE_n8 {\ + b_pref = packed_b_pointer + 8 * K;\ + __asm__ 
__volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ + "cmpq $8,%8; jb 42222f;"\ + "42221:\n\t"\ + INNER_INIT_m8n8\ + INNER_KERNELm8(8)\ + INNER_SAVE_m8n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ + "addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 42221b;"\ + "42222:\n\t"\ + "cmpq $4,%8; jb 42223f;"\ + INNER_INIT_m4n8\ + INNER_KERNELm4(8)\ + INNER_SAVE_m4n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $32,%3;"\ + "subq $4,%8;"\ + "42223:\n\t"\ + "cmpq $2,%8; jb 42224f;"\ + INNER_INIT_m2n8\ + INNER_KERNELm2(8)\ + INNER_SAVE_m2n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $16,%3;"\ + "subq $2,%8;"\ + "42224:\n\t"\ + "cmpq $1,%8; jb 42225f;"\ + INNER_INIT_m1n8\ + INNER_KERNELm1(8)\ + INNER_SAVE_m1n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $8,%3;"\ + "42225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $3,%4;addq %4,%3;shrq $3,%4;"\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ + "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ + ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\ + a_block_pointer -= M * K;\ +} +#define COMPUTE_n16 {\ + b_pref = packed_b_pointer + 16 * K;\ + __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ + "cmpq $8,%8; jb 32222f;"\ + "32221:\n\t"\ + INNER_INIT_m8n16\ + INNER_KERNELm8(16)\ + INNER_SAVE_m8n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ + "addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 32221b;"\ + "32222:\n\t"\ + "cmpq $4,%8; jb 32223f;"\ + INNER_INIT_m4n16\ + INNER_KERNELm4(16)\ + INNER_SAVE_m4n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $32,%3;"\ + "subq $4,%8;"\ + "32223:\n\t"\ + "cmpq $2,%8; jb 
32224f;"\ + INNER_INIT_m2n16\ + INNER_KERNELm2(16)\ + INNER_SAVE_m2n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $16,%3;"\ + "subq $2,%8;"\ + "32224:\n\t"\ + "cmpq $1,%8; jb 32225f;"\ + INNER_INIT_m1n16\ + INNER_KERNELm1(16)\ + INNER_SAVE_m1n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $8,%3;"\ + "32225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $4,%4;addq %4,%3;shrq $4,%4;"\ + "leaq (%1,%%r12,4),%1;"\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ + "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ + ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ + "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\ + a_block_pointer -= M * K;\ +} +#define COMPUTE_n24 {\ + b_pref = packed_b_pointer + 24 * K;\ + __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ + "cmpq $8,%8; jb 22222f;"\ + "22221:\n\t"\ + INNER_INIT_m8n24\ + INNER_KERNELm8(24)\ + INNER_SAVE_m8n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ + "addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 22221b;"\ + "22222:\n\t"\ + "cmpq $4,%8; jb 22223f;"\ + INNER_INIT_m4n24\ + INNER_KERNELm4(24)\ + INNER_SAVE_m4n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $32,%3;"\ + "subq $4,%8;"\ + "22223:\n\t"\ + "cmpq $2,%8; jb 22224f;"\ + INNER_INIT_m2n24\ + INNER_KERNELm2(24)\ + INNER_SAVE_m2n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $16,%3;"\ + "subq $2,%8;"\ + "22224:\n\t"\ + "cmpq $1,%8; jb 22225f;"\ + INNER_INIT_m1n24\ + INNER_KERNELm1(24)\ + INNER_SAVE_m1n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $8,%3;"\ + "22225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\ 
+ "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ + "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)::\ + "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18",\ + "zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ + a_block_pointer -= M * K;\ +} +static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=4,ocopy=8 +//perform C += A B + if(k==0 || m==0 || ndiv8==0) return; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double); + int64_t K = (int64_t)k; int64_t M = (int64_t)m; + double *a_block_pointer,*b_pref; + double *c_pointer = c,*c_store = c; + __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033; + BLASLONG ndiv8_count; + double *packed_b_pointer = packed_b; + a_block_pointer = packed_a; + for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){ + COMPUTE_n24 + } + for(;ndiv8_count>1;ndiv8_count-=2){ + COMPUTE_n16 + } + if(ndiv8_count>0){ + COMPUTE_n8 + } +} + +/* __m256d accumulators: yc1-yc4; temporary variables: ya1,yb1-yb2 */ +/* __m128d accumulators: xc1-xc2; temporary variables: xa1,xb1-xb2 */ +/* double accumulator: sc1; temporary variables: sa1,sb1 */ +/* column-major c_block */ +#define KERNEL_m4n4k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ + yb1 = _mm256_broadcast_sd(b_block_pointer+2); yc3 = _mm256_fmadd_pd(ya1,yb1,yc3);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+3); yc4 = _mm256_fmadd_pd(ya1,yb2,yc4);\ + b_block_pointer+=4;\ +} +#define KERNEL_m4n2k1 {\ + ya1 = 
_mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ + b_block_pointer+=2;\ +} +#define KERNEL_m4n1k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + b_block_pointer++;\ +} +#define INIT_m4n1 yc1=_mm256_setzero_pd(); +#define INIT_m4n2 yc2=INIT_m4n1 +#define INIT_m4n4 yc4=yc3=INIT_m4n2 +#define SAVE_m4n1 {\ + yb1 = _mm256_broadcast_sd(alpha);\ + ya1 = _mm256_loadu_pd(c_pointer);\ + yc1 = _mm256_fmadd_pd(yc1,yb1,ya1);\ + _mm256_storeu_pd(c_pointer,yc1);\ + c_pointer += 4;\ +} +#define SAVE_m4n2 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ + _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ + c_pointer += 4;\ +} +#define SAVE_m4n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ + _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ + c_pointer += LDC*2;\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc3 = _mm256_fmadd_pd(yc3,ya1,yb1); yc4 = _mm256_fmadd_pd(yc4,ya1,yb2);\ + _mm256_storeu_pd(c_pointer,yc3); _mm256_storeu_pd(c_pointer+LDC,yc4);\ + c_pointer += 4-LDC*2;\ +} +#define KERNEL_m2n2k1 {\ + xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ + xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + xb2 = _mm_loaddup_pd(b_block_pointer+1); xc2 = _mm_fmadd_pd(xa1,xb2,xc2);\ + b_block_pointer += 2;\ +} +#define KERNEL_m2n1k1 {\ + xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ + xb1 = 
_mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + b_block_pointer ++;\ +} +#define INIT_m2n1 xc1=_mm_setzero_pd(); +#define INIT_m2n2 xc2=INIT_m2n1 +#define SAVE_m2n1 {\ + xb1 = _mm_loaddup_pd(alpha);\ + xa1 = _mm_loadu_pd(c_pointer);\ + xc1 = _mm_fmadd_pd(xc1,xb1,xa1);\ + _mm_storeu_pd(c_pointer,xc1);\ + c_pointer += 2;\ +} +#define SAVE_m2n2 {\ + xa1 = _mm_loaddup_pd(alpha);\ + xb1 = _mm_loadu_pd(c_pointer); xb2 = _mm_loadu_pd(c_pointer+LDC);\ + xc1 = _mm_fmadd_pd(xc1,xa1,xb1); xc2 = _mm_fmadd_pd(xc2,xa1,xb2);\ + _mm_storeu_pd(c_pointer,xc1); _mm_storeu_pd(c_pointer+LDC,xc2);\ + c_pointer += 2;\ +} +#define KERNEL_m1n1k1 {\ + sa1 = *a_block_pointer; a_block_pointer++;\ + sb1 = *b_block_pointer; sc1 += sa1 * sb1;\ + b_block_pointer ++;\ +} +#define INIT_m1n1 sc1=0.0; +#define SAVE_m1n1 {\ + *c_pointer += sc1 * (*alpha);\ + c_pointer++;\ +} +/* row-major c_block */ +#define KERNEL_m2n4k1 {\ + yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ + ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + ya1 = _mm256_broadcast_sd(a_block_pointer+1);yc2 = _mm256_fmadd_pd(ya1,yb1,yc2);\ + a_block_pointer += 2;\ +} +#define KERNEL_m1n4k1 {\ + yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ + ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + a_block_pointer ++;\ +} +#define KERNEL_m1n2k1 {\ + xb1 = _mm_loadu_pd(b_block_pointer);b_block_pointer+=2;\ + xa1 = _mm_loaddup_pd(a_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + a_block_pointer ++;\ +} +#define INIT_m1n2 INIT_m2n1 +#define INIT_m1n4 INIT_m4n1 +#define INIT_m2n4 INIT_m4n2 +#define SAVE_m2n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yc1 = _mm256_mul_pd(yc1,ya1);\ + yc2 = _mm256_mul_pd(yc2,ya1);\ + yb1 = _mm256_unpacklo_pd(yc1,yc2);\ + yb2 = _mm256_unpackhi_pd(yc1,yc2);\ + xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer),_mm256_extractf128_pd(yb1,0));\ + xb2 = 
_mm_add_pd(_mm_loadu_pd(c_pointer+LDC),_mm256_extractf128_pd(yb2,0));\ + _mm_storeu_pd(c_pointer,xb1);\ + _mm_storeu_pd(c_pointer+LDC,xb2);\ + xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer+2*LDC),_mm256_extractf128_pd(yb1,1));\ + xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+3*LDC),_mm256_extractf128_pd(yb2,1));\ + _mm_storeu_pd(c_pointer+2*LDC,xb1);\ + _mm_storeu_pd(c_pointer+3*LDC,xb2);\ + c_pointer += 2;\ +} +#define SAVE_m1n2 {\ + xb1 = _mm_loaddup_pd(alpha);\ + xc1 = _mm_mul_pd(xc1,xb1);\ + *c_pointer += _mm_cvtsd_f64(xc1);\ + xa1 = _mm_unpackhi_pd(xc1,xc1);\ + c_pointer[LDC]+= _mm_cvtsd_f64(xa1);\ + c_pointer ++;\ +} +#define SAVE_m1n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yc1 = _mm256_mul_pd(yc1,ya1);\ + xb1 = _mm256_extractf128_pd(yc1,0);\ + *c_pointer += _mm_cvtsd_f64(xb1);\ + xb2 = _mm_unpackhi_pd(xb1,xb1);\ + c_pointer[LDC] += _mm_cvtsd_f64(xb2);\ + xb1 = _mm256_extractf128_pd(yc1,1);\ + c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\ + xb2 = _mm_unpackhi_pd(xb1,xb1);\ + c_pointer[LDC*3] += _mm_cvtsd_f64(xb2);\ + c_pointer ++;\ +} +static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8 +//perform C += A B , edge_n<8 must be satisfied. 
+ if(k==0 || m==0 || edge_n==0 || (*alpha)==0.0) return; + double *a_block_pointer,*b_block_pointer,*b_base_pointer; + double *c_pointer = c; + __m256d yc1,yc2,yc3,yc4,ya1,yb1,yb2; + __m128d xc1,xc2,xa1,xb1,xb2; + double sc1,sa1,sb1; + BLASLONG m_count,n_count,k_count; + b_base_pointer = packed_b; +//now start calculation of the edge part + for(n_count=edge_n;n_count>3;n_count-=4){ + a_block_pointer = packed_a; + for(m_count=m;m_count>3;m_count-=4){ + b_block_pointer = b_base_pointer; + INIT_m4n4 + for(k_count=0;k_count1;m_count-=2){ + b_block_pointer = b_base_pointer; + INIT_m2n4 + for(k_count=0;k_count0){ + b_block_pointer = b_base_pointer; + INIT_m1n4 + for(k_count=0;k_count1;n_count-=2){ + a_block_pointer = packed_a; + for(m_count=m;m_count>3;m_count-=4){ + b_block_pointer = b_base_pointer; + INIT_m4n2 + for(k_count=0;k_count1;m_count-=2){ + b_block_pointer = b_base_pointer; + INIT_m2n2 + for(k_count=0;k_count0){ + b_block_pointer = b_base_pointer; + INIT_m1n2 + for(k_count=0;k_count0){ + a_block_pointer = packed_a; + for(m_count=m;m_count>3;m_count-=4){ + b_block_pointer = b_base_pointer; + INIT_m4n1 + for(k_count=0;k_count1;m_count-=2){ + b_block_pointer = b_base_pointer; + INIT_m2n1 + for(k_count=0;k_count0){ + b_block_pointer = b_base_pointer; + INIT_m1n1 + for(k_count=0;k_count0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA); + if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA); + return 0; +} diff --git a/kernel/x86_64/dgemm_kernel_8x2_bulldozer.S b/kernel/x86_64/dgemm_kernel_8x2_bulldozer.S index 40c5892c62..c353a59136 100644 --- a/kernel/x86_64/dgemm_kernel_8x2_bulldozer.S +++ b/kernel/x86_64/dgemm_kernel_8x2_bulldozer.S @@ -1,4413 +1,4413 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. 
*/ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. 
*/ -/*********************************************************************/ - -/********************************************************************* -* 2013/06/02 Saar -* -* Parameter: -* UNROLL_M 8 -* UNROLL_N 2 -* DGEMM_P 360 -* DGEMM_Q 160 -* -* Performance at m x n without prefetch of BO: -* -* 5760x5760 93.4 GFLOPS with 8 threads on 4 modules (ACML: 90.8 GFLOPS) -* 5760x5760 84.2 GFLOPS with 4 threads on 4 modules (ACML: 82.4 GFLOPS) -* 3840x3840 50.3 GFLOPS with 2 threads on 2 modules (ACML: 49.5 GFLOPS) -* -* 5760x5760 56.4 GFLOPS with 4 threads on 2 modules (ACML: 58.5 GFLOPS) -* 3840x3840 29.0 GFLOPS with 2 threads on 1 modules (ACML: 30.2 GFLOPS) -* 3840x3840 26.1 GFLOPS with 1 threads on 1 modules (ACML: 25.9 GFLOPS) -* -*********************************************************************/ - -/********************************************************************* -* 2013/06/03 Saar -* -* Parameter: -* UNROLL_M 8 -* UNROLL_N 2 -* DGEMM_P 336 -* DGEMM_Q 168 -* NO_WARMUP 1 -* NO_AFFINITY 1 -* GEMM_MULTITHREAD_THRESHOLD 4 -* -* Performance at m x n with prefetch of BO: -* -* 8064x3840 93.7 GFLOPS with 8 threads on 4 modules (ACML: 93.6 GFLOPS) -* 6048x2880 85.1 GFLOPS with 4 threads on 4 modules (ACML: 84.2 GFLOPS) -* 6048x2880 52.0 GFLOPS with 2 threads on 2 modules (ACML: 50.0 GFLOPS) -* -* 6048x2880 56.3 GFLOPS with 4 threads on 2 modules (ACML: 57.6 GFLOPS) -* 4032x1920 29.5 GFLOPS with 2 threads on 1 modules (ACML: 30.5 GFLOPS) -* 4032x1920 26.9 GFLOPS with 1 threads on 1 modules (ACML: 26.1 GFLOPS) -* -*********************************************************************/ - -/********************************************************************* -* 2013/06/04 Saar -* -* Parameter: -* UNROLL_M 8 -* UNROLL_N 2 -* DGEMM_P 384 -* DGEMM_Q 168 -* NO_WARMUP 1 -* NO_AFFINITY 1 -* GEMM_MULTITHREAD_THRESHOLD 4 -* -* Performance at m x n with prefetch of BO: -* -* 6144x5376 94.6 GFLOPS with 8 threads on 4 modules (ACML: 90.5 GFLOPS) -* 6144x5376 86.0 GFLOPS with 
4 threads on 4 modules (ACML: 81.5 GFLOPS) -* 4608x4032 52.0 GFLOPS with 2 threads on 2 modules (ACML: 47.5 GFLOPS) -* -* 6144x5376 57.3 GFLOPS with 4 threads on 2 modules (ACML: 56.5 GFLOPS) -* 4608x4032 29.6 GFLOPS with 2 threads on 1 modules (ACML: 30.2 GFLOPS) -* 4608x4032 26.9 GFLOPS with 1 threads on 1 modules (ACML: 25.6 GFLOPS) -* -*********************************************************************/ - - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 -#define LB2_OFFSET 4096 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - - -#define A_PR1 384 -#define B_PR1 192 - -#define KERNEL8x3_1(xx) \ - 
prefetcht0 A_PR1(AO,%rax,8) ;\ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL8x3_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,8) ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL8x3_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,8) ;\ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 2 * SIZE(AO, 
%rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL8x3_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,8) ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ - addq $12, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x3_SUB(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -10 * SIZE(AO, %rax, 8), 
%xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x3_1(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_2(xx) \ - vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_4(xx) \ - vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd 
%xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - addq $12, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x3_SUB(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - - - - - -/*******************************************************************************************/ - -#define KERNEL2x3_1(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_2(xx) \ - vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_4(xx) \ - vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $8, %rax 
;\ - -#define KERNEL2x3_SUB(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x3_1(xx) \ - vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_2(xx) \ - vmovsd -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_4(xx) \ - vmovsd 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x3_SUB(xx) \ - vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - - - 
-/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -#define KERNEL8x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,8) ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,8) ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,8) ;\ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 6 * SIZE(AO, 
%rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,8) ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x2_1(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_2(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, 
BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_4(xx) \ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - - -/*******************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_2(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * 
SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_4(xx) \ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_2(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_4(xx) \ - vmovsd 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x2_SUB(xx) \ - vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - - - -/******************************************************************************************* -* 1 line of N 
-*******************************************************************************************/ - -#define KERNEL8x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,8) ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,8) ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,8) ;\ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,8) ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - addq $4, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_2(xx) \ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_4(xx) \ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - addq $4, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - - -/*******************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_2(xx) \ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define 
KERNEL2x1_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_4(xx) \ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_2(xx) \ - vmovsd -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_4(xx) \ - vmovsd 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x1_SUB(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - 
movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -.L6_01: - // copy to sub buffer - movq K, %rax - salq $1,%rax // K * 2 - movq B, BO1 - leaq (B,%rax,8), BO2 // next offset to BO2 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L6_02a - ALIGN_4 - -.L6_02: - prefetcht0 512(BO1) - prefetcht0 512(BO2) - prefetchw 512(BO) - vmovups (BO1), %xmm0 - vmovups 2*SIZE(BO1), %xmm2 - vmovups 4*SIZE(BO1), %xmm4 - vmovups 6*SIZE(BO1), %xmm6 - vmovsd (BO2), %xmm1 - vmovsd 2*SIZE(BO2), %xmm3 - vmovsd 4*SIZE(BO2), %xmm5 - vmovsd 6*SIZE(BO2), %xmm7 - vmovups %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovups %xmm2, 3*SIZE(BO) - vmovsd %xmm3, 5*SIZE(BO) - vmovups %xmm4, 6*SIZE(BO) - vmovsd %xmm5, 8*SIZE(BO) - vmovups %xmm6, 9*SIZE(BO) - vmovsd %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - decq %rax - jnz .L6_02 - -.L6_02a: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L6_02c - ALIGN_4 - -.L6_02b: - - vmovups (BO1), %xmm0 - vmovsd (BO2), %xmm1 - vmovups %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_02b - -.L6_02c: - - movq K, %rax - salq $1,%rax // K * 2 - leaq (B,%rax,8), BO1 // next offset to BO1 - leaq (BO1,%rax,8), BO2 // next offset to BO1 - leaq BUFFER2, BO // second buffer to 
BO - movq K, %rax - sarq $2, %rax // k / 4 - jz .L6_03a - ALIGN_4 - - -.L6_03: - - prefetcht0 512(BO2) - prefetchw 512(BO) - vmovups (BO2), %xmm0 - vmovups 2*SIZE(BO2), %xmm2 - vmovups 4*SIZE(BO2), %xmm4 - vmovups 6*SIZE(BO2), %xmm6 - vmovsd 1*SIZE(BO1), %xmm1 - vmovsd 3*SIZE(BO1), %xmm3 - vmovsd 5*SIZE(BO1), %xmm5 - vmovsd 7*SIZE(BO1), %xmm7 - vmovsd %xmm1, 0*SIZE(BO) - vmovups %xmm0, 1*SIZE(BO) - vmovsd %xmm3, 3*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovsd %xmm5, 6*SIZE(BO) - vmovups %xmm4, 7*SIZE(BO) - vmovsd %xmm7, 9*SIZE(BO) - vmovups %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - decq %rax - jnz .L6_03 - -.L6_03a: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L6_03c - ALIGN_4 - - -.L6_03b: - - vmovsd 1*SIZE(BO1), %xmm0 - vmovups (BO2), %xmm1 - vmovsd %xmm0, (BO) - vmovups %xmm1, 1*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_03b - - -.L6_03c: - - movq BO2, B // next offset of B - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L6_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L6_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - 
KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L6_16 - - jmp .L6_12 - ALIGN_4 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_17: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L6_17 - ALIGN_4 - - -.L6_19: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) - vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) - - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $7, M - jz .L7_10 // to next 3 lines of N - - testq $4, M - jz .L6_30 - - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - 
movq K, %rax - - andq $-8, %rax - je .L6_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L6_27 - ALIGN_4 - - -.L6_29: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - movq %rax, BI // Index for BO - leaq 
(BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L6_37 - ALIGN_4 - - -.L6_39: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L7_10 // to next 3 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - prefetcht0 
B_PR1+32(BO,BI,8) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L6_47 - ALIGN_4 - - -.L6_49: - - vmovddup ALPHA, %xmm0 - - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - -/***************************************************************************************************************/ - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L7_20 - ALIGN_4 - -.L7_11: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - - vzeroall - - movq K, %rax - - - andq $-8, %rax - je .L7_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - - ALIGN_4 - -.L7_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1(xxx) - 
KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_16 - - jmp .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_17: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L7_17 - ALIGN_4 - - -.L7_19: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) - vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) - - - - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L7_11 - ALIGN_4 - -.L7_20: - // Test rest of M - - testq $7, M - jz .L7_60 // to next 6 lines of N - - testq $4, M - jz .L7_30 - - ALIGN_4 - -.L7_21: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - movq 
%rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L7_27 - ALIGN_4 - - -.L7_29: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, 
%rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L7_37 - ALIGN_4 - - -.L7_39: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - - -.L7_40: - testq $1, M - jz .L7_60 // to next 6 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - - andq $-8, %rax - je .L7_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL1x3_3(xxx) - 
KERNEL1x3_4(xxx) - - je .L7_46 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L7_47 - ALIGN_4 - - -.L7_49: - - vmovddup ALPHA, %xmm0 - - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - - - addq $1 * SIZE, CO1 # coffset += 1 - -.L7_60: - - decq J // j -- - jg .L6_01 - - -.L2_0: - cmpq $0, Nmod6 // N % 6 == 0 - je .L999 - -/************************************************************************************************ -* Loop for Nmod6 / 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - sarq $1, J // j = j / 2 - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L2_20 - - ALIGN_4 - -.L2_11: - - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI 
// BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $7, M 
- jz .L2_60 // to next 2 lines of N - - testq $4, M - jz .L2_30 - - ALIGN_4 - -.L2_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number 
of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - 
KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA, %xmm0 - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L2_60: - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L1_20 - - ALIGN_4 - -.L1_11: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,8) - 
KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $7, M - jz .L999 - - testq $4, M - jz .L1_30 - - ALIGN_4 - -.L1_21: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_29 - - 
movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - - vmovups %xmm4 , (CO1) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - 
ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA, %xmm0 - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - - vmovsd %xmm4 , (CO1) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups 
%xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - -.L2_0: - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; 
number of values - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * 
SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - vmulpd %xmm0, %xmm10,%xmm10 - vmulpd %xmm0, %xmm13,%xmm13 - - vmulpd %xmm0, %xmm5,%xmm5 - vmulpd %xmm0, %xmm8,%xmm8 - vmulpd %xmm0, %xmm11,%xmm11 - vmulpd %xmm0, %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $7, M - jz .L2_60 // to next 2 lines of N - - testq $4, M - jz .L2_30 - - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - - vmulpd %xmm0, %xmm5,%xmm5 - vmulpd %xmm0, %xmm8,%xmm8 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) 
|| \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - 
KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm5,%xmm5 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // 
number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulsd %xmm0, %xmm4,%xmm4 - vmulsd %xmm0, %xmm5,%xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - 
- - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - 
KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - vmulpd %xmm0, %xmm10,%xmm10 - vmulpd %xmm0, %xmm13,%xmm13 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $7, M - jz .L999 - - testq $4, M - jz .L1_30 - - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - -#if (defined(TRMMKERNEL) 
&& defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO 
- - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,8) - 
KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulsd %xmm0, %xmm4,%xmm4 - -#endif - - vmovsd %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - -#endif +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +/********************************************************************* +* 2013/06/02 Saar +* +* Parameter: +* UNROLL_M 8 +* UNROLL_N 2 +* DGEMM_P 360 +* DGEMM_Q 160 +* +* Performance at m x n without prefetch of BO: +* +* 5760x5760 93.4 GFLOPS with 8 threads on 4 modules (ACML: 90.8 GFLOPS) +* 5760x5760 84.2 GFLOPS with 4 threads on 4 modules (ACML: 82.4 GFLOPS) +* 3840x3840 50.3 GFLOPS with 2 threads on 2 modules (ACML: 49.5 GFLOPS) +* +* 5760x5760 56.4 GFLOPS with 4 threads on 2 modules (ACML: 58.5 GFLOPS) +* 3840x3840 29.0 GFLOPS with 2 threads on 1 modules (ACML: 30.2 GFLOPS) +* 3840x3840 26.1 GFLOPS with 1 threads on 1 modules (ACML: 25.9 GFLOPS) +* +*********************************************************************/ + +/********************************************************************* +* 2013/06/03 Saar +* +* Parameter: +* UNROLL_M 8 +* UNROLL_N 2 +* DGEMM_P 336 +* DGEMM_Q 168 +* NO_WARMUP 1 +* NO_AFFINITY 1 +* GEMM_MULTITHREAD_THRESHOLD 4 +* +* Performance at m x n with prefetch of BO: +* +* 8064x3840 93.7 GFLOPS with 8 threads on 4 modules (ACML: 93.6 GFLOPS) +* 6048x2880 85.1 GFLOPS with 4 threads on 4 modules (ACML: 84.2 GFLOPS) +* 6048x2880 52.0 GFLOPS with 2 threads on 2 modules (ACML: 50.0 GFLOPS) +* +* 6048x2880 56.3 GFLOPS with 4 threads on 2 modules (ACML: 57.6 GFLOPS) +* 4032x1920 29.5 GFLOPS with 2 threads on 1 modules (ACML: 30.5 GFLOPS) +* 4032x1920 26.9 GFLOPS with 1 threads on 1 modules (ACML: 26.1 GFLOPS) +* +*********************************************************************/ + +/********************************************************************* +* 2013/06/04 Saar +* +* Parameter: +* UNROLL_M 8 +* UNROLL_N 2 +* DGEMM_P 384 +* DGEMM_Q 168 +* NO_WARMUP 1 +* NO_AFFINITY 1 +* GEMM_MULTITHREAD_THRESHOLD 4 +* +* Performance at m x n with prefetch of BO: +* +* 6144x5376 94.6 GFLOPS with 8 threads on 4 modules (ACML: 90.5 GFLOPS) +* 6144x5376 86.0 GFLOPS with 
4 threads on 4 modules (ACML: 81.5 GFLOPS) +* 4608x4032 52.0 GFLOPS with 2 threads on 2 modules (ACML: 47.5 GFLOPS) +* +* 6144x5376 57.3 GFLOPS with 4 threads on 2 modules (ACML: 56.5 GFLOPS) +* 4608x4032 29.6 GFLOPS with 2 threads on 1 modules (ACML: 30.2 GFLOPS) +* 4608x4032 26.9 GFLOPS with 1 threads on 1 modules (ACML: 25.6 GFLOPS) +* +*********************************************************************/ + + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 +#define LB2_OFFSET 4096 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + + +#define A_PR1 384 +#define B_PR1 192 + +#define KERNEL8x3_1(xx) \ + 
prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL8x3_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL8x3_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 2 * SIZE(AO, 
%rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL8x3_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ + addq $12, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -10 * SIZE(AO, %rax, 8), 
%xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x3_1(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_2(xx) \ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_4(xx) \ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd 
%xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + addq $12, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + + + + + +/*******************************************************************************************/ + +#define KERNEL2x3_1(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_2(xx) \ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_4(xx) \ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $8, %rax 
;\ + +#define KERNEL2x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x3_1(xx) \ + vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_2(xx) \ + vmovsd -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_4(xx) \ + vmovsd 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x3_SUB(xx) \ + vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + + + 
+/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define KERNEL8x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 6 * SIZE(AO, 
%rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x2_1(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_2(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, 
BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_4(xx) \ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + + +/*******************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_2(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * 
SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_4(xx) \ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_2(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_4(xx) \ + vmovsd 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x2_SUB(xx) \ + vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + + + +/******************************************************************************************* +* 1 line of N 
+*******************************************************************************************/ + +#define KERNEL8x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + addq $4, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_2(xx) \ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_4(xx) \ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + addq $4, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + + +/*******************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_2(xx) \ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define 
KERNEL2x1_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_4(xx) \ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_2(xx) \ + vmovsd -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_4(xx) \ + vmovsd 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x1_SUB(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + 
movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 + movq B, BO1 + leaq (B,%rax,8), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L6_02a + ALIGN_4 + +.L6_02: + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm2 + vmovups 4*SIZE(BO1), %xmm4 + vmovups 6*SIZE(BO1), %xmm6 + vmovsd (BO2), %xmm1 + vmovsd 2*SIZE(BO2), %xmm3 + vmovsd 4*SIZE(BO2), %xmm5 + vmovsd 6*SIZE(BO2), %xmm7 + vmovups %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + decq %rax + jnz .L6_02 + +.L6_02a: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L6_02c + ALIGN_4 + +.L6_02b: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax,8), BO1 // next offset to BO1 + leaq (BO1,%rax,8), BO2 // next offset to BO1 + leaq BUFFER2, BO // second buffer to 
BO + movq K, %rax + sarq $2, %rax // k / 4 + jz .L6_03a + ALIGN_4 + + +.L6_03: + + prefetcht0 512(BO2) + prefetchw 512(BO) + vmovups (BO2), %xmm0 + vmovups 2*SIZE(BO2), %xmm2 + vmovups 4*SIZE(BO2), %xmm4 + vmovups 6*SIZE(BO2), %xmm6 + vmovsd 1*SIZE(BO1), %xmm1 + vmovsd 3*SIZE(BO1), %xmm3 + vmovsd 5*SIZE(BO1), %xmm5 + vmovsd 7*SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + decq %rax + jnz .L6_03 + +.L6_03a: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L6_03c + ALIGN_4 + + +.L6_03b: + + vmovsd 1*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + 
KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_17: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L6_17 + ALIGN_4 + + +.L6_19: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) + vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $7, M + jz .L7_10 // to next 3 lines of N + + testq $4, M + jz .L6_30 + + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + 
movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq 
(BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + prefetcht0 
B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L6_47 + ALIGN_4 + + +.L6_49: + + vmovddup ALPHA, %xmm0 + + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L7_20 + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + + andq $-8, %rax + je .L7_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + + ALIGN_4 + +.L7_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1(xxx) + 
KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_17: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L7_17 + ALIGN_4 + + +.L7_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) + vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) + + + + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L7_11 + ALIGN_4 + +.L7_20: + // Test rest of M + + testq $7, M + jz .L7_60 // to next 6 lines of N + + testq $4, M + jz .L7_30 + + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq 
%rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, 
%rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + +.L7_40: + testq $1, M + jz .L7_60 // to next 6 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + 
KERNEL1x3_4(xxx) + + je .L7_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovddup ALPHA, %xmm0 + + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + + + addq $1 * SIZE, CO1 # coffset += 1 + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI 
// BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $7, M 
+ jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number 
of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + 
KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + 
KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + 
movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + 
ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + + vmovsd %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups 
%xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_0: + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; 
number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * 
SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + vmulpd %xmm0, %xmm11,%xmm11 + vmulpd %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $7, M + jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) 
|| \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + 
KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm5,%xmm5 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // 
number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulsd %xmm0, %xmm4,%xmm4 + vmulsd %xmm0, %xmm5,%xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + 
+ + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + 
KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) 
&& defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO 
+ + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,8) + 
KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulsd %xmm0, %xmm4,%xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#endif diff --git a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S index adc00cca3c..48eb1bcbe1 100644 --- a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S +++ b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S @@ -1,4523 +1,4523 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights 
reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - - -/********************************************************************* -* -* 2013/11/13 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/10/31 Saar -* -* Parameter: -* UNROLL_M 8 -* UNROLL_N 2 -* DGEMM_P 768 -* DGEMM_Q 168 -* DGEMM_R 12288 -* A_PR1 512 -* B_PR1 256 -* -* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): -* -* 4608x4608 83.9 GFLOPS with 8 threads on 4 modules (ACML: 78.4 GFLOPS) -* 4608x4608 80.9 GFLOPS with 4 threads on 4 modules (ACML: 78.4 GFLOPS) -* 4608x4608 41.3 GFLOPS with 2 threads on 2 modules (ACML: 40.9 GFLOPS) -* 4608x4608 20.7 GFLOPS with 1 threads on 1 modules (ACML: 20.8 GFLOPS) -* -* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): -* -* 13824x13824 234.5 GFLOPS with 32 threads on 16 modules (ACML: 88.5 GFLOPS) !strange thermal behavior -* 13824x13824 241.9 GFLOPS with 16 threads on 16 modules (ACML: 191.5 GFLOPS) !strange thermal behavior -* 9216x9216 137.6 GFLOPS with 8 threads on 8 modules (ACML: 106.5 GFLOPS) -* 4608x4608 75.7 GFLOPS with 4 threads on 4 modules (ACML: 56.3 GFLOPS) -* 4608x4608 38.6 GFLOPS with 2 threads on 2 modules (ACML: 34.1 GFLOPS) -* 4608x4608 19.6 GFLOPS with 1 threads on 1 modules (ACML: 18.3 GFLOPS) -* -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - 
-#endif - -#define L_BUFFER_SIZE 8192 -#define LB2_OFFSET 4096 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#if defined(BULLDOZER) - -#define VFMADD231PD_( y1,y2,y0 ) vfmaddpd y0,y1,y2,y0 - -#define VFMADD231SD_( x1,x2,x0 ) vfmaddsd x0,x1,x2,x0 - -#else - -#define VFMADD231PD_( y1,y2,y0 ) vfmadd231pd y2,y1,y0 - -#define VFMADD231SD_( x1,x2,x0 ) vfmadd231sd x2,x1,x0 - -#endif - - - - -#define A_PR1 512 -#define B_PR1 256 -#define C_PR1 64 - -.macro INIT8x3 - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, %xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 -.endm - -.macro KERNEL8x3_INIT - vmovddup -12 * SIZE(BO), %xmm1 - vmovups -16 * SIZE(AO), %xmm0 - prefetcht0 A_PR1(AO) - vmulpd %xmm1,%xmm0,%xmm4 - vmovddup -11 * SIZE(BO), %xmm2 - vmulpd %xmm2,%xmm0,%xmm5 - vmovddup -10 * SIZE(BO), %xmm3 - vmulpd %xmm3,%xmm0,%xmm6 - vmovups -14 * SIZE(AO), %xmm0 - vmulpd %xmm1,%xmm0,%xmm7 - vmulpd %xmm2,%xmm0,%xmm8 - vmulpd %xmm3,%xmm0,%xmm9 - vmovups -12 * 
SIZE(AO), %xmm0 - vmulpd %xmm1,%xmm0,%xmm10 - vmulpd %xmm2,%xmm0,%xmm11 - addq $ 3 * SIZE, BO - vmulpd %xmm3,%xmm0,%xmm12 - vmovups -10 * SIZE(AO), %xmm0 - vmulpd %xmm1,%xmm0,%xmm13 - vmovddup -12 * SIZE(BO), %xmm1 - vmulpd %xmm2,%xmm0,%xmm14 - vmovddup -11 * SIZE(BO), %xmm2 - vmulpd %xmm3,%xmm0,%xmm15 -.endm - - -.macro KERNEL8x3_M1 - vmovups -16 * SIZE(AO), %xmm0 - prefetcht0 A_PR1(AO) - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups -14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups -12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups -10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup -12 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup -11 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_M2 - vmovups -8 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+64(AO) - vmovddup -10 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups -6 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups -4 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups -2 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup -9 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup -8 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - - -.macro KERNEL8x3_M3 - vmovups 0 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+128(AO) - vmovddup -7 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups 2 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 
) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups 4 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups 6 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup -6 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup -5 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_M4 - vmovups 8 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+192(AO) - vmovddup -4 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups 10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups 12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups 14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup -3 * SIZE(BO), %xmm1 - addq $ 32 * SIZE, AO - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup -2 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_M5 - vmovups -16 * SIZE(AO), %xmm0 - prefetcht0 A_PR1(AO) - vmovddup -1 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups -14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups -12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups -10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup 0 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup 1 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_M6 - vmovups -8 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+64(AO) - vmovddup 2 * SIZE(BO), %xmm3 - 
VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups -6 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups -4 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups -2 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup 3 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup 4 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - - -.macro KERNEL8x3_M7 - vmovups 0 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+128(AO) - vmovddup 5 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups 2 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups 4 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups 6 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup 6 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup 7 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_M8 - vmovups 8 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+192(AO) - vmovddup 8 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups 10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups 12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups 14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup 9 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup 10 * SIZE(BO), %xmm2 - VFMADD231PD_( 
%xmm3,%xmm0,%xmm15 ) - vmovddup 11 * SIZE(BO), %xmm3 - addq $ 32 * SIZE, AO - addq $ 24 * SIZE, BO -.endm - - -.macro KERNEL8x3_E - vmovups 8 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+192(AO) - vmovddup 8 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups 10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups 12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups 14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - addq $ 32 * SIZE, AO - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - addq $ 21 * SIZE, BO - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_SUBN - vmovddup -12 * SIZE(BO), %xmm1 - vmovups -16 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - vmovddup -11 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - vmovddup -10 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups -14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups -12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups -10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - addq $ 3 * SIZE, BO - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - addq $ 8 * SIZE, AO - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro SAVE8x3 - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, 
%xmm14,%xmm14 - - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) - vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) - - prefetcht0 C_PR1(CO1) - prefetcht0 C_PR1(CO1,LDC) - prefetcht0 C_PR1(CO1,LDC,2) - - addq $ 8 * SIZE, CO1 # coffset += 8 -.endm - - -/*******************************************************************************************/ - -#define KERNEL4x3_1(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_2(xx) \ - vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - 
vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_4(xx) \ - vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - addq $12, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x3_SUB(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - - - - - -/*******************************************************************************************/ - -#define KERNEL2x3_1(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_2(xx) \ - vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 
8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_4(xx) \ - vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x3_SUB(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x3_1(xx) \ - vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_2(xx) \ - vmovsd -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_4(xx) \ - vmovsd 3 * SIZE(BO, BI, 8), %xmm1 ;\ - 
vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x3_SUB(xx) \ - vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - - - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -#define KERNEL8x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,8) ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,8) ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd 
%xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,8) ;\ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,8) ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - - 
-/*******************************************************************************************/ - -#define KERNEL4x2_1(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_2(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_4(xx) \ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - - -/*******************************************************************************************/ - 
-#define KERNEL2x2_1(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_2(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_4(xx) \ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_2(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define 
KERNEL1x2_4(xx) \ - vmovsd 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x2_SUB(xx) \ - vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - - - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -#define KERNEL8x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,8) ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,8) ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,8) ;\ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - 
-#define KERNEL8x1_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,8) ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - addq $4, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_2(xx) \ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_4(xx) \ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - addq $4, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, 
BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - - -/*******************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_2(xx) \ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_4(xx) \ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_2(xx) \ - vmovsd -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_4(xx) \ - vmovsd 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x1_SUB(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - - 
-/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -.L6_01: - // copy to sub buffer - movq K, %rax - salq $1,%rax // K * 2 - movq B, BO1 - leaq (B,%rax,8), BO2 // next offset to BO2 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L6_02a - ALIGN_4 - -.L6_02: - prefetcht0 B_PR1(BO1) - prefetcht0 B_PR1(BO2) - prefetchw B_PR1(BO) - vmovups (BO1), %xmm0 - vmovups 2*SIZE(BO1), %xmm2 - vmovups 4*SIZE(BO1), %xmm4 - vmovups 6*SIZE(BO1), %xmm6 - vmovsd (BO2), %xmm1 - vmovsd 2*SIZE(BO2), %xmm3 - vmovsd 4*SIZE(BO2), %xmm5 - vmovsd 6*SIZE(BO2), %xmm7 - vmovups %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovups %xmm2, 3*SIZE(BO) - vmovsd %xmm3, 5*SIZE(BO) - vmovups %xmm4, 
6*SIZE(BO) - vmovsd %xmm5, 8*SIZE(BO) - vmovups %xmm6, 9*SIZE(BO) - vmovsd %xmm7,11*SIZE(BO) - addq $ 8*SIZE,BO1 - addq $ 8*SIZE,BO2 - addq $ 12*SIZE,BO - decq %rax - jnz .L6_02 - -.L6_02a: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L6_02c - ALIGN_4 - -.L6_02b: - - vmovups (BO1), %xmm0 - vmovsd (BO2), %xmm1 - vmovups %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO2 - addq $ 3*SIZE,BO - decq %rax - jnz .L6_02b - -.L6_02c: - - movq K, %rax - salq $1,%rax // K * 2 - leaq (B,%rax,8), BO1 // next offset to BO1 - leaq (BO1,%rax,8), BO2 // next offset to BO1 - leaq BUFFER2, BO // second buffer to BO - movq K, %rax - sarq $2, %rax // k / 4 - jz .L6_03a - ALIGN_4 - - -.L6_03: - - prefetcht0 B_PR1(BO2) - prefetchw B_PR1(BO) - vmovups (BO2), %xmm0 - vmovups 2*SIZE(BO2), %xmm2 - vmovups 4*SIZE(BO2), %xmm4 - vmovups 6*SIZE(BO2), %xmm6 - vmovsd 1*SIZE(BO1), %xmm1 - vmovsd 3*SIZE(BO1), %xmm3 - vmovsd 5*SIZE(BO1), %xmm5 - vmovsd 7*SIZE(BO1), %xmm7 - vmovsd %xmm1, 0*SIZE(BO) - vmovups %xmm0, 1*SIZE(BO) - vmovsd %xmm3, 3*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovsd %xmm5, 6*SIZE(BO) - vmovups %xmm4, 7*SIZE(BO) - vmovsd %xmm7, 9*SIZE(BO) - vmovups %xmm6,10*SIZE(BO) - addq $ 8*SIZE,BO1 - addq $ 8*SIZE,BO2 - addq $ 12*SIZE,BO - decq %rax - jnz .L6_03 - -.L6_03a: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L6_03c - ALIGN_4 - - -.L6_03b: - - vmovsd 1*SIZE(BO1), %xmm0 - vmovups (BO2), %xmm1 - vmovsd %xmm0, (BO) - vmovups %xmm1, 1*SIZE(BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO2 - addq $ 3*SIZE,BO - decq %rax - jnz .L6_03b - - -.L6_03c: - - movq BO2, B // next offset of B - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L6_20 - - ALIGN_4 - -.L6_11: - - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - movq K, %rax - sarq $3, %rax // K / 8 - cmpq $3, %rax - jl .L6_13 - - prefetcht0 B_PR1(BO) - 
prefetcht0 B_PR1+64(BO) - prefetcht0 B_PR1+128(BO) - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - subq $2, %rax - - ALIGN_5 - -.L6_12: - - prefetcht0 B_PR1-24(BO) - prefetcht0 B_PR1+40(BO) - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - prefetcht0 B_PR1+104(BO) - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - dec %rax - jne .L6_12 - -.L6_12_E: - - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - jmp .L6_16 - -.L6_13: - - test $2, %rax - jz .L6_14 - - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - jmp .L6_16 - - -.L6_14: - - test $1, %rax - jz .L6_15 - - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - - jmp .L6_16 - -.L6_15: - - INIT8x3 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - ALIGN_4 - -.L6_17: - - KERNEL8x3_SUBN - dec %rax - jne .L6_17 - ALIGN_4 - - -.L6_19: - - SAVE8x3 - - decq I # i -- - jg .L6_11 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $7, M - jz .L7_10 // to next 3 lines of N - - testq $4, M - jz .L6_30 - - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - 
-.L6_22: - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L6_27 - ALIGN_4 - - -.L6_29: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_32: - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - 
KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L6_37 - ALIGN_4 - - -.L6_39: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L7_10 // to next 3 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L6_47 - ALIGN_4 - - -.L6_49: - - vmovddup ALPHA, %xmm0 - - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - 
vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - -/***************************************************************************************************************/ - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L7_20 - ALIGN_4 - -.L7_11: - - leaq BUFFER2, BO // first buffer to BO - addq $12 * SIZE, BO - movq K, %rax - sarq $3, %rax // K / 8 - cmpq $3, %rax - jl .L7_13 - - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - prefetcht0 B_PR1+128(BO) - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - subq $2, %rax - - ALIGN_5 - -.L7_12: - - prefetcht0 B_PR1-24(BO) - prefetcht0 B_PR1+40(BO) - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - prefetcht0 B_PR1+104(BO) - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - dec %rax - jne .L7_12 - -.L7_12_E: - - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - jmp .L7_16 - - - -.L7_13: - - test $2, %rax - jz .L7_14 - - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - jmp .L7_16 - - -.L7_14: - - test $1, %rax - jz .L7_15 - - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - jmp .L7_16 - - - -.L7_15: - - INIT8x3 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - - ALIGN_4 - -.L7_17: - - KERNEL8x3_SUBN - dec %rax - jne .L7_17 - ALIGN_4 - - -.L7_19: - - SAVE8x3 
- - decq I # i -- - jg .L7_11 - ALIGN_4 - -.L7_20: - // Test rest of M - - testq $7, M - jz .L7_60 // to next 6 lines of N - - testq $4, M - jz .L7_30 - - ALIGN_4 - -.L7_21: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_22: - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L7_27 - ALIGN_4 - - -.L7_29: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - 
andq $-8, %rax - je .L7_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_32: - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L7_37 - ALIGN_4 - - -.L7_39: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - - -.L7_40: - testq $1, M - jz .L7_60 // to next 6 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - - andq $-8, %rax - je .L7_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - 
KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L7_47 - ALIGN_4 - - -.L7_49: - - vmovddup ALPHA, %xmm0 - - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - - - addq $1 * SIZE, CO1 # coffset += 1 - -.L7_60: - - decq J // j -- - jg .L6_01 - - -.L2_0: - cmpq $0, Nmod6 // N % 6 == 0 - je .L999 - -/************************************************************************************************ -* Loop for Nmod6 / 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - sarq $1, J // j = j / 2 - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L2_20 - - ALIGN_4 - -.L2_11: - - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - 
KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $7, M - jz .L2_60 // to next 2 lines of N - - testq $4, M - jz .L2_30 - - ALIGN_4 - -.L2_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - 
leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - 
ALIGN_4 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA, %xmm0 - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L2_60: - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - 
-/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L1_20 - - ALIGN_4 - -.L1_11: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vmovups 
%xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $7, M - jz .L999 - - testq $4, M - jz .L1_30 - - ALIGN_4 - -.L1_21: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), 
BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - - vmovups %xmm4 , (CO1) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA, %xmm0 - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - - vmovsd %xmm4 , (CO1) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq 
SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 
- - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - -.L2_0: - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - 
KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - vmulpd %xmm0, %xmm10,%xmm10 - vmulpd %xmm0, %xmm13,%xmm13 - - vmulpd %xmm0, %xmm5,%xmm5 - vmulpd %xmm0, %xmm8,%xmm8 - vmulpd %xmm0, %xmm11,%xmm11 - vmulpd %xmm0, %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if 
defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $7, M - jz .L2_60 // to next 2 lines of N - - testq $4, M - jz .L2_30 - - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, 
%rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - - vmulpd %xmm0, %xmm5,%xmm5 - vmulpd %xmm0, %xmm8,%xmm8 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm5,%xmm5 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, 
CO1 # coffset += 2 - ALIGN_4 - - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - 
-#else - vmulsd %xmm0, %xmm4,%xmm4 - vmulsd %xmm0, %xmm5,%xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - 
- vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - vmulpd %xmm0, %xmm10,%xmm10 - vmulpd %xmm0, %xmm13,%xmm13 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - 
leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $7, M - jz .L999 - - testq $4, M - jz .L1_30 - - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - 
je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - 
ALIGN_4 - -.L1_32: - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq 
KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulsd %xmm0, %xmm4,%xmm4 - -#endif - - vmovsd %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 
208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - -#endif +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +/********************************************************************* +* +* 2013/11/13 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/31 Saar +* +* Parameter: +* UNROLL_M 8 +* UNROLL_N 2 +* DGEMM_P 768 +* DGEMM_Q 168 +* DGEMM_R 12288 +* A_PR1 512 +* B_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 4608x4608 83.9 GFLOPS with 8 threads on 4 modules (ACML: 78.4 GFLOPS) +* 4608x4608 80.9 GFLOPS with 4 threads on 4 modules (ACML: 78.4 GFLOPS) +* 4608x4608 41.3 GFLOPS with 2 threads on 2 modules (ACML: 40.9 GFLOPS) +* 4608x4608 20.7 GFLOPS with 1 threads on 1 modules (ACML: 20.8 GFLOPS) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 13824x13824 234.5 GFLOPS with 32 threads on 16 modules (ACML: 88.5 GFLOPS) !strange thermal behavior +* 13824x13824 241.9 GFLOPS with 16 threads on 16 modules (ACML: 191.5 GFLOPS) !strange thermal behavior +* 9216x9216 137.6 GFLOPS with 8 threads on 8 modules (ACML: 106.5 GFLOPS) +* 4608x4608 75.7 GFLOPS with 4 threads on 4 modules (ACML: 56.3 GFLOPS) +* 4608x4608 38.6 GFLOPS with 2 threads on 2 modules (ACML: 34.1 GFLOPS) +* 4608x4608 19.6 GFLOPS with 1 threads on 1 modules (ACML: 18.3 GFLOPS) +* +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + 
+#endif + +#define L_BUFFER_SIZE 8192 +#define LB2_OFFSET 4096 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +#define VFMADD231PD_( y1,y2,y0 ) vfmaddpd y0,y1,y2,y0 + +#define VFMADD231SD_( x1,x2,x0 ) vfmaddsd x0,x1,x2,x0 + +#else + +#define VFMADD231PD_( y1,y2,y0 ) vfmadd231pd y2,y1,y0 + +#define VFMADD231SD_( x1,x2,x0 ) vfmadd231sd x2,x1,x0 + +#endif + + + + +#define A_PR1 512 +#define B_PR1 256 +#define C_PR1 64 + +.macro INIT8x3 + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 +.endm + +.macro KERNEL8x3_INIT + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + prefetcht0 A_PR1(AO) + vmulpd %xmm1,%xmm0,%xmm4 + vmovddup -11 * SIZE(BO), %xmm2 + vmulpd %xmm2,%xmm0,%xmm5 + vmovddup -10 * SIZE(BO), %xmm3 + vmulpd %xmm3,%xmm0,%xmm6 + vmovups -14 * SIZE(AO), %xmm0 + vmulpd %xmm1,%xmm0,%xmm7 + vmulpd %xmm2,%xmm0,%xmm8 + vmulpd %xmm3,%xmm0,%xmm9 + vmovups -12 * 
SIZE(AO), %xmm0 + vmulpd %xmm1,%xmm0,%xmm10 + vmulpd %xmm2,%xmm0,%xmm11 + addq $ 3 * SIZE, BO + vmulpd %xmm3,%xmm0,%xmm12 + vmovups -10 * SIZE(AO), %xmm0 + vmulpd %xmm1,%xmm0,%xmm13 + vmovddup -12 * SIZE(BO), %xmm1 + vmulpd %xmm2,%xmm0,%xmm14 + vmovddup -11 * SIZE(BO), %xmm2 + vmulpd %xmm3,%xmm0,%xmm15 +.endm + + +.macro KERNEL8x3_M1 + vmovups -16 * SIZE(AO), %xmm0 + prefetcht0 A_PR1(AO) + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups -14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups -12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups -10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup -12 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup -11 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_M2 + vmovups -8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+64(AO) + vmovddup -10 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups -6 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups -4 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups -2 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup -9 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup -8 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + + +.macro KERNEL8x3_M3 + vmovups 0 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+128(AO) + vmovddup -7 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups 2 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 
) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups 4 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups 6 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup -6 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup -5 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_M4 + vmovups 8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+192(AO) + vmovddup -4 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups 10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups 12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups 14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup -3 * SIZE(BO), %xmm1 + addq $ 32 * SIZE, AO + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup -2 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_M5 + vmovups -16 * SIZE(AO), %xmm0 + prefetcht0 A_PR1(AO) + vmovddup -1 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups -14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups -12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups -10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup 0 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup 1 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_M6 + vmovups -8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+64(AO) + vmovddup 2 * SIZE(BO), %xmm3 + 
VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups -6 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups -4 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups -2 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup 3 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup 4 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + + +.macro KERNEL8x3_M7 + vmovups 0 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+128(AO) + vmovddup 5 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups 2 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups 4 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups 6 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup 6 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup 7 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_M8 + vmovups 8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+192(AO) + vmovddup 8 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups 10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups 12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups 14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup 9 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup 10 * SIZE(BO), %xmm2 + VFMADD231PD_( 
%xmm3,%xmm0,%xmm15 ) + vmovddup 11 * SIZE(BO), %xmm3 + addq $ 32 * SIZE, AO + addq $ 24 * SIZE, BO +.endm + + +.macro KERNEL8x3_E + vmovups 8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+192(AO) + vmovddup 8 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups 10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups 12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups 14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + addq $ 32 * SIZE, AO + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + addq $ 21 * SIZE, BO + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_SUBN + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + vmovddup -11 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + vmovddup -10 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups -14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups -12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups -10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + addq $ 3 * SIZE, BO + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + addq $ 8 * SIZE, AO + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro SAVE8x3 + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, 
%xmm14,%xmm14 + + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) + vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) + + prefetcht0 C_PR1(CO1) + prefetcht0 C_PR1(CO1,LDC) + prefetcht0 C_PR1(CO1,LDC,2) + + addq $ 8 * SIZE, CO1 # coffset += 8 +.endm + + +/*******************************************************************************************/ + +#define KERNEL4x3_1(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_2(xx) \ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + 
vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_4(xx) \ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + addq $12, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + + + + + +/*******************************************************************************************/ + +#define KERNEL2x3_1(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_2(xx) \ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 
8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_4(xx) \ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x3_1(xx) \ + vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_2(xx) \ + vmovsd -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_4(xx) \ + vmovsd 3 * SIZE(BO, BI, 8), %xmm1 ;\ + 
vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x3_SUB(xx) \ + vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + + + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define KERNEL8x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd 
%xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + + 
+/*******************************************************************************************/ + +#define KERNEL4x2_1(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_2(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_4(xx) \ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + + +/*******************************************************************************************/ + 
+#define KERNEL2x2_1(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_2(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_4(xx) \ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_2(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define 
KERNEL1x2_4(xx) \ + vmovsd 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x2_SUB(xx) \ + vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + + + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define KERNEL8x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + 
+#define KERNEL8x1_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + addq $4, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_2(xx) \ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_4(xx) \ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + addq $4, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, 
BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + + +/*******************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_2(xx) \ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_4(xx) \ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_2(xx) \ + vmovsd -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_4(xx) \ + vmovsd 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x1_SUB(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + + 
+/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 + movq B, BO1 + leaq (B,%rax,8), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L6_02a + ALIGN_4 + +.L6_02: + prefetcht0 B_PR1(BO1) + prefetcht0 B_PR1(BO2) + prefetchw B_PR1(BO) + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm2 + vmovups 4*SIZE(BO1), %xmm4 + vmovups 6*SIZE(BO1), %xmm6 + vmovsd (BO2), %xmm1 + vmovsd 2*SIZE(BO2), %xmm3 + vmovsd 4*SIZE(BO2), %xmm5 + vmovsd 6*SIZE(BO2), %xmm7 + vmovups %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 
6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + decq %rax + jnz .L6_02 + +.L6_02a: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L6_02c + ALIGN_4 + +.L6_02b: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO2 + addq $ 3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax,8), BO1 // next offset to BO1 + leaq (BO1,%rax,8), BO2 // next offset to BO1 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $2, %rax // k / 4 + jz .L6_03a + ALIGN_4 + + +.L6_03: + + prefetcht0 B_PR1(BO2) + prefetchw B_PR1(BO) + vmovups (BO2), %xmm0 + vmovups 2*SIZE(BO2), %xmm2 + vmovups 4*SIZE(BO2), %xmm4 + vmovups 6*SIZE(BO2), %xmm6 + vmovsd 1*SIZE(BO1), %xmm1 + vmovsd 3*SIZE(BO1), %xmm3 + vmovsd 5*SIZE(BO1), %xmm5 + vmovsd 7*SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + decq %rax + jnz .L6_03 + +.L6_03a: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L6_03c + ALIGN_4 + + +.L6_03b: + + vmovsd 1*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 1*SIZE(BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO2 + addq $ 3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L6_20 + + ALIGN_4 + +.L6_11: + + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + movq K, %rax + sarq $3, %rax // K / 8 + cmpq $3, %rax + jl .L6_13 + + prefetcht0 B_PR1(BO) + 
prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + subq $2, %rax + + ALIGN_5 + +.L6_12: + + prefetcht0 B_PR1-24(BO) + prefetcht0 B_PR1+40(BO) + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + prefetcht0 B_PR1+104(BO) + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + dec %rax + jne .L6_12 + +.L6_12_E: + + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L6_16 + +.L6_13: + + test $2, %rax + jz .L6_14 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L6_16 + + +.L6_14: + + test $1, %rax + jz .L6_15 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + + jmp .L6_16 + +.L6_15: + + INIT8x3 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL8x3_SUBN + dec %rax + jne .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE8x3 + + decq I # i -- + jg .L6_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $7, M + jz .L7_10 // to next 3 lines of N + + testq $4, M + jz .L6_30 + + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + 
+.L6_22: + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + 
KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L6_47 + ALIGN_4 + + +.L6_49: + + vmovddup ALPHA, %xmm0 + + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + 
vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L7_20 + ALIGN_4 + +.L7_11: + + leaq BUFFER2, BO // first buffer to BO + addq $12 * SIZE, BO + movq K, %rax + sarq $3, %rax // K / 8 + cmpq $3, %rax + jl .L7_13 + + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + subq $2, %rax + + ALIGN_5 + +.L7_12: + + prefetcht0 B_PR1-24(BO) + prefetcht0 B_PR1+40(BO) + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + prefetcht0 B_PR1+104(BO) + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + dec %rax + jne .L7_12 + +.L7_12_E: + + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L7_16 + + + +.L7_13: + + test $2, %rax + jz .L7_14 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L7_16 + + +.L7_14: + + test $1, %rax + jz .L7_15 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L7_16 + + + +.L7_15: + + INIT8x3 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + + ALIGN_4 + +.L7_17: + + KERNEL8x3_SUBN + dec %rax + jne .L7_17 + ALIGN_4 + + +.L7_19: + + SAVE8x3 
+ + decq I # i -- + jg .L7_11 + ALIGN_4 + +.L7_20: + // Test rest of M + + testq $7, M + jz .L7_60 // to next 6 lines of N + + testq $4, M + jz .L7_30 + + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + 
andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + +.L7_40: + testq $1, M + jz .L7_60 // to next 6 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + 
KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovddup ALPHA, %xmm0 + + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + + + addq $1 * SIZE, CO1 # coffset += 1 + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + 
KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $7, M + jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + 
leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + 
ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + 
+/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups 
%xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), 
BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + + vmovsd %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq 
SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 
+ + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_0: + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + 
KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + vmulpd %xmm0, %xmm11,%xmm11 + vmulpd %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if 
defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $7, M + jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, 
%rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm5,%xmm5 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, 
CO1 # coffset += 2 + ALIGN_4 + + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + 
+#else + vmulsd %xmm0, %xmm4,%xmm4 + vmulsd %xmm0, %xmm5,%xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + 
+ vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + 
leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + 
je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + 
ALIGN_4 + +.L1_32: + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq 
KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulsd %xmm0, %xmm4,%xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 
208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#endif diff --git a/kernel/x86_64/dgemm_ncopy_8_skylakex.c b/kernel/x86_64/dgemm_ncopy_8_skylakex.c index 74b336f3d7..874ef68d6e 100644 --- a/kernel/x86_64/dgemm_ncopy_8_skylakex.c +++ b/kernel/x86_64/dgemm_ncopy_8_skylakex.c @@ -52,18 +52,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __ FLOAT ctemp05, ctemp06, ctemp07, ctemp08; FLOAT ctemp09, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; - FLOAT ctemp33, ctemp34, ctemp35, ctemp36; - FLOAT ctemp37, ctemp38, ctemp39, ctemp40; - FLOAT ctemp41, ctemp42, ctemp43, ctemp44; - FLOAT ctemp45, ctemp46, ctemp47, ctemp48; - FLOAT ctemp49, ctemp50, ctemp51, ctemp52; - FLOAT ctemp53, ctemp54, ctemp55, ctemp56; - FLOAT ctemp57, ctemp58, ctemp59, ctemp60; - FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + FLOAT ctemp17 /*, ctemp18, ctemp19, ctemp20*/ ; + FLOAT /*ctemp21, ctemp22,*/ ctemp23, ctemp24; + FLOAT ctemp25 /*, ctemp26, ctemp27, ctemp28*/ ; + FLOAT /*ctemp29, ctemp30,*/ ctemp31, ctemp32; + FLOAT ctemp33 /*, ctemp34, ctemp35, ctemp36*/ ; + FLOAT /*ctemp37, ctemp38,*/ ctemp39, ctemp40; + FLOAT ctemp41 /*, ctemp42, ctemp43, ctemp44*/ ; + FLOAT /*ctemp45, ctemp46,*/ ctemp47, ctemp48; + FLOAT ctemp49 /*, ctemp50, ctemp51, ctemp52*/ ; + FLOAT /*ctemp53, ctemp54,*/ ctemp55, ctemp56; + FLOAT ctemp57 /*, ctemp58, ctemp59, ctemp60*/ ; + FLOAT /*ctemp61, ctemp62,*/ ctemp63, ctemp64; aoffset = a; diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c index 77ed59a4e3..4551f38a28 100644 --- a/kernel/x86_64/dscal_microk_haswell-2.c +++ b/kernel/x86_64/dscal_microk_haswell-2.c @@ -38,22 +38,18 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __asm__ __volatile__ ( - "vmovddup 
(%2), %%xmm0 \n\t" // alpha + "vbroadcastsd (%2), %%ymm0 \n\t" // alpha "addq $128, %1 \n\t" "cmpq $0, %0 \n\t" "je 4f \n\t" - "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" - "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" - "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" - "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" + "vmulpd -128(%1), %%ymm0, %%ymm4 \n\t" + "vmulpd -96(%1), %%ymm0, %%ymm5 \n\t" - "vmulpd -64(%1), %%xmm0, %%xmm8 \n\t" - "vmulpd -48(%1), %%xmm0, %%xmm9 \n\t" - "vmulpd -32(%1), %%xmm0, %%xmm10 \n\t" - "vmulpd -16(%1), %%xmm0, %%xmm11 \n\t" + "vmulpd -64(%1), %%ymm0, %%ymm6 \n\t" + "vmulpd -32(%1), %%ymm0, %%ymm7 \n\t" "subq $1 , %0 \n\t" "jz 2f \n\t" @@ -62,26 +58,18 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "1: \n\t" // "prefetcht0 640(%1) \n\t" - "vmovups %%xmm4 ,-128(%1) \n\t" - "vmovups %%xmm5 ,-112(%1) \n\t" - "vmulpd 0(%1), %%xmm0, %%xmm4 \n\t" - "vmovups %%xmm6 , -96(%1) \n\t" - "vmulpd 16(%1), %%xmm0, %%xmm5 \n\t" - "vmovups %%xmm7 , -80(%1) \n\t" - "vmulpd 32(%1), %%xmm0, %%xmm6 \n\t" + "vmovups %%ymm4 ,-128(%1) \n\t" + "vmovups %%ymm5 , -96(%1) \n\t" + "vmulpd 0(%1), %%ymm0, %%ymm4 \n\t" // "prefetcht0 704(%1) \n\t" - "vmovups %%xmm8 , -64(%1) \n\t" - "vmulpd 48(%1), %%xmm0, %%xmm7 \n\t" - "vmovups %%xmm9 , -48(%1) \n\t" - "vmulpd 64(%1), %%xmm0, %%xmm8 \n\t" - "vmovups %%xmm10 , -32(%1) \n\t" - "vmulpd 80(%1), %%xmm0, %%xmm9 \n\t" - "vmovups %%xmm11 , -16(%1) \n\t" + "vmovups %%ymm6 , -64(%1) \n\t" + "vmulpd 32(%1), %%ymm0, %%ymm5 \n\t" + "vmovups %%ymm7 , -32(%1) \n\t" - "vmulpd 96(%1), %%xmm0, %%xmm10 \n\t" - "vmulpd 112(%1), %%xmm0, %%xmm11 \n\t" + "vmulpd 64(%1), %%ymm0, %%ymm6 \n\t" + "vmulpd 96(%1), %%ymm0, %%ymm7 \n\t" "addq $128, %1 \n\t" @@ -90,15 +78,11 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "2: \n\t" - "vmovups %%xmm4 ,-128(%1) \n\t" - "vmovups %%xmm5 ,-112(%1) \n\t" - "vmovups %%xmm6 , -96(%1) \n\t" - "vmovups %%xmm7 , -80(%1) \n\t" + "vmovups %%ymm4 ,-128(%1) \n\t" + "vmovups %%ymm5 , -96(%1) \n\t" - 
"vmovups %%xmm8 , -64(%1) \n\t" - "vmovups %%xmm9 , -48(%1) \n\t" - "vmovups %%xmm10 , -32(%1) \n\t" - "vmovups %%xmm11 , -16(%1) \n\t" + "vmovups %%ymm6 , -64(%1) \n\t" + "vmovups %%ymm7 , -32(%1) \n\t" "addq $128, %1 \n\t" @@ -107,15 +91,11 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "cmpq $8 ,%3 \n\t" "jne 5f \n\t" - "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" - "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" - "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" - "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" + "vmulpd -128(%1), %%ymm0, %%ymm4 \n\t" + "vmulpd -96(%1), %%ymm0, %%ymm5 \n\t" - "vmovups %%xmm4 ,-128(%1) \n\t" - "vmovups %%xmm5 ,-112(%1) \n\t" - "vmovups %%xmm6 , -96(%1) \n\t" - "vmovups %%xmm7 , -80(%1) \n\t" + "vmovups %%ymm4 ,-128(%1) \n\t" + "vmovups %%ymm5 , -96(%1) \n\t" "5: \n\t" @@ -149,7 +129,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __asm__ __volatile__ ( - "vxorpd %%xmm0, %%xmm0 , %%xmm0 \n\t" + "vxorpd %%ymm0, %%ymm0 , %%ymm0 \n\t" "addq $128, %1 \n\t" @@ -159,15 +139,11 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) ".p2align 4 \n\t" "1: \n\t" - "vmovups %%xmm0 ,-128(%1) \n\t" - "vmovups %%xmm0 ,-112(%1) \n\t" - "vmovups %%xmm0 , -96(%1) \n\t" - "vmovups %%xmm0 , -80(%1) \n\t" + "vmovups %%ymm0 , -128(%1) \n\t" + "vmovups %%ymm0 , -96(%1) \n\t" - "vmovups %%xmm0 , -64(%1) \n\t" - "vmovups %%xmm0 , -48(%1) \n\t" - "vmovups %%xmm0 , -32(%1) \n\t" - "vmovups %%xmm0 , -16(%1) \n\t" + "vmovups %%ymm0 , -64(%1) \n\t" + "vmovups %%ymm0 , -32(%1) \n\t" "addq $128, %1 \n\t" "subq $1 , %0 \n\t" @@ -178,10 +154,8 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "cmpq $8 ,%3 \n\t" "jne 4f \n\t" - "vmovups %%xmm0 ,-128(%1) \n\t" - "vmovups %%xmm0 ,-112(%1) \n\t" - "vmovups %%xmm0 , -96(%1) \n\t" - "vmovups %%xmm0 , -80(%1) \n\t" + "vmovups %%ymm0 ,-128(%1) \n\t" + "vmovups %%ymm0 , -96(%1) \n\t" "4: \n\t" diff --git a/kernel/x86_64/omatcopy_rt.c b/kernel/x86_64/omatcopy_rt.c index 
e695f00c55..b11893f5d6 100644 --- a/kernel/x86_64/omatcopy_rt.c +++ b/kernel/x86_64/omatcopy_rt.c @@ -142,7 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ,"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ } int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb){ - float *src, *dst, *dst_tmp, *src_base, *dst_base; + float *src, *dst, *dst_tmp=0, *src_base, *dst_base; uint64_t src_ld_bytes = (uint64_t)lda * sizeof(float), dst_ld_bytes = (uint64_t)ldb * sizeof(float), num_rows = 0; BLASLONG cols_left, rows_done; float ALPHA = alpha; if(ALPHA==0.0){ diff --git a/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S b/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S index 9cc27184df..b31a934f23 100644 --- a/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S +++ b/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S @@ -1,5231 +1,5231 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 -#define LB2_OFFSET 4096 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 
4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - - -#define A_PR1 384 -#define B_PR1 192 - -/******************************************************************************************* -* 3 lines of N -*******************************************************************************************/ - -#define KERNEL16x3_1(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_2(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - 
vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_3(xx) \ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_4(xx) \ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - addq $12, 
BI ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $64, %rax ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x3_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_2(xx) \ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, 
BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_4(xx) \ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - addq $12, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x3_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss 
-6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_2(xx) \ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_4(xx) \ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -#define KERNEL2x3_1(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss 
%xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_2(xx) \ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_4(xx) \ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - addq $12, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x3_SUB(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 
* SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x3_1(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_2(xx) \ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_4(xx) \ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x3_SUB(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - 
-/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -#define KERNEL16x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ 
- vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $8, BI ;\ - addq $64, %rax ;\ - -#define KERNEL16x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 
;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_2(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_4(xx) \ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - 
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_2(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_4(xx) \ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_2(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_3(xx) \ - vmovss 0 * SIZE(BO, BI, 
SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_4(xx) \ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x2_SUB(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_2(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_4(xx) \ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x2_SUB(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -#define KERNEL16x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 12 * SIZE(AO, 
%rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - addq $4, BI ;\ - addq $64, %rax ;\ - -#define KERNEL16x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_2(xx) \ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_4(xx) \ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, 
%rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - addq $4, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_2(xx) \ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_4(xx) \ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -/*******************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_2(xx) \ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - 
-#define KERNEL2x1_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_4(xx) \ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x1_SUB(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_2(xx) \ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_4(xx) \ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x1_SUB(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef 
WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -.L6_01: - // copy to sub buffer - movq K, %rax - salq $1,%rax // K * 2 ; read 2 values - movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_01a_2 - ALIGN_4 - -.L6_01a_1: - - prefetcht0 512(BO1) - prefetcht0 512(BO2) - prefetchw 512(BO) - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovsd 2 * SIZE(BO1), %xmm2 - vmovsd 4 * SIZE(BO1), %xmm4 - vmovsd 6 * SIZE(BO1), %xmm6 - vmovss 0 * SIZE(BO2), %xmm1 - vmovss 2 * SIZE(BO2), %xmm3 - vmovss 4 * SIZE(BO2), %xmm5 - vmovss 6 * SIZE(BO2), %xmm7 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 3*SIZE(BO) - vmovss %xmm3, 5*SIZE(BO) - vmovsd %xmm4, 6*SIZE(BO) - vmovss %xmm5, 8*SIZE(BO) - vmovsd %xmm6, 9*SIZE(BO) - vmovss %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovsd 2 * SIZE(BO1), %xmm2 - vmovsd 4 * SIZE(BO1), %xmm4 - vmovsd 6 * SIZE(BO1), %xmm6 - vmovss 
0 * SIZE(BO2), %xmm1 - vmovss 2 * SIZE(BO2), %xmm3 - vmovss 4 * SIZE(BO2), %xmm5 - vmovss 6 * SIZE(BO2), %xmm7 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 3*SIZE(BO) - vmovss %xmm3, 5*SIZE(BO) - vmovsd %xmm4, 6*SIZE(BO) - vmovss %xmm5, 8*SIZE(BO) - vmovsd %xmm6, 9*SIZE(BO) - vmovss %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_01a_1 - - - -.L6_01a_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_02c - ALIGN_4 - - -.L6_02b: - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovss 0 * SIZE(BO2), %xmm2 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm2, 2*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_02b - -.L6_02c: - - movq K, %rax - salq $1,%rax // K * 2 - leaq (B,%rax, SIZE), BO1 // next offset to BO1 - leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER2, BO // second buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_02c_2 - ALIGN_4 - -.L6_02c_1: - - prefetcht0 512(BO2) - prefetchw 512(BO) - - vmovsd 0 * SIZE(BO2), %xmm0 - vmovsd 2 * SIZE(BO2), %xmm2 - vmovsd 4 * SIZE(BO2), %xmm4 - vmovsd 6 * SIZE(BO2), %xmm6 - vmovss 1 * SIZE(BO1), %xmm1 - vmovss 3 * SIZE(BO1), %xmm3 - vmovss 5 * SIZE(BO1), %xmm5 - vmovss 7 * SIZE(BO1), %xmm7 - vmovss %xmm1, 0*SIZE(BO) - vmovsd %xmm0, 1*SIZE(BO) - vmovss %xmm3, 3*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovss %xmm5, 6*SIZE(BO) - vmovsd %xmm4, 7*SIZE(BO) - vmovss %xmm7, 9*SIZE(BO) - vmovsd %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - - vmovsd 0 * SIZE(BO2), %xmm0 - vmovsd 2 * SIZE(BO2), %xmm2 - vmovsd 4 * SIZE(BO2), %xmm4 - vmovsd 6 * SIZE(BO2), %xmm6 - vmovss 1 * SIZE(BO1), %xmm1 - vmovss 3 * SIZE(BO1), %xmm3 - vmovss 5 * SIZE(BO1), %xmm5 - vmovss 7 * SIZE(BO1), %xmm7 - vmovss %xmm1, 0*SIZE(BO) - vmovsd %xmm0, 1*SIZE(BO) - vmovss %xmm3, 3*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovss %xmm5, 6*SIZE(BO) - vmovsd %xmm4, 7*SIZE(BO) - vmovss %xmm7, 9*SIZE(BO) - vmovsd 
%xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_02c_1 - - -.L6_02c_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_03c - ALIGN_4 - -.L6_03b: - - vmovss 1*SIZE(BO1), %xmm0 - vmovsd 0*SIZE(BO2), %xmm1 - vmovss %xmm0, 0*SIZE(BO) - vmovsd %xmm1, 1*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_03b - - -.L6_03c: - - movq BO2, B // next offset of B - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L6_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L6_16 - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L6_16 - - jmp .L6_12 - ALIGN_4 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_17: - - KERNEL16x3_SUB(xxx) - addq $3, BI - addq $16, %rax - jl .L6_17 - ALIGN_4 - - -.L6_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - 
vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) - vmovups %xmm15,12 * SIZE(CO1, LDC, 2) - - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $15, M - jz .L7_10 // to next 3 lines of N - - testq $8, M - jz .L6_21pre - ALIGN_4 - -/**************************************************************************/ - -.L6_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - 
KERNEL8x3_4(xxx) - - je .L6_20_6 - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L6_20_6 - - jmp .L6_20_2 - ALIGN_4 - -.L6_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_20_7: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L6_20_7 - ALIGN_4 - - -.L6_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L6_21pre: - - testq $4, M - jz .L6_30 - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - KERNEL4x3_1(xxx) - 
KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L6_27 - ALIGN_4 - - -.L6_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI,SIZE) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,SIZE) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq 
(AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L6_37 - ALIGN_4 - - -.L6_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L7_10 // to next 3 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L6_47 - ALIGN_4 - - -.L6_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovss %xmm4 , (CO1) - 
vmovss %xmm5 , (CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - -/***************************************************************************************************************/ - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L7_20 - - ALIGN_4 - -.L7_11: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L7_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L7_16 - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L7_16 - - jmp .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_17: - - KERNEL16x3_SUB(xxx) - addq $3, BI - addq $16, %rax - jl .L7_17 - ALIGN_4 - - -.L7_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, 
%xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) - vmovups %xmm15,12 * SIZE(CO1, LDC, 2) - - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L7_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_20: - // Test rest of M - - testq $15, M - jz .L7_60 // to next 3 lines of N - - testq $8, M - jz .L7_21pre - ALIGN_4 - -/**************************************************************************/ - -.L7_20_1: - leaq BUFFER2, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_20_6 - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - 
KERNEL8x3_4(xxx) - - je .L7_20_6 - - jmp .L7_20_2 - ALIGN_4 - -.L7_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_20_7: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L7_20_7 - ALIGN_4 - -.L7_20_9: - - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L7_21pre: - - testq $4, M - jz .L7_30 - ALIGN_4 - -.L7_21: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - 
-.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L7_27 - ALIGN_4 - - -.L7_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI,SIZE) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,SIZE) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L7_37 - ALIGN_4 - - -.L7_39: - - vmovss ALPHA, %xmm0 - - 
vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L7_40: - testq $1, M - jz .L7_60 // to next 3 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L7_47 - ALIGN_4 - - -.L7_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L7_60: - - decq J // j -- - jg .L6_01 - - -.L2_0: - cmpq $0, Nmod6 // N % 6 == 0 - je .L999 - 
-/************************************************************************************************ -* Loop for Nmod6 / 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - sarq $1, J // j = j / 2 - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB(xxx) - addq $2, BI - addq $16, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - 
vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_20_9 - 
- movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - 
addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - leaq BUFFER1, BO // first buffer to 
BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - -.L2_60: - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je 
.L1_20 - - ALIGN_4 - -.L1_11: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB(xxx) - addq $1, BI - addq $16, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je 
.L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - 
jmp .L1_22 - ALIGN_4 - -.L1_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - - vmovups %xmm4 , (CO1) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je 
.L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - - vmovss %xmm4 , (CO1) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 
96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI 
// Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB(xxx) - addq $2, BI - addq $16, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 
* SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - vmulps %xmm0, %xmm10,%xmm10 - vmulps %xmm0, %xmm13,%xmm13 - - vmulps %xmm0, %xmm5,%xmm5 - vmulps %xmm0, %xmm8,%xmm8 - vmulps %xmm0, %xmm11,%xmm11 - vmulps %xmm0, %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax 
= rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - - vmulps %xmm0, %xmm5,%xmm5 - vmulps %xmm0, %xmm8,%xmm8 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - 
vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - 
KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm5,%xmm5 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, 
%rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm8,%xmm8 - vmulss %xmm0, %xmm5,%xmm5 - vmulss %xmm0, %xmm10,%xmm10 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - -#if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq 
KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm5,%xmm5 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 
- - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB(xxx) - addq $1, BI - addq $16, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - 
vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - vmulps %xmm0, %xmm10,%xmm10 - vmulps %xmm0, %xmm13,%xmm13 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - 
movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq 
BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulps %xmm0, %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - 
leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: 
- - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm8,%xmm8 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - 
KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulss %xmm0, %xmm4,%xmm4 - -#endif - - vmovss %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - -#endif +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 +#define LB2_OFFSET 4096 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + + +#define A_PR1 384 +#define B_PR1 192 + +/******************************************************************************************* +* 3 lines of N +*******************************************************************************************/ + +#define KERNEL16x3_1(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + 
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_2(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_3(xx) \ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps 
%xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_4(xx) \ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + addq $12, BI ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $64, %rax ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps 
%xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), 
%xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + addq $12, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ 
+ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, 
SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + addq $12, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + 
vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define KERNEL16x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + 
vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps 
%xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $8, BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps 
%xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_4(xx) \ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_4(xx) \ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + +#define 
KERNEL4x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_4(xx) \ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, 
SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_4(xx) \ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define KERNEL16x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + 
vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + addq $4, BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps 
%xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + addq $4, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +/*******************************************************************************************/ + +#define 
KERNEL1x1_1(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, 
%rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovss 0 * SIZE(BO2), %xmm2 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm2, 2*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 
+ leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd %xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd %xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + +.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovss 1*SIZE(BO1), %xmm0 + vmovsd 0*SIZE(BO2), %xmm1 + vmovss %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; 
number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L6_17 + ALIGN_4 + + +.L6_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) 
+ vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + 
vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 
+ ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, 
%rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L6_47 + ALIGN_4 + + +.L6_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + 
KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L7_17 + ALIGN_4 + + +.L7_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + 
+/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L7_20_7 + ALIGN_4 + +.L7_20_9: + + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + 
vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq 
K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + 
leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + 
vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M 
+***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + 
jz .L2_30 + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + 
KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 
SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax 
# if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + 
negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, 
BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + 
+ vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + + vmovss %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, 
LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + 
+ prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + vmulps %xmm0, %xmm11,%xmm11 + vmulps %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax 
+ movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + 
KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, 
%xmm5,%xmm5 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm5,%xmm5 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je 
.L2_36 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + vmulss %xmm0, %xmm5,%xmm5 + vmulss %xmm0, %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer 
to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of 
values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, 
%rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + 
+/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, 
%rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + 
negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulps %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + 
jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulss %xmm0, %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + 
leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/sgemm_kernel_16x2_piledriver.S b/kernel/x86_64/sgemm_kernel_16x2_piledriver.S index 7c42f1e129..35b01de07b 100644 --- a/kernel/x86_64/sgemm_kernel_16x2_piledriver.S +++ b/kernel/x86_64/sgemm_kernel_16x2_piledriver.S @@ -1,5258 +1,5258 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/********************************************************************* -* -* 2013/10/18 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/10/29 Saar -* -* Parameter: -* UNROLL_M 16 -* UNROLL_N 2 -* SGEMM_P 768 -* SGEMM_Q 192 -* SGEMM_R 12288 -* A_PR1 384 -* B_PR1 192 -* -* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): -* -* 6144x6144 168.2 GFLOPS with 8 threads on 4 modules (ACML: 158.0 ) (BULLDOZER: 167.4 ) -* 6144x6144 162.7 GFLOPS with 4 threads on 4 modules (ACML: 157.6 ) (BULLDOZER: 159.0 ) -* 6144x6144 82.0 GFLOPS with 2 threads on 2 modules (ACML: 81.4 ) (BULLDOZER: 80.3 ) -* 6144x6144 41.3 GFLOPS with 1 threads on 1 modules (ACML: 41.1 ) (BULLDOZER: 40.4 ) -* -* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): -* -* 12288x12288 469.5 GFLOPS with 32 threads on 16 modules (ACML: 375.3 ) (BULLDOZER: 445.5 ) -* 12288x12288 442.9 GFLOPS with 16 threads on 16 modules (ACML: 378.5 ) (BULLDOZER: 416.3 ) -* 12288x12288 265.1 GFLOPS with 8 threads on 8 modules (ACML: 218.5 ) (BULLDOZER: 261.5 ) -* 6144x6144 139.7 GFLOPS with 4 threads on 4 modules (ACML: 116.0 ) (BULLDOZER: 137.7 ) -* 6144x6144 70.9 
GFLOPS with 2 threads on 2 modules (ACML: 67.4 ) (BULLDOZER: 69.5 ) -* 6144x6144 35.6 GFLOPS with 1 threads on 1 modules (ACML: 36.1 ) (BULLDOZER: 35.1 ) -* -*********************************************************************/ - - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 -#define LB2_OFFSET 4096 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - - -#define A_PR1 384 -#define B_PR1 192 - -/******************************************************************************************* -* 3 lines of N -*******************************************************************************************/ - -#define 
KERNEL16x3_1(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_2(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_3(xx) \ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - 
vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_4(xx) \ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - addq $12, BI ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $64, %rax ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps 
%xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x3_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_2(xx) \ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps 
%xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_4(xx) \ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - addq $12, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x3_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_2(xx) \ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_3(xx) \ - 
vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_4(xx) \ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -#define KERNEL2x3_1(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_2(xx) \ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - 
vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_4(xx) \ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - addq $12, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x3_SUB(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x3_1(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_2(xx) 
\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_4(xx) \ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x3_SUB(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -#define KERNEL16x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - 
vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 24 
* SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $8, BI ;\ - addq $64, %rax ;\ - -#define KERNEL16x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_2(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - 
vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_4(xx) \ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_2(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_4(xx) \ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - 
vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_2(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_4(xx) \ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - -#define 
KERNEL2x2_SUB(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_2(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_4(xx) \ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x2_SUB(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -#define KERNEL16x1_1(xx) \ 
- prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - addq $4, BI ;\ - addq $64, %rax ;\ - -#define KERNEL16x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_2(xx) \ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_4(xx) \ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - addq $4, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - 
vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_2(xx) \ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_4(xx) \ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -/*******************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_2(xx) \ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_4(xx) \ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x1_SUB(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -31 * SIZE(AO, 
%rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_2(xx) \ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_4(xx) \ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x1_SUB(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je 
.L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -.L6_01: - // copy to sub buffer - movq K, %rax - salq $1,%rax // K * 2 ; read 2 values - movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_01a_2 - ALIGN_4 - -.L6_01a_1: - - prefetcht0 512(BO1) - prefetcht0 512(BO2) - prefetchw 512(BO) - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovsd 2 * SIZE(BO1), %xmm2 - vmovsd 4 * SIZE(BO1), %xmm4 - vmovsd 6 * SIZE(BO1), %xmm6 - vmovss 0 * SIZE(BO2), %xmm1 - vmovss 2 * SIZE(BO2), %xmm3 - vmovss 4 * SIZE(BO2), %xmm5 - vmovss 6 * SIZE(BO2), %xmm7 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 3*SIZE(BO) - vmovss %xmm3, 5*SIZE(BO) - vmovsd %xmm4, 6*SIZE(BO) - vmovss %xmm5, 8*SIZE(BO) - vmovsd %xmm6, 9*SIZE(BO) - vmovss %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovsd 2 * SIZE(BO1), %xmm2 - vmovsd 4 * SIZE(BO1), %xmm4 - vmovsd 6 * SIZE(BO1), %xmm6 - vmovss 0 * SIZE(BO2), %xmm1 - vmovss 2 * SIZE(BO2), %xmm3 - vmovss 4 * SIZE(BO2), %xmm5 - vmovss 6 * SIZE(BO2), %xmm7 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 3*SIZE(BO) - vmovss %xmm3, 5*SIZE(BO) - vmovsd %xmm4, 6*SIZE(BO) - vmovss %xmm5, 8*SIZE(BO) - vmovsd %xmm6, 9*SIZE(BO) - vmovss %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_01a_1 - - - -.L6_01a_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_02c - ALIGN_4 - - -.L6_02b: - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovss 0 * SIZE(BO2), %xmm2 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm2, 2*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax 
- jnz .L6_02b - -.L6_02c: - - movq K, %rax - salq $1,%rax // K * 2 - leaq (B,%rax, SIZE), BO1 // next offset to BO1 - leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER2, BO // second buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_02c_2 - ALIGN_4 - -.L6_02c_1: - - prefetcht0 512(BO2) - prefetchw 512(BO) - - vmovsd 0 * SIZE(BO2), %xmm0 - vmovsd 2 * SIZE(BO2), %xmm2 - vmovsd 4 * SIZE(BO2), %xmm4 - vmovsd 6 * SIZE(BO2), %xmm6 - vmovss 1 * SIZE(BO1), %xmm1 - vmovss 3 * SIZE(BO1), %xmm3 - vmovss 5 * SIZE(BO1), %xmm5 - vmovss 7 * SIZE(BO1), %xmm7 - vmovss %xmm1, 0*SIZE(BO) - vmovsd %xmm0, 1*SIZE(BO) - vmovss %xmm3, 3*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovss %xmm5, 6*SIZE(BO) - vmovsd %xmm4, 7*SIZE(BO) - vmovss %xmm7, 9*SIZE(BO) - vmovsd %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - - vmovsd 0 * SIZE(BO2), %xmm0 - vmovsd 2 * SIZE(BO2), %xmm2 - vmovsd 4 * SIZE(BO2), %xmm4 - vmovsd 6 * SIZE(BO2), %xmm6 - vmovss 1 * SIZE(BO1), %xmm1 - vmovss 3 * SIZE(BO1), %xmm3 - vmovss 5 * SIZE(BO1), %xmm5 - vmovss 7 * SIZE(BO1), %xmm7 - vmovss %xmm1, 0*SIZE(BO) - vmovsd %xmm0, 1*SIZE(BO) - vmovss %xmm3, 3*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovss %xmm5, 6*SIZE(BO) - vmovsd %xmm4, 7*SIZE(BO) - vmovss %xmm7, 9*SIZE(BO) - vmovsd %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_02c_1 - - -.L6_02c_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_03c - ALIGN_4 - -.L6_03b: - - vmovss 1*SIZE(BO1), %xmm0 - vmovsd 0*SIZE(BO2), %xmm1 - vmovss %xmm0, 0*SIZE(BO) - vmovsd %xmm1, 1*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_03b - - -.L6_03c: - - movq BO2, B // next offset of B - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, BO // first buffer to BO - addq $6 
* SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L6_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L6_16 - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L6_16 - - jmp .L6_12 - ALIGN_4 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_17: - - KERNEL16x3_SUB(xxx) - addq $3, BI - addq $16, %rax - jl .L6_17 - ALIGN_4 - - -.L6_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * 
SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) - vmovups %xmm15,12 * SIZE(CO1, LDC, 2) - - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $15, M - jz .L7_10 // to next 3 lines of N - - testq $8, M - jz .L6_21pre - ALIGN_4 - -/**************************************************************************/ - -.L6_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L6_20_6 - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L6_20_6 - - jmp .L6_20_2 - ALIGN_4 - -.L6_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_20_7: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L6_20_7 - ALIGN_4 - - -.L6_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps 
(CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L6_21pre: - - testq $4, M - jz .L6_30 - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L6_27 - ALIGN_4 - - -.L6_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - 
vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI,SIZE) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,SIZE) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L6_37 - ALIGN_4 - - -.L6_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 
- -.L6_40: - testq $1, M - jz .L7_10 // to next 3 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L6_47 - ALIGN_4 - - -.L6_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - -/***************************************************************************************************************/ - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L7_20 - - ALIGN_4 - -.L7_11: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L7_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; 
number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L7_16 - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L7_16 - - jmp .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_17: - - KERNEL16x3_SUB(xxx) - addq $3, BI - addq $16, %rax - jl .L7_17 - ALIGN_4 - - -.L7_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) - vmovups 
%xmm15,12 * SIZE(CO1, LDC, 2) - - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L7_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_20: - // Test rest of M - - testq $15, M - jz .L7_60 // to next 3 lines of N - - testq $8, M - jz .L7_21pre - ALIGN_4 - -/**************************************************************************/ - -.L7_20_1: - leaq BUFFER2, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_20_6 - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_20_6 - - jmp .L7_20_2 - ALIGN_4 - -.L7_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_20_7: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L7_20_7 - ALIGN_4 - -.L7_20_9: - - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - 
vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L7_21pre: - - testq $4, M - jz .L7_30 - ALIGN_4 - -.L7_21: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L7_27 - ALIGN_4 - - -.L7_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz 
.L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI,SIZE) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,SIZE) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L7_37 - ALIGN_4 - - -.L7_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L7_40: - testq $1, M - jz .L7_60 // to next 3 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_46 - 
movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L7_47 - ALIGN_4 - - -.L7_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L7_60: - - decq J // j -- - jg .L6_01 - - -.L2_0: - cmpq $0, Nmod6 // N % 6 == 0 - je .L999 - -/************************************************************************************************ -* Loop for Nmod6 / 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - sarq $1, J // j = j / 2 - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = 
(m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB(xxx) - addq $2, BI - addq $16, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg 
.L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - 
-/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - 
KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI 
// Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - -.L2_60: - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - 
KERNEL16x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB(xxx) - addq $1, BI - addq $16, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq 
$3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - - vmovups %xmm4 , (CO1) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index 
for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - 
KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - - vmovss %xmm4 , (CO1) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, 
OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; 
number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB(xxx) - addq $2, BI - addq $16, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - vmulps %xmm0, %xmm10,%xmm10 - vmulps %xmm0, %xmm13,%xmm13 - - vmulps %xmm0, %xmm5,%xmm5 - vmulps %xmm0, %xmm8,%xmm8 - vmulps %xmm0, %xmm11,%xmm11 - vmulps %xmm0, %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 
B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - - vmulps %xmm0, %xmm5,%xmm5 - vmulps %xmm0, %xmm8,%xmm8 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if 
!defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - 
vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm5,%xmm5 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - 
KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm8,%xmm8 - vmulss %xmm0, %xmm5,%xmm5 - vmulss %xmm0, %xmm10,%xmm10 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 
- leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm5,%xmm5 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, 
%rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB(xxx) - addq $1, BI - addq $16, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - vmulps %xmm0, %xmm10,%xmm10 - vmulps %xmm0, %xmm13,%xmm13 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && 
defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 
1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - 
salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulps %xmm0, %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of 
values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm8,%xmm8 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && 
defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulss %xmm0, %xmm4,%xmm4 - -#endif - - vmovss %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - -#endif +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/********************************************************************* +* +* 2013/10/18 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/29 Saar +* +* Parameter: +* UNROLL_M 16 +* UNROLL_N 2 +* SGEMM_P 768 +* SGEMM_Q 192 +* SGEMM_R 12288 +* A_PR1 384 +* B_PR1 192 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 6144x6144 168.2 GFLOPS with 8 threads on 4 modules (ACML: 158.0 ) (BULLDOZER: 167.4 ) +* 6144x6144 162.7 GFLOPS with 4 threads on 4 modules (ACML: 157.6 ) (BULLDOZER: 159.0 ) +* 6144x6144 82.0 GFLOPS with 2 threads on 2 modules (ACML: 81.4 ) (BULLDOZER: 80.3 ) +* 6144x6144 41.3 GFLOPS with 1 threads on 1 modules (ACML: 41.1 ) (BULLDOZER: 40.4 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 12288x12288 469.5 GFLOPS with 32 threads on 16 modules (ACML: 375.3 ) (BULLDOZER: 445.5 ) +* 12288x12288 442.9 GFLOPS with 16 threads on 16 modules (ACML: 378.5 ) (BULLDOZER: 416.3 ) +* 12288x12288 265.1 GFLOPS with 8 threads on 8 modules (ACML: 218.5 ) (BULLDOZER: 261.5 ) +* 6144x6144 139.7 GFLOPS with 4 threads on 4 modules (ACML: 116.0 ) (BULLDOZER: 137.7 ) +* 6144x6144 70.9 GFLOPS with 2 threads on 2 modules (ACML: 67.4 ) (BULLDOZER: 69.5 ) +* 6144x6144 35.6 GFLOPS with 1 threads on 1 modules (ACML: 36.1 ) (BULLDOZER: 35.1 ) +* +*********************************************************************/ + + + +#define 
ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 +#define LB2_OFFSET 4096 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + + +#define A_PR1 384 +#define B_PR1 192 + +/******************************************************************************************* +* 3 lines of N +*******************************************************************************************/ + +#define KERNEL16x3_1(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + 
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_2(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_3(xx) \ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 
A_PR1+128(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_4(xx) \ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + addq $12, BI ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $64, %rax ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -20 
* SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + addq $12, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, 
BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * 
SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + addq $12, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), 
%xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define KERNEL16x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 
;\ + +#define KERNEL16x2_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $8, 
BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_4(xx) \ + vbroadcastss 2 * 
SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_4(xx) \ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * 
SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_4(xx) \ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, 
SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_4(xx) \ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define KERNEL16x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + 
vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + addq $4, BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps 
%xmm13,%xmm1,%xmm0,%xmm13 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + addq $4, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, 
SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, 
SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N 
% 6 + + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovss 0 * SIZE(BO2), %xmm2 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm2, 2*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz 
.L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd %xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd %xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + +.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovss 1*SIZE(BO1), %xmm0 + vmovsd 0*SIZE(BO2), %xmm1 + vmovss %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, 
SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L6_17 + ALIGN_4 + + +.L6_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq 
$16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, 
%xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq 
BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq 
(BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L6_47 + ALIGN_4 + + +.L6_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) 
+ prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L7_17 + ALIGN_4 + + +.L7_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M 
+***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L7_20_7 + ALIGN_4 + +.L7_20_9: + + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 
2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // 
BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + KERNEL1x3_1(xxx) + 
KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // 
Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test 
rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + 
vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,SIZE) + 
KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq 
$2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of 
values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl 
.L1_20_7 + ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + 
KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + + vmovss %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # 
coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 6 + movq 
%rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + 
KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + vmulps %xmm0, %xmm11,%xmm11 + vmulps %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; 
number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + 
prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to 
BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm5,%xmm5 + 
+#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) 
+ KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + vmulss %xmm0, %xmm5,%xmm5 + vmulss %xmm0, %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI 
// BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if 
defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq 
$4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M 
+***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + 
KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + 
KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulps %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq 
KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && 
defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulss %xmm0, %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && 
defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S index ef156fd279..76ea12fee7 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_haswell.S +++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S @@ -1,6806 +1,6806 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************************/ - -/********************************************************************* -* 2014/07/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* 2013/10/28 Saar -* Parameter: -* SGEMM_DEFAULT_UNROLL_N 4 -* SGEMM_DEFAULT_UNROLL_M 16 -* SGEMM_DEFAULT_P 768 -* SGEMM_DEFAULT_Q 384 -* A_PR1 512 -* B_PR1 512 -* -* -* 2014/07/28 Saar -* Performance at 9216x9216x9216: -* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) -* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) -* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) -* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) -* -*********************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define BO2 %rbp -#define SP %rbx - -#define BO1 %rdi -#define CO2 %rdx - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#if defined(OS_WINDOWS) -#define L_BUFFER_SIZE 8192 -#else -#define 
L_BUFFER_SIZE 12288 -#endif - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#if defined(BULLDOZER) - -#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 - -#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0 - -#else - -#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0 - -#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0 - -#endif - - -#define A_PR1 512 -#define B_PR1 512 - -/******************************************************************************************* -* 6 lines of N -*******************************************************************************************/ - -.macro KERNEL16x6_SUB - vmovups -16 * SIZE(AO), %ymm0 - vmovups -8 * SIZE(AO), %ymm1 - vbroadcastss -4 * SIZE(BO), %ymm2 - vbroadcastss -3 * SIZE(BO), %ymm3 - prefetcht0 A_PR1(AO) - - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) - - vbroadcastss -2 * SIZE(BO), %ymm2 - vbroadcastss -1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) - - vbroadcastss 0 * SIZE(BO), %ymm2 - vbroadcastss 1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) - VFMADD231PS_( 
%ymm13,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm15,%ymm3,%ymm1 ) - - addq $ 6*SIZE, BO - addq $ 16*SIZE, AO - decq %rax -.endm - -.macro SAVE16x6 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm9 , %ymm9 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm11, %ymm11 - vmulps %ymm0 , %ymm12, %ymm12 - vmulps %ymm0 , %ymm13, %ymm13 - vmulps %ymm0 , %ymm14, %ymm14 - vmulps %ymm0 , %ymm15, %ymm15 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - - vaddps (CO1, LDC,2), %ymm8,%ymm8 - vaddps 8 * SIZE(CO1, LDC,2), %ymm9,%ymm9 - - vaddps (CO2), %ymm10,%ymm10 - vaddps 8 * SIZE(CO2), %ymm11,%ymm11 - - vaddps (CO2, LDC), %ymm12,%ymm12 - vaddps 8 * SIZE(CO2, LDC), %ymm13,%ymm13 - - vaddps (CO2, LDC,2), %ymm14,%ymm14 - vaddps 8 * SIZE(CO2, LDC,2), %ymm15,%ymm15 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - - vmovups %ymm8 , (CO1, LDC,2) - vmovups %ymm9 , 8 * SIZE(CO1, LDC,2) - - vmovups %ymm10, (CO2) - vmovups %ymm11, 8 * SIZE(CO2) - - vmovups %ymm12, (CO2, LDC) - vmovups %ymm13, 8 * SIZE(CO2, LDC) - - vmovups %ymm14, (CO2, LDC,2) - vmovups %ymm15, 8 * SIZE(CO2, LDC,2) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x6_SUB - vmovups -16 * SIZE(AO), %ymm0 - vbroadcastss -4 * SIZE(BO), %ymm2 - vbroadcastss -3 * SIZE(BO), %ymm3 - - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - - vbroadcastss -2 * SIZE(BO), %ymm2 - vbroadcastss -1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - - vbroadcastss 0 * SIZE(BO), %ymm2 - vbroadcastss 1 * 
SIZE(BO), %ymm3 - VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) - - addq $ 6*SIZE, BO - addq $ 8*SIZE, AO - decq %rax -.endm - -.macro SAVE8x6 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm12, %ymm12 - vmulps %ymm0 , %ymm14, %ymm14 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps (CO1, LDC,2), %ymm8,%ymm8 - vaddps (CO2), %ymm10,%ymm10 - vaddps (CO2, LDC), %ymm12,%ymm12 - vaddps (CO2, LDC,2), %ymm14,%ymm14 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm8 , (CO1, LDC,2) - vmovups %ymm10, (CO2) - vmovups %ymm12, (CO2, LDC) - vmovups %ymm14, (CO2, LDC,2) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x6_SUB - vmovups -16 * SIZE(AO), %xmm0 - vbroadcastss -4 * SIZE(BO), %xmm2 - vbroadcastss -3 * SIZE(BO), %xmm3 - - VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) - - vbroadcastss -2 * SIZE(BO), %xmm2 - vbroadcastss -1 * SIZE(BO), %xmm3 - VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) - - vbroadcastss 0 * SIZE(BO), %xmm2 - vbroadcastss 1 * SIZE(BO), %xmm3 - VFMADD231PS_( %xmm12,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm14,%xmm3,%xmm0 ) - - addq $ 6*SIZE, BO - addq $ 4*SIZE, AO - decq %rax -.endm - -.macro SAVE4x6 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - vmulps %xmm0 , %xmm8 , %xmm8 - vmulps %xmm0 , %xmm10, %xmm10 - vmulps %xmm0 , %xmm12, %xmm12 - vmulps %xmm0 , %xmm14, %xmm14 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps (CO1, LDC,2), %xmm8,%xmm8 - vaddps (CO2), %xmm10,%xmm10 - vaddps (CO2, LDC), %xmm12,%xmm12 - vaddps (CO2, LDC,2), %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) 
- vmovups %xmm8 , (CO1, LDC,2) - vmovups %xmm10, (CO2) - vmovups %xmm12, (CO2, LDC) - vmovups %xmm14, (CO2, LDC,2) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x6_SUB - vmovss -16 * SIZE(AO), %xmm0 - vmovss -15 * SIZE(AO), %xmm1 - vmovss -4 * SIZE(BO), %xmm2 - vmovss -3 * SIZE(BO), %xmm3 - - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) - - vmovss -2 * SIZE(BO), %xmm2 - vmovss -1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) - - vmovss 0 * SIZE(BO), %xmm2 - vmovss 1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm13,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm15,%xmm3,%xmm1 ) - - addq $ 6*SIZE, BO - addq $ 2*SIZE, AO - decq %rax -.endm - -.macro SAVE2x6 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm9 , %xmm9 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm11, %xmm11 - vmulss %xmm0 , %xmm12, %xmm12 - vmulss %xmm0 , %xmm13, %xmm13 - vmulss %xmm0 , %xmm14, %xmm14 - vmulss %xmm0 , %xmm15, %xmm15 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - - vaddss (CO1, LDC,2), %xmm8,%xmm8 - vaddss 1 * SIZE(CO1, LDC,2), %xmm9,%xmm9 - - vaddss (CO2), %xmm10,%xmm10 - vaddss 1 * SIZE(CO2), %xmm11,%xmm11 - - vaddss (CO2, LDC), %xmm12,%xmm12 - vaddss 1 * SIZE(CO2, LDC), %xmm13,%xmm13 - - vaddss (CO2, LDC,2), %xmm14,%xmm14 - vaddss 1 * SIZE(CO2, LDC,2), %xmm15,%xmm15 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - 
vmovss %xmm7 , 1 * SIZE(CO1, LDC) - - vmovss %xmm8 , (CO1, LDC,2) - vmovss %xmm9 , 1 * SIZE(CO1, LDC,2) - - vmovss %xmm10, (CO2) - vmovss %xmm11, 1 * SIZE(CO2) - - vmovss %xmm12, (CO2, LDC) - vmovss %xmm13, 1 * SIZE(CO2, LDC) - - vmovss %xmm14, (CO2, LDC,2) - vmovss %xmm15, 1 * SIZE(CO2, LDC,2) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x6_SUB - vmovss -16 * SIZE(AO), %xmm0 - vmovss -4 * SIZE(BO), %xmm2 - vmovss -3 * SIZE(BO), %xmm3 - - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - - vmovss -2 * SIZE(BO), %xmm2 - vmovss -1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - - vmovss 0 * SIZE(BO), %xmm2 - vmovss 1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) - - addq $ 6*SIZE, BO - addq $ 1*SIZE, AO - decq %rax -.endm - -.macro SAVE1x6 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm12, %xmm12 - vmulss %xmm0 , %xmm14, %xmm14 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss (CO1, LDC,2), %xmm8,%xmm8 - vaddss (CO2), %xmm10,%xmm10 - vaddss (CO2, LDC), %xmm12,%xmm12 - vaddss (CO2, LDC,2), %xmm14,%xmm14 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm8 , (CO1, LDC,2) - vmovss %xmm10, (CO2) - vmovss %xmm12, (CO2, LDC) - vmovss %xmm14, (CO2, LDC,2) - -.endm - - -/*******************************************************************************************/ - - -/******************************************************************************************* -* 4 lines of N -*******************************************************************************************/ - -.macro KERNEL16x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - 
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) - addq $ 4 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm9 , %ymm9 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm11, %ymm11 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - - vaddps (CO2), %ymm8,%ymm8 - vaddps 8 * SIZE(CO2), %ymm9,%ymm9 - - vaddps (CO2, LDC), %ymm10,%ymm10 - vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - - vmovups %ymm8 , (CO2) - vmovups %ymm9 , 8 * SIZE(CO2) - - vmovups %ymm10, (CO2, LDC) - vmovups %ymm11, 8 * SIZE(CO2, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - prefetcht0 64(CO2) - prefetcht0 64(CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - 
addq $ 4 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm10, %ymm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps (CO2), %ymm8,%ymm8 - vaddps (CO2, LDC), %ymm10,%ymm10 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm8 , (CO2) - vmovups %ymm10, (CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) - addq $ 4 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x4 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - vmulps %xmm0 , %xmm8 , %xmm8 - vmulps %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps (CO2), %xmm8,%xmm8 - vaddps (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm8 , (CO2) - vmovups %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, 
SIZE), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) - addq $ 4 , BI - addq $ 2, %rax -.endm - -.macro SAVE2x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm9 , %xmm9 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm11, %xmm11 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - - vaddss (CO2), %xmm8,%xmm8 - vaddss 1 * SIZE(CO2), %xmm9,%xmm9 - - vaddss (CO2, LDC), %xmm10,%xmm10 - vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - - vmovss %xmm8 , (CO2) - vmovss %xmm9 , 1 * SIZE(CO2) - - vmovss %xmm10, (CO2, LDC) - vmovss %xmm11, 1 * SIZE(CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - addq $ 4 , BI - addq $ 1, %rax -.endm - -.macro SAVE1x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss (CO2), %xmm8,%xmm8 - vaddss (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm8 , (CO2) - 
vmovss %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -.macro KERNEL16x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) - addq $ 2 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - addq $ 2 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x2_SUB - 
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) - addq $ 2 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x2 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) - addq $ 2 , BI - addq $ 2, %rax -.endm - -.macro SAVE2x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - addq $ 2 , BI - addq $ 1, %rax -.endm - -.macro SAVE1x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 
- vaddss (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -.macro KERNEL16x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - addq $ 1 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL8x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - addq $ 1 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - -#endif - - vmovups %ymm4 , (CO1) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) - addq $ 1 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x1 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - -.endm - - 
-/*******************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - addq $ 1 , BI - addq $ 2 , %rax -.endm - -.macro SAVE2x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - addq $ 1 , BI - addq $ 1 , %rax -.endm - -.macro SAVE1x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - -#endif - - vmovss %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - -/************************************************************************************* -* GEMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - 
movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $12, %rdi - divq %rdi // N / 12 - movq %rax, Ndiv6 // N / 12 - movq %rdx, Nmod6 // N % 12 - - movq Ndiv6, J - cmpq $0, J - je .L4_00 - ALIGN_4 - - -/*******************************************************************************************/ - -.L6_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 4 values of B - leaq (B, %rax,4), BO2 - movq BO2, B // next offset of B - movq K, %rax - - ALIGN_4 - - -.L6_02c: - - vmovups (BO1), %xmm0 - vmovsd (BO2), %xmm1 - vmovups %xmm0, (BO) - vmovsd %xmm1, 4*SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L6_02c - - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc - leaq (C, LDC, 4), C - leaq (C, LDC, 2), C // c = c + 6 * ldc - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L6_16 - - ALIGN_4 - -.L6_12: - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L6_16 - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - 
KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L6_16 - - jmp .L6_12 - ALIGN_4 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - ALIGN_4 - -.L6_17: - - KERNEL16x6_SUB - - jnz .L6_17 - ALIGN_4 - - -.L6_19: - - SAVE16x6 - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $15, M - jz .L6_60 // to next 6 lines of N - - testq $8, M - jz .L6_21pre - ALIGN_4 - -/**************************************************************************/ - -.L6_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_20_6 - - ALIGN_4 - -.L6_20_2: - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L6_20_6 - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L6_20_6 - - jmp .L6_20_2 - ALIGN_4 - -.L6_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_20_9 - - ALIGN_4 - -.L6_20_7: - - KERNEL8x6_SUB - - jnz .L6_20_7 - ALIGN_4 - - -.L6_20_9: - - SAVE8x6 - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L6_21pre: - - testq $4, M - jz .L6_30 - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - - ALIGN_4 - -.L6_22: - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB 
- KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L6_26 - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - ALIGN_4 - -.L6_27: - - KERNEL4x6_SUB - - jnz .L6_27 - ALIGN_4 - - -.L6_29: - - SAVE4x6 - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - - ALIGN_4 - -.L6_32: - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L6_36 - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - ALIGN_4 - -.L6_37: - - KERNEL2x6_SUB - - jnz .L6_37 - ALIGN_4 - - -.L6_39: - - SAVE2x6 - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L6_60 // to next 4 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - - ALIGN_4 - -.L6_42: - - prefetcht0 A_PR1(AO) - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L6_46 - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - 
movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - ALIGN_4 - -.L6_47: - - KERNEL1x6_SUB - - jnz .L6_47 - ALIGN_4 - - -.L6_49: - - SAVE1x6 - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L6_60: - - -/*******************************************************************************************/ - - -.L7_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 4 values of B - leaq (B, %rax,4), BO2 - movq K, %rax - - ALIGN_4 - - -.L7_02c: - - vmovsd 2*SIZE(BO1), %xmm0 - vmovups (BO2), %xmm1 - vmovsd %xmm0, (BO) - vmovups %xmm1, 2*SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L7_02c - - movq BO2, B // next offset of B - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc - leaq (C, LDC, 4), C - leaq (C, LDC, 2), C // c = c + 6 * ldc - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L7_20 - - ALIGN_4 - -.L7_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L7_16 - - ALIGN_4 - -.L7_12: - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L7_16 - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L7_16 - - jmp .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - ALIGN_4 - -.L7_17: - - KERNEL16x6_SUB - - jnz .L7_17 - ALIGN_4 - - -.L7_19: - - SAVE16x6 - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L7_11 - ALIGN_4 - -/************************************************************************** -* Rest of M 
-***************************************************************************/ -.L7_20: - // Test rest of M - - testq $15, M - jz .L7_60 // to next 6 lines of N - - testq $8, M - jz .L7_21pre - ALIGN_4 - -/**************************************************************************/ - -.L7_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_20_6 - - ALIGN_4 - -.L7_20_2: - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L7_20_6 - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L7_20_6 - - jmp .L7_20_2 - ALIGN_4 - -.L7_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_20_9 - - ALIGN_4 - -.L7_20_7: - - KERNEL8x6_SUB - - jnz .L7_20_7 - ALIGN_4 - - -.L7_20_9: - - SAVE8x6 - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L7_21pre: - - testq $4, M - jz .L7_30 - ALIGN_4 - -.L7_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - - ALIGN_4 - -.L7_22: - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L7_26 - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - ALIGN_4 - 
-.L7_27: - - KERNEL4x6_SUB - - jnz .L7_27 - ALIGN_4 - - -.L7_29: - - SAVE4x6 - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - - ALIGN_4 - -.L7_32: - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L7_36 - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - ALIGN_4 - -.L7_37: - - KERNEL2x6_SUB - - jnz .L7_37 - ALIGN_4 - - -.L7_39: - - SAVE2x6 - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L7_40: - testq $1, M - jz .L7_60 // to next 4 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_46 - - ALIGN_4 - -.L7_42: - - prefetcht0 A_PR1(AO) - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L7_46 - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - ALIGN_4 - -.L7_47: - - KERNEL1x6_SUB - - jnz .L7_47 - ALIGN_4 - - -.L7_49: - - SAVE1x6 - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L7_60: - - decq J // j -- - jg .L6_01 // next 12 lines of N - - - - -/*******************************************************************************************/ -.L4_00: - - movq Nmod6, J - 
sarq $2, J // j = j / 4 - cmpq $ 0, J - je .L2_00 - ALIGN_4 - - -.L4_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L4_01b - ALIGN_4 - - -.L4_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 4*SIZE(BO1), %xmm1 - vmovups 8*SIZE(BO1), %xmm2 - vmovups 12*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 4*SIZE(BO) - vmovups %xmm2, 8*SIZE(BO) - vmovups %xmm3,12*SIZE(BO) - - addq $ 16*SIZE,BO1 - addq $ 16*SIZE,BO - decq %rax - jnz .L4_01a - - -.L4_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L4_02d - ALIGN_4 - -.L4_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L4_02c - -.L4_02d: - - movq BO1, B // next offset of B - -.L4_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L4_20 - - ALIGN_4 - -.L4_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( 
K % 8 ) - je .L4_16 - movq %rax, BI // Index for BO - leaq (,BI,4) , BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_12: - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - jmp .L4_12 - ALIGN_4 - -.L4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_19 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_17: - - KERNEL16x4_SUB - - jl .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE16x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq 
$4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $15, M - jz .L4_60 // to next 3 lines of N - - testq $8, M - jz .L4_21pre - ALIGN_4 - -/**************************************************************************/ - -.L4_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_20_6 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_2: - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - 
KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - jmp .L4_20_2 - ALIGN_4 - -.L4_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_20_9 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_7: - - KERNEL8x4_SUB - - jl .L4_20_7 - ALIGN_4 - - -.L4_20_9: - - SAVE8x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L4_21pre: - - testq $4, M - jz .L4_30 - ALIGN_4 - -.L4_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $4, %rax // 
number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_26 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_22: - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - jmp .L4_22 - ALIGN_4 - -.L4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_29 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_27: - - KERNEL4x4_SUB - - jl .L4_27 - ALIGN_4 - - -.L4_29: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 
4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_36 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - jmp .L4_32 - ALIGN_4 - -.L4_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_39 - - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - jl .L4_37 - ALIGN_4 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, 
CO2 # coffset += 2 - ALIGN_4 - -.L4_40: - testq $1, M - jz .L4_60 // to next 4 lines of N - - ALIGN_4 - -.L4_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L4_46 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - jmp .L4_42 - ALIGN_4 - -.L4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_49 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - jl .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // 
Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK -#endif - - decq J // j -- - jg .L4_01 // next 4 lines of N - - - -/*******************************************************************************************/ -.L2_00: - - movq Nmod6, J - andq $3, J // j % 4 - je .L999 - - movq Nmod6, J - andq $2, J // j % 4 - je .L1_0 - -.L2_01: - - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - - vmovsd (BO1), %xmm0 - vmovsd 2*SIZE(BO1), %xmm1 - vmovsd 4*SIZE(BO1), %xmm2 - vmovsd 6*SIZE(BO1), %xmm3 - - vmovsd %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovsd %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - 
leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i 
-- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 2 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, 
BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), 
BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq 
KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq 
KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - - - 
-.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - 
KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first 
buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK 
-#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * 
SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - -#else - -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $4, %rdi - divq %rdi // 
N / 4 - movq %rax, Ndiv6 // N / 4 - movq %rdx, Nmod6 // N % 4 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -/*******************************************************************************************/ - -.L4_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L4_01b - ALIGN_4 - - -.L4_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 4*SIZE(BO1), %xmm1 - vmovups 8*SIZE(BO1), %xmm2 - vmovups 12*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 4*SIZE(BO) - vmovups %xmm2, 8*SIZE(BO) - vmovups %xmm3,12*SIZE(BO) - - addq $ 16*SIZE,BO1 - addq $ 16*SIZE,BO - decq %rax - jnz .L4_01a - - -.L4_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L4_02d - ALIGN_4 - -.L4_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L4_02c - -.L4_02d: - - movq BO1, B // next offset of B - -.L4_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L4_20 - - ALIGN_4 - -.L4_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L4_16 - movq %rax, BI // Index for BO - leaq (,BI,4) , BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_12: - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - jmp .L4_12 - ALIGN_4 - -.L4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_19 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_17: - - KERNEL16x4_SUB - - jl .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE16x4 - -#if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $15, M - jz .L4_60 // to next 3 lines of N - - testq $8, M - jz .L4_21pre - ALIGN_4 - -/**************************************************************************/ - -.L4_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_20_6 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, 
SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_2: - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - jmp .L4_20_2 - ALIGN_4 - -.L4_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_20_9 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_7: - - KERNEL8x4_SUB - - jl .L4_20_7 - ALIGN_4 - - -.L4_20_9: - - SAVE8x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L4_21pre: - - testq $4, M - jz .L4_30 - ALIGN_4 - -.L4_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall 
- -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_26 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_22: - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - jmp .L4_22 - ALIGN_4 - -.L4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_29 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_27: - - KERNEL4x4_SUB - - jl .L4_27 - ALIGN_4 - - -.L4_29: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_36 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - jmp .L4_32 - ALIGN_4 - -.L4_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_39 - - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - jl .L4_37 - ALIGN_4 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI 
// BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L4_40: - testq $1, M - jz .L4_60 // to next 4 lines of N - - ALIGN_4 - -.L4_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L4_46 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - jmp .L4_42 - ALIGN_4 - -.L4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_49 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax 
- ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - jl .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK -#endif - - decq J // j -- - jg .L4_01 // next 4 lines of N - - - -/*******************************************************************************************/ -.L2_0: - - movq Nmod6, J - andq $3, J // j % 4 - je .L999 - - movq Nmod6, J - andq $2, J // j % 4 - je .L1_0 - -.L2_01: - - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - - vmovsd (BO1), %xmm0 - vmovsd 2*SIZE(BO1), %xmm1 - vmovsd 4*SIZE(BO1), %xmm2 - vmovsd 6*SIZE(BO1), %xmm3 - - vmovsd %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovsd %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI 
// BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 2 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - KERNEL8x2_SUB - KERNEL8x2_SUB - 
KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // 
number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number 
of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 
// to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of 
values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, 
BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * 
SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: 
-#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO 
-#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#endif - +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/********************************************************************* +* 2014/07/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* SGEMM_DEFAULT_UNROLL_N 4 +* SGEMM_DEFAULT_UNROLL_M 16 +* SGEMM_DEFAULT_P 768 +* SGEMM_DEFAULT_Q 384 +* A_PR1 512 +* B_PR1 512 +* +* +* 2014/07/28 Saar +* Performance at 9216x9216x9216: +* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) +* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) +* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) +* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) +* +*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define BO2 %rbp +#define SP %rbx + +#define BO1 %rdi +#define CO2 %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#if defined(OS_WINDOWS) +#define L_BUFFER_SIZE 8192 +#else +#define 
L_BUFFER_SIZE 12288 +#endif + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0 + +#else + +#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0 + +#endif + + +#define A_PR1 512 +#define B_PR1 512 + +/******************************************************************************************* +* 6 lines of N +*******************************************************************************************/ + +.macro KERNEL16x6_SUB + vmovups -16 * SIZE(AO), %ymm0 + vmovups -8 * SIZE(AO), %ymm1 + vbroadcastss -4 * SIZE(BO), %ymm2 + vbroadcastss -3 * SIZE(BO), %ymm3 + prefetcht0 A_PR1(AO) + + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) + + vbroadcastss -2 * SIZE(BO), %ymm2 + vbroadcastss -1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) + + vbroadcastss 0 * SIZE(BO), %ymm2 + vbroadcastss 1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) + VFMADD231PS_( 
%ymm13,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm15,%ymm3,%ymm1 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax +.endm + +.macro SAVE16x6 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm9 , %ymm9 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm11, %ymm11 + vmulps %ymm0 , %ymm12, %ymm12 + vmulps %ymm0 , %ymm13, %ymm13 + vmulps %ymm0 , %ymm14, %ymm14 + vmulps %ymm0 , %ymm15, %ymm15 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + + vaddps (CO1, LDC,2), %ymm8,%ymm8 + vaddps 8 * SIZE(CO1, LDC,2), %ymm9,%ymm9 + + vaddps (CO2), %ymm10,%ymm10 + vaddps 8 * SIZE(CO2), %ymm11,%ymm11 + + vaddps (CO2, LDC), %ymm12,%ymm12 + vaddps 8 * SIZE(CO2, LDC), %ymm13,%ymm13 + + vaddps (CO2, LDC,2), %ymm14,%ymm14 + vaddps 8 * SIZE(CO2, LDC,2), %ymm15,%ymm15 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + + vmovups %ymm8 , (CO1, LDC,2) + vmovups %ymm9 , 8 * SIZE(CO1, LDC,2) + + vmovups %ymm10, (CO2) + vmovups %ymm11, 8 * SIZE(CO2) + + vmovups %ymm12, (CO2, LDC) + vmovups %ymm13, 8 * SIZE(CO2, LDC) + + vmovups %ymm14, (CO2, LDC,2) + vmovups %ymm15, 8 * SIZE(CO2, LDC,2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x6_SUB + vmovups -16 * SIZE(AO), %ymm0 + vbroadcastss -4 * SIZE(BO), %ymm2 + vbroadcastss -3 * SIZE(BO), %ymm3 + + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + + vbroadcastss -2 * SIZE(BO), %ymm2 + vbroadcastss -1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + + vbroadcastss 0 * SIZE(BO), %ymm2 + vbroadcastss 1 * 
SIZE(BO), %ymm3 + VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) + + addq $ 6*SIZE, BO + addq $ 8*SIZE, AO + decq %rax +.endm + +.macro SAVE8x6 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm12, %ymm12 + vmulps %ymm0 , %ymm14, %ymm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO1, LDC,2), %ymm8,%ymm8 + vaddps (CO2), %ymm10,%ymm10 + vaddps (CO2, LDC), %ymm12,%ymm12 + vaddps (CO2, LDC,2), %ymm14,%ymm14 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm8 , (CO1, LDC,2) + vmovups %ymm10, (CO2) + vmovups %ymm12, (CO2, LDC) + vmovups %ymm14, (CO2, LDC,2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x6_SUB + vmovups -16 * SIZE(AO), %xmm0 + vbroadcastss -4 * SIZE(BO), %xmm2 + vbroadcastss -3 * SIZE(BO), %xmm3 + + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + + vbroadcastss -2 * SIZE(BO), %xmm2 + vbroadcastss -1 * SIZE(BO), %xmm3 + VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) + + vbroadcastss 0 * SIZE(BO), %xmm2 + vbroadcastss 1 * SIZE(BO), %xmm3 + VFMADD231PS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm14,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 4*SIZE, AO + decq %rax +.endm + +.macro SAVE4x6 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + vmulps %xmm0 , %xmm12, %xmm12 + vmulps %xmm0 , %xmm14, %xmm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO1, LDC,2), %xmm8,%xmm8 + vaddps (CO2), %xmm10,%xmm10 + vaddps (CO2, LDC), %xmm12,%xmm12 + vaddps (CO2, LDC,2), %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) 
+ vmovups %xmm8 , (CO1, LDC,2) + vmovups %xmm10, (CO2) + vmovups %xmm12, (CO2, LDC) + vmovups %xmm14, (CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x6_SUB + vmovss -16 * SIZE(AO), %xmm0 + vmovss -15 * SIZE(AO), %xmm1 + vmovss -4 * SIZE(BO), %xmm2 + vmovss -3 * SIZE(BO), %xmm3 + + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + + vmovss -2 * SIZE(BO), %xmm2 + vmovss -1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) + + vmovss 0 * SIZE(BO), %xmm2 + vmovss 1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm13,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm15,%xmm3,%xmm1 ) + + addq $ 6*SIZE, BO + addq $ 2*SIZE, AO + decq %rax +.endm + +.macro SAVE2x6 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + vmulss %xmm0 , %xmm12, %xmm12 + vmulss %xmm0 , %xmm13, %xmm13 + vmulss %xmm0 , %xmm14, %xmm14 + vmulss %xmm0 , %xmm15, %xmm15 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO1, LDC,2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO1, LDC,2), %xmm9,%xmm9 + + vaddss (CO2), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2), %xmm11,%xmm11 + + vaddss (CO2, LDC), %xmm12,%xmm12 + vaddss 1 * SIZE(CO2, LDC), %xmm13,%xmm13 + + vaddss (CO2, LDC,2), %xmm14,%xmm14 + vaddss 1 * SIZE(CO2, LDC,2), %xmm15,%xmm15 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + 
vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO1, LDC,2) + vmovss %xmm9 , 1 * SIZE(CO1, LDC,2) + + vmovss %xmm10, (CO2) + vmovss %xmm11, 1 * SIZE(CO2) + + vmovss %xmm12, (CO2, LDC) + vmovss %xmm13, 1 * SIZE(CO2, LDC) + + vmovss %xmm14, (CO2, LDC,2) + vmovss %xmm15, 1 * SIZE(CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x6_SUB + vmovss -16 * SIZE(AO), %xmm0 + vmovss -4 * SIZE(BO), %xmm2 + vmovss -3 * SIZE(BO), %xmm3 + + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + + vmovss -2 * SIZE(BO), %xmm2 + vmovss -1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + + vmovss 0 * SIZE(BO), %xmm2 + vmovss 1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 1*SIZE, AO + decq %rax +.endm + +.macro SAVE1x6 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm12, %xmm12 + vmulss %xmm0 , %xmm14, %xmm14 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO1, LDC,2), %xmm8,%xmm8 + vaddss (CO2), %xmm10,%xmm10 + vaddss (CO2, LDC), %xmm12,%xmm12 + vaddss (CO2, LDC,2), %xmm14,%xmm14 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO1, LDC,2) + vmovss %xmm10, (CO2) + vmovss %xmm12, (CO2, LDC) + vmovss %xmm14, (CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + + +/******************************************************************************************* +* 4 lines of N +*******************************************************************************************/ + +.macro KERNEL16x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + 
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) + addq $ 4 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm9 , %ymm9 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm11, %ymm11 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + + vaddps (CO2), %ymm8,%ymm8 + vaddps 8 * SIZE(CO2), %ymm9,%ymm9 + + vaddps (CO2, LDC), %ymm10,%ymm10 + vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + + vmovups %ymm8 , (CO2) + vmovups %ymm9 , 8 * SIZE(CO2) + + vmovups %ymm10, (CO2, LDC) + vmovups %ymm11, 8 * SIZE(CO2, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + prefetcht0 64(CO2) + prefetcht0 64(CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + 
addq $ 4 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO2), %ymm8,%ymm8 + vaddps (CO2, LDC), %ymm10,%ymm10 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm8 , (CO2) + vmovups %ymm10, (CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) + addq $ 4 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x4 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO2), %xmm8,%xmm8 + vaddps (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO2) + vmovups %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, 
SIZE), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) + addq $ 4 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO2), %xmm9,%xmm9 + + vaddss (CO2, LDC), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO2) + vmovss %xmm9 , 1 * SIZE(CO2) + + vmovss %xmm10, (CO2, LDC) + vmovss %xmm11, 1 * SIZE(CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + addq $ 4 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO2), %xmm8,%xmm8 + vaddss (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO2) + 
vmovss %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) + addq $ 2 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + addq $ 2 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_SUB + 
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + addq $ 2 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + addq $ 2 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + addq $ 2 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 
+ vaddss (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + addq $ 1 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + addq $ 1 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + addq $ 1 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +.endm + + 
+/*******************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + addq $ 1 , BI + addq $ 2 , %rax +.endm + +.macro SAVE2x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + addq $ 1 , BI + addq $ 1 , %rax +.endm + +.macro SAVE1x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + 
movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $12, %rdi + divq %rdi // N / 12 + movq %rax, Ndiv6 // N / 12 + movq %rdx, Nmod6 // N % 12 + + movq Ndiv6, J + cmpq $0, J + je .L4_00 + ALIGN_4 + + +/*******************************************************************************************/ + +.L6_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq BO2, B // next offset of B + movq K, %rax + + ALIGN_4 + + +.L6_02c: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 4*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L6_02c + + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + + ALIGN_4 + +.L6_12: + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L6_16 + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + 
KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL16x6_SUB + + jnz .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x6 + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L6_60 // to next 6 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + + ALIGN_4 + +.L6_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + ALIGN_4 + +.L6_20_7: + + KERNEL8x6_SUB + + jnz .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x6 + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + + ALIGN_4 + +.L6_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB 
+ KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + ALIGN_4 + +.L6_27: + + KERNEL4x6_SUB + + jnz .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + + ALIGN_4 + +.L6_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + ALIGN_4 + +.L6_37: + + KERNEL2x6_SUB + + jnz .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L6_60 // to next 4 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + + ALIGN_4 + +.L6_42: + + prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L6_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + 
movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + ALIGN_4 + +.L6_47: + + KERNEL1x6_SUB + + jnz .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L6_60: + + +/*******************************************************************************************/ + + +.L7_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq K, %rax + + ALIGN_4 + + +.L7_02c: + + vmovsd 2*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L7_02c + + movq BO2, B // next offset of B + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + + ALIGN_4 + +.L7_12: + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L7_16 + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + ALIGN_4 + +.L7_17: + + KERNEL16x6_SUB + + jnz .L7_17 + ALIGN_4 + + +.L7_19: + + SAVE16x6 + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M 
+***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 6 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + + ALIGN_4 + +.L7_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + ALIGN_4 + +.L7_20_7: + + KERNEL8x6_SUB + + jnz .L7_20_7 + ALIGN_4 + + +.L7_20_9: + + SAVE8x6 + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + + ALIGN_4 + +.L7_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + ALIGN_4 + 
+.L7_27: + + KERNEL4x6_SUB + + jnz .L7_27 + ALIGN_4 + + +.L7_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + + ALIGN_4 + +.L7_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + ALIGN_4 + +.L7_37: + + KERNEL2x6_SUB + + jnz .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 4 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + + ALIGN_4 + +.L7_42: + + prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + ALIGN_4 + +.L7_47: + + KERNEL1x6_SUB + + jnz .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L7_60: + + decq J // j -- + jg .L6_01 // next 12 lines of N + + + + +/*******************************************************************************************/ +.L4_00: + + movq Nmod6, J + 
sarq $2, J // j = j / 4 + cmpq $ 0, J + je .L2_00 + ALIGN_4 + + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( 
K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq 
$4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + 
KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // 
number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 
4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, 
CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // 
Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_00: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + 
leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i 
-- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, 
BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), 
BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq 
KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq 
KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + 
+.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + 
KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first 
buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK 
+#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * 
SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#else + +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // 
N / 4 + movq %rax, Ndiv6 // N / 4 + movq %rdx, Nmod6 // N % 4 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +/*******************************************************************************************/ + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, 
SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall 
+ +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI 
// BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax 
+ ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_0: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI 
// BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + 
KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // 
number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number 
of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 
// to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of 
values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, 
BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * 
SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: 
+#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO 
+#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#endif + diff --git a/kernel/x86_64/sgemm_kernel_16x4_sandy.S b/kernel/x86_64/sgemm_kernel_16x4_sandy.S index ea15cd87ee..2ee4b15548 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_sandy.S +++ b/kernel/x86_64/sgemm_kernel_16x4_sandy.S @@ -1,3167 +1,3167 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define CO2 %rdx - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 
2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#define A_PR1 512 -#define B_PR1 512 - -/******************************************************************************************* -* 4 lines of N -*******************************************************************************************/ - -.macro KERNEL16x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm2 , %ymm1 , %ymm13 - vmulps %ymm3 , %ymm0 , %ymm14 - vmulps %ymm3 , %ymm1 , %ymm15 - vaddps %ymm12, %ymm4 , %ymm4 - vaddps %ymm13, %ymm5 , %ymm5 - vaddps %ymm14, %ymm6 , %ymm6 - vaddps %ymm15, %ymm7 , %ymm7 - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm2 , %ymm1 , %ymm13 - vmulps %ymm3 , %ymm0 , %ymm14 - vmulps %ymm3 , %ymm1 , %ymm15 - vaddps %ymm12, %ymm8 , %ymm8 - vaddps %ymm13, %ymm9 , %ymm9 - vaddps %ymm14, %ymm10, %ymm10 - vaddps %ymm15, %ymm11, %ymm11 - addq $ 4 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm9 , %ymm9 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm11, %ymm11 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - - vaddps (CO2), %ymm8,%ymm8 - vaddps 8 * SIZE(CO2), %ymm9,%ymm9 - - vaddps (CO2, LDC), %ymm10,%ymm10 - vaddps 8 * SIZE(CO2, LDC), 
%ymm11,%ymm11 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - - vmovups %ymm8 , (CO2) - vmovups %ymm9 , 8 * SIZE(CO2) - - vmovups %ymm10, (CO2, LDC) - vmovups %ymm11, 8 * SIZE(CO2, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - prefetcht0 64(CO2) - prefetcht0 64(CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm3 , %ymm0 , %ymm14 - vaddps %ymm12, %ymm4 , %ymm4 - vaddps %ymm14, %ymm6 , %ymm6 - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm3 , %ymm0 , %ymm14 - vaddps %ymm12, %ymm8 , %ymm8 - vaddps %ymm14, %ymm10, %ymm10 - addq $ 4 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm10, %ymm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps (CO2), %ymm8,%ymm8 - vaddps (CO2, LDC), %ymm10,%ymm10 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm8 , (CO2) - vmovups %ymm10, (CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulps %xmm2 , %xmm0 , %xmm12 - vmulps %xmm3 , %xmm0 , %xmm14 - vaddps %xmm12, %xmm4 , %xmm4 - vaddps %xmm14, %xmm6 , %xmm6 - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 - vmulps %xmm2 , %xmm0 , %xmm12 - 
vmulps %xmm3 , %xmm0 , %xmm14 - vaddps %xmm12, %xmm8 , %xmm8 - vaddps %xmm14, %xmm10, %xmm10 - addq $ 4 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x4 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - vmulps %xmm0 , %xmm8 , %xmm8 - vmulps %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps (CO2), %xmm8,%xmm8 - vaddps (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm8 , (CO2) - vmovups %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm2 , %xmm1 , %xmm13 - vmulss %xmm3 , %xmm0 , %xmm14 - vmulss %xmm3 , %xmm1 , %xmm15 - vaddss %xmm12, %xmm4 , %xmm4 - vaddss %xmm13, %xmm5 , %xmm5 - vaddss %xmm14, %xmm6 , %xmm6 - vaddss %xmm15, %xmm7 , %xmm7 - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm2 , %xmm1 , %xmm13 - vmulss %xmm3 , %xmm0 , %xmm14 - vmulss %xmm3 , %xmm1 , %xmm15 - vaddss %xmm12, %xmm8 , %xmm8 - vaddss %xmm13, %xmm9 , %xmm9 - vaddss %xmm14, %xmm10, %xmm10 - vaddss %xmm15, %xmm11, %xmm11 - addq $ 4 , BI - addq $ 2, %rax -.endm - -.macro SAVE2x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm9 , %xmm9 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm11, %xmm11 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - - vaddss (CO2), 
%xmm8,%xmm8 - vaddss 1 * SIZE(CO2), %xmm9,%xmm9 - - vaddss (CO2, LDC), %xmm10,%xmm10 - vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - - vmovss %xmm8 , (CO2) - vmovss %xmm9 , 1 * SIZE(CO2) - - vmovss %xmm10, (CO2, LDC) - vmovss %xmm11, 1 * SIZE(CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm3 , %xmm0 , %xmm14 - vaddss %xmm12, %xmm4 , %xmm4 - vaddss %xmm14, %xmm6 , %xmm6 - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm3 , %xmm0 , %xmm14 - vaddss %xmm12, %xmm8 , %xmm8 - vaddss %xmm14, %xmm10, %xmm10 - addq $ 4 , BI - addq $ 1, %rax -.endm - -.macro SAVE1x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss (CO2), %xmm8,%xmm8 - vaddss (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm8 , (CO2) - vmovss %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -.macro KERNEL16x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 
- vmulps %ymm2 , %ymm1 , %ymm13 - vmulps %ymm3 , %ymm0 , %ymm14 - vmulps %ymm3 , %ymm1 , %ymm15 - vaddps %ymm12, %ymm4 , %ymm4 - vaddps %ymm13, %ymm5 , %ymm5 - vaddps %ymm14, %ymm6 , %ymm6 - vaddps %ymm15, %ymm7 , %ymm7 - addq $ 2 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm3 , %ymm0 , %ymm14 - vaddps %ymm12, %ymm4 , %ymm4 - vaddps %ymm14, %ymm6 , %ymm6 - addq $ 2 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulps %xmm2 , %xmm0 , %xmm12 - vmulps %xmm3 , %xmm0 , %xmm14 - vaddps %xmm12, %xmm4 , %xmm4 - vaddps %xmm14, %xmm6 , %xmm6 - addq $ 2 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x2 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - 
-#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm2 , %xmm1 , %xmm13 - vmulss %xmm3 , %xmm0 , %xmm14 - vmulss %xmm3 , %xmm1 , %xmm15 - vaddss %xmm12, %xmm4 , %xmm4 - vaddss %xmm13, %xmm5 , %xmm5 - vaddss %xmm14, %xmm6 , %xmm6 - vaddss %xmm15, %xmm7 , %xmm7 - addq $ 2 , BI - addq $ 2, %rax -.endm - -.macro SAVE2x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm3 , %xmm0 , %xmm14 - vaddss %xmm12, %xmm4 , %xmm4 - vaddss %xmm14, %xmm6 , %xmm6 - addq $ 2 , BI - addq $ 1, %rax -.endm - -.macro SAVE1x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - -.endm - - 
-/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -.macro KERNEL16x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm2 , %ymm1 , %ymm13 - vaddps %ymm12, %ymm4 , %ymm4 - vaddps %ymm13, %ymm5 , %ymm5 - addq $ 1 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL8x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vmulps %ymm2 , %ymm0 , %ymm12 - vaddps %ymm12, %ymm4 , %ymm4 - addq $ 1 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - -#endif - - vmovups %ymm4 , (CO1) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmulps %xmm2 , %xmm0 , %xmm12 - vaddps %xmm12, %xmm4 , %xmm4 - addq $ 1 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x1 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - -.endm - - 
-/*******************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm2 , %xmm1 , %xmm13 - vaddss %xmm12, %xmm4 , %xmm4 - vaddss %xmm13, %xmm5 , %xmm5 - addq $ 1 , BI - addq $ 2 , %rax -.endm - -.macro SAVE2x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmulss %xmm2 , %xmm0 , %xmm12 - vaddss %xmm12, %xmm4 , %xmm4 - addq $ 1 , BI - addq $ 1 , %rax -.endm - -.macro SAVE1x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - -#endif - - vmovss %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq 
ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $4, %rdi - divq %rdi // N / 4 - movq %rax, Ndiv6 // N / 4 - movq %rdx, Nmod6 // N % 4 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -/*******************************************************************************************/ - -.L4_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L4_01b - ALIGN_4 - - -.L4_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 4*SIZE(BO1), %xmm1 - vmovups 8*SIZE(BO1), %xmm2 - vmovups 12*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 4*SIZE(BO) - vmovups %xmm2, 8*SIZE(BO) - vmovups %xmm3,12*SIZE(BO) - - addq $ 16*SIZE,BO1 - addq $ 16*SIZE,BO - decq %rax - jnz .L4_01a - - -.L4_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L4_02d - ALIGN_4 - -.L4_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L4_02c - -.L4_02d: - - movq BO1, B // next offset of B - -.L4_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - 
sarq $4, I // i = (m >> 4) - je .L4_20 - - ALIGN_4 - -.L4_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L4_16 - movq %rax, BI // Index for BO - leaq (,BI,4) , BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_12: - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 
B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - jmp .L4_12 - ALIGN_4 - -.L4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_19 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_17: - - KERNEL16x4_SUB - - jl .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE16x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $15, M - jz .L4_60 // to next 3 lines of N - - testq $8, M - jz .L4_21pre - ALIGN_4 - -/**************************************************************************/ - -.L4_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq 
(BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_20_6 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_2: - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - jmp .L4_20_2 - ALIGN_4 - -.L4_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_20_9 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_7: - - KERNEL8x4_SUB - - jl .L4_20_7 - ALIGN_4 - - -.L4_20_9: - - SAVE8x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - 
- - -/**************************************************************************/ - -.L4_21pre: - - testq $4, M - jz .L4_30 - ALIGN_4 - -.L4_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_26 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_22: - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - jmp .L4_22 - ALIGN_4 - -.L4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_29 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_27: - - KERNEL4x4_SUB - - jl .L4_27 - ALIGN_4 - - -.L4_29: - - 
SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_36 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - 
jmp .L4_32 - ALIGN_4 - -.L4_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_39 - - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - jl .L4_37 - ALIGN_4 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L4_40: - testq $1, M - jz .L4_60 // to next 4 lines of N - - ALIGN_4 - -.L4_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L4_46 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq 
(BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - jmp .L4_42 - ALIGN_4 - -.L4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_49 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - jl .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK -#endif - - decq J // j -- - jg .L4_01 // next 4 lines of N - - - -/*******************************************************************************************/ -.L2_0: - - movq Nmod6, J - andq $3, J // j % 4 - je .L999 - - movq Nmod6, J - andq $2, J // j % 4 - je .L1_0 - -.L2_01: - - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - - vmovsd (BO1), %xmm0 - vmovsd 2*SIZE(BO1), %xmm1 - vmovsd 4*SIZE(BO1), %xmm2 - vmovsd 6*SIZE(BO1), %xmm3 - - vmovsd %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovsd %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - 
jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - jmp 
.L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 2 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, 
%rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - 
leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq 
(AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - 
KERNEL1x2_SUB - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first 
buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg 
.L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq 
%rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - 
-.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB 
- - je .L1_36 - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - 
KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define CO2 %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#define A_PR1 512 +#define B_PR1 512 + +/******************************************************************************************* +* 4 lines of N +*******************************************************************************************/ + +.macro KERNEL16x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , 
%ymm12 + vmulps %ymm2 , %ymm1 , %ymm13 + vmulps %ymm3 , %ymm0 , %ymm14 + vmulps %ymm3 , %ymm1 , %ymm15 + vaddps %ymm12, %ymm4 , %ymm4 + vaddps %ymm13, %ymm5 , %ymm5 + vaddps %ymm14, %ymm6 , %ymm6 + vaddps %ymm15, %ymm7 , %ymm7 + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm2 , %ymm1 , %ymm13 + vmulps %ymm3 , %ymm0 , %ymm14 + vmulps %ymm3 , %ymm1 , %ymm15 + vaddps %ymm12, %ymm8 , %ymm8 + vaddps %ymm13, %ymm9 , %ymm9 + vaddps %ymm14, %ymm10, %ymm10 + vaddps %ymm15, %ymm11, %ymm11 + addq $ 4 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm9 , %ymm9 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm11, %ymm11 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + + vaddps (CO2), %ymm8,%ymm8 + vaddps 8 * SIZE(CO2), %ymm9,%ymm9 + + vaddps (CO2, LDC), %ymm10,%ymm10 + vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + + vmovups %ymm8 , (CO2) + vmovups %ymm9 , 8 * SIZE(CO2) + + vmovups %ymm10, (CO2, LDC) + vmovups %ymm11, 8 * SIZE(CO2, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + prefetcht0 64(CO2) + prefetcht0 64(CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm3 , %ymm0 , %ymm14 + vaddps %ymm12, %ymm4 , %ymm4 + vaddps %ymm14, %ymm6 , %ymm6 + 
vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm3 , %ymm0 , %ymm14 + vaddps %ymm12, %ymm8 , %ymm8 + vaddps %ymm14, %ymm10, %ymm10 + addq $ 4 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO2), %ymm8,%ymm8 + vaddps (CO2, LDC), %ymm10,%ymm10 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm8 , (CO2) + vmovups %ymm10, (CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulps %xmm2 , %xmm0 , %xmm12 + vmulps %xmm3 , %xmm0 , %xmm14 + vaddps %xmm12, %xmm4 , %xmm4 + vaddps %xmm14, %xmm6 , %xmm6 + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 + vmulps %xmm2 , %xmm0 , %xmm12 + vmulps %xmm3 , %xmm0 , %xmm14 + vaddps %xmm12, %xmm8 , %xmm8 + vaddps %xmm14, %xmm10, %xmm10 + addq $ 4 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x4 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO2), %xmm8,%xmm8 + vaddps (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO2) + vmovups %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 
* SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm2 , %xmm1 , %xmm13 + vmulss %xmm3 , %xmm0 , %xmm14 + vmulss %xmm3 , %xmm1 , %xmm15 + vaddss %xmm12, %xmm4 , %xmm4 + vaddss %xmm13, %xmm5 , %xmm5 + vaddss %xmm14, %xmm6 , %xmm6 + vaddss %xmm15, %xmm7 , %xmm7 + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm2 , %xmm1 , %xmm13 + vmulss %xmm3 , %xmm0 , %xmm14 + vmulss %xmm3 , %xmm1 , %xmm15 + vaddss %xmm12, %xmm8 , %xmm8 + vaddss %xmm13, %xmm9 , %xmm9 + vaddss %xmm14, %xmm10, %xmm10 + vaddss %xmm15, %xmm11, %xmm11 + addq $ 4 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO2), %xmm9,%xmm9 + + vaddss (CO2, LDC), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO2) + vmovss %xmm9 , 1 * SIZE(CO2) + + vmovss %xmm10, (CO2, LDC) + vmovss %xmm11, 1 * SIZE(CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm3 , %xmm0 , %xmm14 + vaddss %xmm12, %xmm4 , %xmm4 + vaddss %xmm14, %xmm6 , %xmm6 + vmovss -2 * 
SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm3 , %xmm0 , %xmm14 + vaddss %xmm12, %xmm8 , %xmm8 + vaddss %xmm14, %xmm10, %xmm10 + addq $ 4 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO2), %xmm8,%xmm8 + vaddss (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO2) + vmovss %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm2 , %ymm1 , %ymm13 + vmulps %ymm3 , %ymm0 , %ymm14 + vmulps %ymm3 , %ymm1 , %ymm15 + vaddps %ymm12, %ymm4 , %ymm4 + vaddps %ymm13, %ymm5 , %ymm5 + vaddps %ymm14, %ymm6 , %ymm6 + vaddps %ymm15, %ymm7 , %ymm7 + addq $ 2 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + +.endm + + + 
+/*******************************************************************************************/ + +.macro KERNEL8x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm3 , %ymm0 , %ymm14 + vaddps %ymm12, %ymm4 , %ymm4 + vaddps %ymm14, %ymm6 , %ymm6 + addq $ 2 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulps %xmm2 , %xmm0 , %xmm12 + vmulps %xmm3 , %xmm0 , %xmm14 + vaddps %xmm12, %xmm4 , %xmm4 + vaddps %xmm14, %xmm6 , %xmm6 + addq $ 2 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm2 , %xmm1 , %xmm13 + vmulss %xmm3 , %xmm0 , %xmm14 + vmulss %xmm3 , %xmm1 , %xmm15 + vaddss %xmm12, %xmm4 , %xmm4 + vaddss %xmm13, %xmm5 , %xmm5 + vaddss %xmm14, %xmm6 , %xmm6 + vaddss %xmm15, %xmm7 , %xmm7 + addq $ 2 , BI + addq $ 2, %rax +.endm + +.macro 
SAVE2x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm3 , %xmm0 , %xmm14 + vaddss %xmm12, %xmm4 , %xmm4 + vaddss %xmm14, %xmm6 , %xmm6 + addq $ 2 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm2 , %ymm1 , %ymm13 + vaddps %ymm12, %ymm4 , %ymm4 + vaddps %ymm13, %ymm5 , %ymm5 + addq $ 1 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + +#endif + 
+ vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vmulps %ymm2 , %ymm0 , %ymm12 + vaddps %ymm12, %ymm4 , %ymm4 + addq $ 1 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmulps %xmm2 , %xmm0 , %xmm12 + vaddps %xmm12, %xmm4 , %xmm4 + addq $ 1 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm2 , %xmm1 , %xmm13 + vaddss %xmm12, %xmm4 , %xmm4 + vaddss %xmm13, %xmm5 , %xmm5 + addq $ 1 , BI + addq $ 2 , %rax +.endm + +.macro SAVE2x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmulss %xmm2 , %xmm0 , %xmm12 + 
vaddss %xmm12, %xmm4 , %xmm4 + addq $ 1 , BI + addq $ 1 , %rax +.endm + +.macro SAVE1x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv6 // N / 4 + movq %rdx, Nmod6 // N % 4 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + 
+/*******************************************************************************************/ + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif 
+ + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + 
leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + 
KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values 
in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI 
// Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # 
coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq 
KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_0: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI 
* 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # 
coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + 
je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), 
AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer 
to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && 
!defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + 
KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) 
&& !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + 
movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if 
defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && 
defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 621ddc622b..c9681fa8b3 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -25,9 +25,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif + #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "sgemv_n_microk_bulldozer-4.c" diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 0be2c7e971..07aa515033 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -25,9 +25,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif + #if defined(NEHALEM) #include "sgemv_t_microk_nehalem-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) diff --git a/kernel/x86_64/sscal.c b/kernel/x86_64/sscal.c new file mode 100644 index 0000000000..af1220f1bf --- /dev/null +++ b/kernel/x86_64/sscal.c @@ -0,0 +1,196 @@ +/*************************************************************************** +Copyright (c) 2013 - 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(HASWELL) || defined(ZEN) +#include "sscal_microk_haswell-2.c" +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#include "sscal_microk_skylakex-2.c" +#endif + + +#if !defined(HAVE_KERNEL_16) + +static void sscal_kernel_16( BLASLONG n, FLOAT *da , FLOAT *x ) +{ + + BLASLONG i; + FLOAT alpha = *da; + + for( i=0; i 0 ) + { + sscal_kernel_inc_8(n1, &da, x, inc_x); + i = n1 * inc_x; + j = n1; + } + + while(j < n) + { + + x[i] *= da; + i += inc_x ; + j++; + + } + + } + + return(0); + } + + BLASLONG n1 = n & -16; + if ( n1 > 0 ) + { + if ( da == 0.0 ) + sscal_kernel_16_zero(n1 , &da , x); + else + sscal_kernel_16(n1 , &da , x); + } + + if ( da == 0.0 ) + { + for ( i=n1 ; i> 5 ; + BLASLONG n2 = n & 16 ; + + __asm__ __volatile__ + ( + "vbroadcastss (%2), %%ymm0 \n\t" // alpha + + "addq $128, %1 \n\t" + + "cmpq $0, %0 \n\t" + "je 4f \n\t" + + "vmulps -128(%1), %%ymm0, %%ymm4 \n\t" + "vmulps -96(%1), %%ymm0, %%ymm5 \n\t" + + "vmulps -64(%1), %%ymm0, %%ymm6 \n\t" + "vmulps -32(%1), %%ymm0, %%ymm7 \n\t" + + "subq $1 , %0 \n\t" + "jz 2f \n\t" + + ".p2align 4 \n\t" + "1: \n\t" + // "prefetcht0 640(%1) \n\t" + + "vmovups %%ymm4 ,-128(%1) \n\t" + "vmovups %%ymm5 , -96(%1) \n\t" + "vmulps 0(%1), %%ymm0, %%ymm4 \n\t" + + // "prefetcht0 704(%1) \n\t" + + "vmovups %%ymm6 , -64(%1) \n\t" + "vmulps 32(%1), %%ymm0, %%ymm5 \n\t" + 
"vmovups %%ymm7 , -32(%1) \n\t" + + "vmulps 64(%1), %%ymm0, %%ymm6 \n\t" + "vmulps 96(%1), %%ymm0, %%ymm7 \n\t" + + + "addq $128, %1 \n\t" + "subq $1 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "vmovups %%ymm4 ,-128(%1) \n\t" + "vmovups %%ymm5 , -96(%1) \n\t" + + "vmovups %%ymm6 , -64(%1) \n\t" + "vmovups %%ymm7 , -32(%1) \n\t" + + "addq $128, %1 \n\t" + + "4: \n\t" + + "cmpq $16 ,%3 \n\t" + "jne 5f \n\t" + + "vmulps -128(%1), %%ymm0, %%ymm4 \n\t" + "vmulps -96(%1), %%ymm0, %%ymm5 \n\t" + + "vmovups %%ymm4 ,-128(%1) \n\t" + "vmovups %%ymm5 , -96(%1) \n\t" + + "5: \n\t" + + "vzeroupper \n\t" + + : + "+r" (n1), // 0 + "+r" (x) // 1 + : + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG n1 = n >> 5 ; + BLASLONG n2 = n & 16 ; + + __asm__ __volatile__ + ( + "vxorpd %%ymm0, %%ymm0 , %%ymm0 \n\t" + + "addq $128, %1 \n\t" + + "cmpq $0, %0 \n\t" + "je 2f \n\t" + + ".p2align 4 \n\t" + "1: \n\t" + + "vmovups %%ymm0 ,-128(%1) \n\t" + "vmovups %%ymm0 , -96(%1) \n\t" + + "vmovups %%ymm0 , -64(%1) \n\t" + "vmovups %%ymm0 , -32(%1) \n\t" + + "addq $128, %1 \n\t" + "subq $1 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "cmpq $16 ,%3 \n\t" + "jne 4f \n\t" + + "vmovups %%ymm0 ,-128(%1) \n\t" + "vmovups %%ymm0 , -96(%1) \n\t" + + "4: \n\t" + + "vzeroupper \n\t" + + : + "+r" (n1), // 0 + "+r" (x) // 1 + : + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + diff --git a/kernel/x86_64/sscal_microk_skylakex-2.c b/kernel/x86_64/sscal_microk_skylakex-2.c 
new file mode 100644 index 0000000000..c4fa160f05 --- /dev/null +++ b/kernel/x86_64/sscal_microk_skylakex-2.c @@ -0,0 +1,86 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#include + +#define HAVE_KERNEL_16 1 + +static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + int i = 0; + +#ifdef __AVX512CD__ + __m512 __alpha5 = _mm512_broadcastss_ps(_mm_load_ss(alpha)); + BLASLONG nn = n & -32; + for (; i < nn; i += 32) { + __m512 a = _mm512_loadu_ps(&x[i + 0]); + __m512 b = _mm512_loadu_ps(&x[i + 16]); + a *= __alpha5; + b *= __alpha5; + _mm512_storeu_ps(&x[i + 0], a); + _mm512_storeu_ps(&x[i + 16], b); + } + for (; i < n; i += 16) { + _mm512_storeu_ps(&x[i + 0], __alpha5 * _mm512_loadu_ps(&x[i + 0])); + } +#else + __m256 __alpha = _mm256_broadcastss_ps(_mm_load_ss(alpha)); + for (; i < n; i += 16) { + _mm256_storeu_ps(&x[i + 0], __alpha * _mm256_loadu_ps(&x[i + 0])); + _mm256_storeu_ps(&x[i + 8], __alpha * _mm256_loadu_ps(&x[i + 8])); + } +#endif +} + + +static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + int i = 0; + + /* question to self: Why is this not just memset() */ + +#ifdef __AVX512CD__ + __m512 zero = _mm512_setzero_ps(); + for (; i < n; i += 16) { + _mm512_storeu_ps(&x[i], zero); + } +#else + __m256 zero = _mm256_setzero_ps(); + for (; i < n; i += 16) { + _mm256_storeu_ps(&x[i + 0], zero); + _mm256_storeu_ps(&x[i + 8], zero); + } +#endif + +} + +#else +#include "sscal_microk_haswell-2.c" +#endif diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index 29d6a99583..45914daf5d 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -25,9 +25,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif + #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 02bbc1c64c..26e5ca7e92 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -25,9 +25,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif + #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "ssymv_U_microk_bulldozer-2.c" diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c b/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c index 4e2cd4fe68..dbfcd55d7c 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c @@ -1,279 +1,279 @@ -#include "common.h" -#include -#include "strsm_kernel_8x4_haswell_R_common.h" - -#define SOLVE_RN_m8n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "movq %2,%3; addq $32,%2;"\ - SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1)\ - SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1)\ - SAVE_SOLUTION_m8n2(4,5,0)\ - SOLVE_leri_m8n2(40,6,7,%1)\ - SOLVE_ri_m8n2(56,6,7,%1)\ - SAVE_SOLUTION_m8n2(6,7,64) - -#define SOLVE_RN_m8n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "movq %2,%3; addq $32,%2;"\ - SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) 
SUBTRACT_m8n2(8,10,11,%1,%%r12,4)\ - SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4)\ - SAVE_SOLUTION_m8n2(4,5,0)\ - SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4)\ - SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4)\ - SAVE_SOLUTION_m8n2(6,7,64)\ - SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4)\ - SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4)\ - SAVE_SOLUTION_m8n2(8,9,128)\ - SOLVE_leri_m8n2(104,10,11,%1,%%r12,4)\ - SOLVE_ri_m8n2(120,10,11,%1,%%r12,4)\ - SAVE_SOLUTION_m8n2(10,11,192) - -#define SOLVE_RN_m8n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "movq %2,%3; addq $32,%2;"\ - SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4) SUBTRACT_m8n2(0,12,13,%1,%%r12,8) SUBTRACT_m8n2(8,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4) SUBTRACT_m8n2(16,12,13,%1,%%r12,8) SUBTRACT_m8n2(24,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(4,5,0)\ - SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4) SUBTRACT_m8n2(32,12,13,%1,%%r12,8) SUBTRACT_m8n2(40,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4) SUBTRACT_m8n2(48,12,13,%1,%%r12,8) SUBTRACT_m8n2(56,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(6,7,64)\ - SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4) SUBTRACT_m8n2(64,12,13,%1,%%r12,8) SUBTRACT_m8n2(72,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4) SUBTRACT_m8n2(80,12,13,%1,%%r12,8) SUBTRACT_m8n2(88,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(8,9,128)\ - SOLVE_leri_m8n2(104,10,11,%1,%%r12,4) 
SUBTRACT_m8n2(96,12,13,%1,%%r12,8) SUBTRACT_m8n2(104,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(120,10,11,%1,%%r12,4) SUBTRACT_m8n2(112,12,13,%1,%%r12,8) SUBTRACT_m8n2(120,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(10,11,192)\ - SOLVE_leri_m8n2(128,12,13,%1,%%r12,8) SUBTRACT_m8n2(136,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(144,12,13,%1,%%r12,8) SUBTRACT_m8n2(152,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(12,13,256)\ - SOLVE_leri_m8n2(168,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(184,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(14,15,320) - -#define SOLVE_RN_m4n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "movq %2,%3; addq $16,%2;"\ - SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1)\ - SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1)\ - SAVE_SOLUTION_m4n2(4,0)\ - SOLVE_leri_m4n2(40,5,%1)\ - SOLVE_ri_m4n2(56,5,%1)\ - SAVE_SOLUTION_m4n2(5,32) - -#define SOLVE_RN_m4n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "movq %2,%3; addq $16,%2;"\ - SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4)\ - SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4)\ - SAVE_SOLUTION_m4n2(4,0)\ - SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4)\ - SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4)\ - SAVE_SOLUTION_m4n2(5,32)\ - SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4)\ - SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4)\ - SAVE_SOLUTION_m4n2(6,64)\ - SOLVE_leri_m4n2(104,7,%1,%%r12,4)\ - SOLVE_ri_m4n2(120,7,%1,%%r12,4)\ - SAVE_SOLUTION_m4n2(7,96) - -#define SOLVE_RN_m4n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "movq %2,%3; addq $16,%2;"\ - SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4) SUBTRACT_m4n2(0,8,%1,%%r12,8) 
SUBTRACT_m4n2(8,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4) SUBTRACT_m4n2(16,8,%1,%%r12,8) SUBTRACT_m4n2(24,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(4,0)\ - SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4) SUBTRACT_m4n2(32,8,%1,%%r12,8) SUBTRACT_m4n2(40,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4) SUBTRACT_m4n2(48,8,%1,%%r12,8) SUBTRACT_m4n2(56,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(5,32)\ - SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4) SUBTRACT_m4n2(64,8,%1,%%r12,8) SUBTRACT_m4n2(72,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4) SUBTRACT_m4n2(80,8,%1,%%r12,8) SUBTRACT_m4n2(88,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(6,64)\ - SOLVE_leri_m4n2(104,7,%1,%%r12,4) SUBTRACT_m4n2(96,8,%1,%%r12,8) SUBTRACT_m4n2(104,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(120,7,%1,%%r12,4) SUBTRACT_m4n2(112,8,%1,%%r12,8) SUBTRACT_m4n2(120,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(7,96)\ - SOLVE_leri_m4n2(128,8,%1,%%r12,8) SUBTRACT_m4n2(136,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(144,8,%1,%%r12,8) SUBTRACT_m4n2(152,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(8,128)\ - SOLVE_leri_m4n2(168,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(184,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(9,160) - -#define SOLVE_RN_m2n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "movq %2,%3; addq $8,%2;"\ - SOLVE_col1_ltor_m2n4(0,4,5,%1)\ - SOLVE_col2_ltor_m2n4(16,4,5,%1)\ - SOLVE_col3_ltor_m2n4(32,4,5,%1)\ - SOLVE_col4_ltor_m2n4(48,4,5,%1)\ - SAVE_SOLUTION_m2n4(4,5,0) - -#define SOLVE_RN_m2n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "movq %2,%3; addq $8,%2;"\ - SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4)\ - SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4)\ - SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4)\ - SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4)\ - 
SAVE_SOLUTION_m2n4(4,5,0)\ - SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4)\ - SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4)\ - SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4)\ - SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4)\ - SAVE_SOLUTION_m2n4(6,7,32) - -#define SOLVE_RN_m2n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "movq %2,%3; addq $8,%2;"\ - SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4) SUBTRACT_m2n4(0,8,9,%1,%%r12,8)\ - SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4) SUBTRACT_m2n4(16,8,9,%1,%%r12,8)\ - SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4) SUBTRACT_m2n4(32,8,9,%1,%%r12,8)\ - SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4) SUBTRACT_m2n4(48,8,9,%1,%%r12,8)\ - SAVE_SOLUTION_m2n4(4,5,0)\ - SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4) SUBTRACT_m2n4(64,8,9,%1,%%r12,8)\ - SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4) SUBTRACT_m2n4(80,8,9,%1,%%r12,8)\ - SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4) SUBTRACT_m2n4(96,8,9,%1,%%r12,8)\ - SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4) SUBTRACT_m2n4(112,8,9,%1,%%r12,8)\ - SAVE_SOLUTION_m2n4(6,7,32)\ - SOLVE_col1_ltor_m2n4(128,8,9,%1,%%r12,8)\ - SOLVE_col2_ltor_m2n4(144,8,9,%1,%%r12,8)\ - SOLVE_col3_ltor_m2n4(160,8,9,%1,%%r12,8)\ - SOLVE_col4_ltor_m2n4(176,8,9,%1,%%r12,8)\ - SAVE_SOLUTION_m2n4(8,9,64) - -#define SOLVE_RN_m1n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "movq %2,%3; addq $4,%2;"\ - SOLVE_col1_ltor_m1n4(0,4,%1)\ - SOLVE_col2_ltor_m1n4(16,4,%1)\ - SOLVE_col3_ltor_m1n4(32,4,%1)\ - SOLVE_col4_ltor_m1n4(48,4,%1)\ - SAVE_SOLUTION_m1n4(4,0) - -#define SOLVE_RN_m1n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "movq %2,%3; addq $4,%2;"\ - SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4)\ - SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4)\ - SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4)\ - SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4)\ - SAVE_SOLUTION_m1n4(4,0)\ 
- SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4)\ - SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4)\ - SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4)\ - SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4)\ - SAVE_SOLUTION_m1n4(5,16) - -#define SOLVE_RN_m1n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "movq %2,%3; addq $4,%2;"\ - SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4) SUBTRACT_m1n4(0,6,%1,%%r12,8)\ - SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4) SUBTRACT_m1n4(16,6,%1,%%r12,8)\ - SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4) SUBTRACT_m1n4(32,6,%1,%%r12,8)\ - SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4) SUBTRACT_m1n4(48,6,%1,%%r12,8)\ - SAVE_SOLUTION_m1n4(4,0)\ - SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4) SUBTRACT_m1n4(64,6,%1,%%r12,8)\ - SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4) SUBTRACT_m1n4(80,6,%1,%%r12,8)\ - SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4) SUBTRACT_m1n4(96,6,%1,%%r12,8)\ - SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4) SUBTRACT_m1n4(112,6,%1,%%r12,8)\ - SAVE_SOLUTION_m1n4(5,16)\ - SOLVE_col1_ltor_m1n4(128,6,%1,%%r12,8)\ - SOLVE_col2_ltor_m1n4(144,6,%1,%%r12,8)\ - SOLVE_col3_ltor_m1n4(160,6,%1,%%r12,8)\ - SOLVE_col4_ltor_m1n4(176,6,%1,%%r12,8)\ - SAVE_SOLUTION_m1n4(6,32) - -#define GEMM_RN_SIMPLE(mdim,ndim) \ - "movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ - "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ - "1"#mdim""#ndim"1:\n\t"\ - GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\ - "1"#mdim""#ndim"2:\n\t" -#define GEMM_RN_m8n4 GEMM_RN_SIMPLE(8,4) -#define GEMM_RN_m8n8 GEMM_RN_SIMPLE(8,8) -#define GEMM_RN_m8n12 \ - "movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ - "cmpq $8,%5; jb 18122f;"\ - "18121:\n\t"\ - GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); 
addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ - "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ - "18122:\n\t"\ - "testq %5,%5; jz 18124f;"\ - "18123:\n\t"\ - GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\ - "18124:\n\t" -#define GEMM_RN_m4n4 GEMM_RN_SIMPLE(4,4) -#define GEMM_RN_m4n8 GEMM_RN_SIMPLE(4,8) -#define GEMM_RN_m4n12 GEMM_RN_SIMPLE(4,12) -#define GEMM_RN_m2n4 GEMM_RN_SIMPLE(2,4) -#define GEMM_RN_m2n8 GEMM_RN_SIMPLE(2,8) -#define GEMM_RN_m2n12 GEMM_RN_SIMPLE(2,12) -#define GEMM_RN_m1n4 GEMM_RN_SIMPLE(1,4) -#define GEMM_RN_m1n8 GEMM_RN_SIMPLE(1,8) -#define GEMM_RN_m1n12 GEMM_RN_SIMPLE(1,12) - -#define COMPUTE(ndim) {\ - __asm__ __volatile__(\ - "movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\ - "cmpq $8,%%r11; jb "#ndim"772f;"\ - #ndim"771:\n\t"\ - GEMM_RN_m8n##ndim SOLVE_RN_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ - #ndim"772:\n\t"\ - "testq $4,%%r11; jz "#ndim"773f;"\ - GEMM_RN_m4n##ndim SOLVE_RN_m4n##ndim "subq $4,%%r11;"\ - #ndim"773:\n\t"\ - "testq $2,%%r11; jz "#ndim"774f;"\ - GEMM_RN_m2n##ndim SOLVE_RN_m2n##ndim "subq $2,%%r11;"\ - #ndim"774:\n\t"\ - "testq $1,%%r11; jz "#ndim"775f;"\ - GEMM_RN_m1n##ndim SOLVE_RN_m1n##ndim "subq $1,%%r11;"\ - #ndim"775:\n\t"\ - "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ - :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ - :"r11","r12","r13","r14","r15","cc","memory",\ - "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ - a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M; OFF += ndim;\ -} - -static void 
solve_RN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT a0, b0; - int i, j, k; - for (i=0; i7;m_count-=8){ - if(kk>0) GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); - solve_RN(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); - a_ptr += k * 8; c_ptr += 8; - } - for(;m_count>3;m_count-=4){ - if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); - solve_RN(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); - a_ptr += k * 4; c_ptr += 4; - } - for(;m_count>1;m_count-=2){ - if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); - solve_RN(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); - a_ptr += k * 2; c_ptr += 2; - } - if(m_count>0){ - if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); - solve_RN(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); - a_ptr += k * 1; c_ptr += 1; - } -} -int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ - float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C; - float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; - float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; - uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)-offset, k_cnt = 0; - BLASLONG n_count = n; - for(;n_count>11;n_count-=12) COMPUTE(12) - for(;n_count>7;n_count-=8) COMPUTE(8) - for(;n_count>3;n_count-=4) COMPUTE(4) - for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); b_ptr += 2*k; c_ptr += ldc*2; OFF+=2;} - if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); - return 0; -} +#include "common.h" +#include +#include "strsm_kernel_8x4_haswell_R_common.h" + +#define SOLVE_RN_m8n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "movq %2,%3; addq $32,%2;"\ + SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1)\ + SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1)\ + SAVE_SOLUTION_m8n2(4,5,0)\ + SOLVE_leri_m8n2(40,6,7,%1)\ + SOLVE_ri_m8n2(56,6,7,%1)\ + SAVE_SOLUTION_m8n2(6,7,64) + +#define SOLVE_RN_m8n8 \ + "movq %2,%3;" 
GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "movq %2,%3; addq $32,%2;"\ + SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(4,5,0)\ + SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(6,7,64)\ + SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(8,9,128)\ + SOLVE_leri_m8n2(104,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(120,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(10,11,192) + +#define SOLVE_RN_m8n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "movq %2,%3; addq $32,%2;"\ + SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4) SUBTRACT_m8n2(0,12,13,%1,%%r12,8) SUBTRACT_m8n2(8,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4) SUBTRACT_m8n2(16,12,13,%1,%%r12,8) SUBTRACT_m8n2(24,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(4,5,0)\ + SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4) SUBTRACT_m8n2(32,12,13,%1,%%r12,8) SUBTRACT_m8n2(40,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4) SUBTRACT_m8n2(48,12,13,%1,%%r12,8) SUBTRACT_m8n2(56,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(6,7,64)\ + SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4) SUBTRACT_m8n2(64,12,13,%1,%%r12,8) SUBTRACT_m8n2(72,14,15,%1,%%r12,8)\ + 
SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4) SUBTRACT_m8n2(80,12,13,%1,%%r12,8) SUBTRACT_m8n2(88,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(8,9,128)\ + SOLVE_leri_m8n2(104,10,11,%1,%%r12,4) SUBTRACT_m8n2(96,12,13,%1,%%r12,8) SUBTRACT_m8n2(104,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(120,10,11,%1,%%r12,4) SUBTRACT_m8n2(112,12,13,%1,%%r12,8) SUBTRACT_m8n2(120,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(10,11,192)\ + SOLVE_leri_m8n2(128,12,13,%1,%%r12,8) SUBTRACT_m8n2(136,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(144,12,13,%1,%%r12,8) SUBTRACT_m8n2(152,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(12,13,256)\ + SOLVE_leri_m8n2(168,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(184,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(14,15,320) + +#define SOLVE_RN_m4n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "movq %2,%3; addq $16,%2;"\ + SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1)\ + SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1)\ + SAVE_SOLUTION_m4n2(4,0)\ + SOLVE_leri_m4n2(40,5,%1)\ + SOLVE_ri_m4n2(56,5,%1)\ + SAVE_SOLUTION_m4n2(5,32) + +#define SOLVE_RN_m4n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "movq %2,%3; addq $16,%2;"\ + SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(4,0)\ + SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(5,32)\ + SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(6,64)\ + SOLVE_leri_m4n2(104,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(120,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(7,96) + +#define SOLVE_RN_m4n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) 
GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "movq %2,%3; addq $16,%2;"\ + SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4) SUBTRACT_m4n2(0,8,%1,%%r12,8) SUBTRACT_m4n2(8,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4) SUBTRACT_m4n2(16,8,%1,%%r12,8) SUBTRACT_m4n2(24,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(4,0)\ + SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4) SUBTRACT_m4n2(32,8,%1,%%r12,8) SUBTRACT_m4n2(40,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4) SUBTRACT_m4n2(48,8,%1,%%r12,8) SUBTRACT_m4n2(56,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(5,32)\ + SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4) SUBTRACT_m4n2(64,8,%1,%%r12,8) SUBTRACT_m4n2(72,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4) SUBTRACT_m4n2(80,8,%1,%%r12,8) SUBTRACT_m4n2(88,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(6,64)\ + SOLVE_leri_m4n2(104,7,%1,%%r12,4) SUBTRACT_m4n2(96,8,%1,%%r12,8) SUBTRACT_m4n2(104,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(120,7,%1,%%r12,4) SUBTRACT_m4n2(112,8,%1,%%r12,8) SUBTRACT_m4n2(120,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(7,96)\ + SOLVE_leri_m4n2(128,8,%1,%%r12,8) SUBTRACT_m4n2(136,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(144,8,%1,%%r12,8) SUBTRACT_m4n2(152,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(8,128)\ + SOLVE_leri_m4n2(168,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(184,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(9,160) + +#define SOLVE_RN_m2n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "movq %2,%3; addq $8,%2;"\ + SOLVE_col1_ltor_m2n4(0,4,5,%1)\ + SOLVE_col2_ltor_m2n4(16,4,5,%1)\ + SOLVE_col3_ltor_m2n4(32,4,5,%1)\ + SOLVE_col4_ltor_m2n4(48,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,0) + +#define SOLVE_RN_m2n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "movq %2,%3; addq $8,%2;"\ + SOLVE_col1_ltor_m2n4(0,4,5,%1) 
SUBTRACT_m2n4(0,6,7,%1,%%r12,4)\ + SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4)\ + SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4)\ + SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4)\ + SAVE_SOLUTION_m2n4(4,5,0)\ + SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4)\ + SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4)\ + SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4)\ + SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4)\ + SAVE_SOLUTION_m2n4(6,7,32) + +#define SOLVE_RN_m2n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "movq %2,%3; addq $8,%2;"\ + SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4) SUBTRACT_m2n4(0,8,9,%1,%%r12,8)\ + SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4) SUBTRACT_m2n4(16,8,9,%1,%%r12,8)\ + SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4) SUBTRACT_m2n4(32,8,9,%1,%%r12,8)\ + SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4) SUBTRACT_m2n4(48,8,9,%1,%%r12,8)\ + SAVE_SOLUTION_m2n4(4,5,0)\ + SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4) SUBTRACT_m2n4(64,8,9,%1,%%r12,8)\ + SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4) SUBTRACT_m2n4(80,8,9,%1,%%r12,8)\ + SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4) SUBTRACT_m2n4(96,8,9,%1,%%r12,8)\ + SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4) SUBTRACT_m2n4(112,8,9,%1,%%r12,8)\ + SAVE_SOLUTION_m2n4(6,7,32)\ + SOLVE_col1_ltor_m2n4(128,8,9,%1,%%r12,8)\ + SOLVE_col2_ltor_m2n4(144,8,9,%1,%%r12,8)\ + SOLVE_col3_ltor_m2n4(160,8,9,%1,%%r12,8)\ + SOLVE_col4_ltor_m2n4(176,8,9,%1,%%r12,8)\ + SAVE_SOLUTION_m2n4(8,9,64) + +#define SOLVE_RN_m1n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "movq %2,%3; addq $4,%2;"\ + SOLVE_col1_ltor_m1n4(0,4,%1)\ + SOLVE_col2_ltor_m1n4(16,4,%1)\ + SOLVE_col3_ltor_m1n4(32,4,%1)\ + SOLVE_col4_ltor_m1n4(48,4,%1)\ + SAVE_SOLUTION_m1n4(4,0) + +#define SOLVE_RN_m1n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "movq %2,%3; addq $4,%2;"\ + SOLVE_col1_ltor_m1n4(0,4,%1) 
SUBTRACT_m1n4(0,5,%1,%%r12,4)\ + SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4)\ + SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4)\ + SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4)\ + SAVE_SOLUTION_m1n4(4,0)\ + SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4)\ + SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4)\ + SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4)\ + SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4)\ + SAVE_SOLUTION_m1n4(5,16) + +#define SOLVE_RN_m1n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "movq %2,%3; addq $4,%2;"\ + SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4) SUBTRACT_m1n4(0,6,%1,%%r12,8)\ + SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4) SUBTRACT_m1n4(16,6,%1,%%r12,8)\ + SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4) SUBTRACT_m1n4(32,6,%1,%%r12,8)\ + SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4) SUBTRACT_m1n4(48,6,%1,%%r12,8)\ + SAVE_SOLUTION_m1n4(4,0)\ + SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4) SUBTRACT_m1n4(64,6,%1,%%r12,8)\ + SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4) SUBTRACT_m1n4(80,6,%1,%%r12,8)\ + SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4) SUBTRACT_m1n4(96,6,%1,%%r12,8)\ + SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4) SUBTRACT_m1n4(112,6,%1,%%r12,8)\ + SAVE_SOLUTION_m1n4(5,16)\ + SOLVE_col1_ltor_m1n4(128,6,%1,%%r12,8)\ + SOLVE_col2_ltor_m1n4(144,6,%1,%%r12,8)\ + SOLVE_col3_ltor_m1n4(160,6,%1,%%r12,8)\ + SOLVE_col4_ltor_m1n4(176,6,%1,%%r12,8)\ + SAVE_SOLUTION_m1n4(6,32) + +#define GEMM_RN_SIMPLE(mdim,ndim) \ + "movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ + "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ + "1"#mdim""#ndim"1:\n\t"\ + GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\ + "1"#mdim""#ndim"2:\n\t" +#define GEMM_RN_m8n4 GEMM_RN_SIMPLE(8,4) +#define GEMM_RN_m8n8 GEMM_RN_SIMPLE(8,8) +#define GEMM_RN_m8n12 \ + "movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; 
movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ + "cmpq $8,%5; jb 18122f;"\ + "18121:\n\t"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ + "18122:\n\t"\ + "testq %5,%5; jz 18124f;"\ + "18123:\n\t"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\ + "18124:\n\t" +#define GEMM_RN_m4n4 GEMM_RN_SIMPLE(4,4) +#define GEMM_RN_m4n8 GEMM_RN_SIMPLE(4,8) +#define GEMM_RN_m4n12 GEMM_RN_SIMPLE(4,12) +#define GEMM_RN_m2n4 GEMM_RN_SIMPLE(2,4) +#define GEMM_RN_m2n8 GEMM_RN_SIMPLE(2,8) +#define GEMM_RN_m2n12 GEMM_RN_SIMPLE(2,12) +#define GEMM_RN_m1n4 GEMM_RN_SIMPLE(1,4) +#define GEMM_RN_m1n8 GEMM_RN_SIMPLE(1,8) +#define GEMM_RN_m1n12 GEMM_RN_SIMPLE(1,12) + +#define COMPUTE(ndim) {\ + __asm__ __volatile__(\ + "movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\ + "cmpq $8,%%r11; jb "#ndim"772f;"\ + #ndim"771:\n\t"\ + GEMM_RN_m8n##ndim SOLVE_RN_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ + #ndim"772:\n\t"\ + "testq $4,%%r11; jz "#ndim"773f;"\ + GEMM_RN_m4n##ndim SOLVE_RN_m4n##ndim "subq $4,%%r11;"\ + #ndim"773:\n\t"\ + "testq $2,%%r11; jz "#ndim"774f;"\ + GEMM_RN_m2n##ndim SOLVE_RN_m2n##ndim "subq $2,%%r11;"\ + #ndim"774:\n\t"\ + "testq $1,%%r11; jz "#ndim"775f;"\ + GEMM_RN_m1n##ndim SOLVE_RN_m1n##ndim "subq $1,%%r11;"\ + #ndim"775:\n\t"\ + "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ + 
:"r11","r12","r13","r14","r15","cc","memory",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M; OFF += ndim;\ +} + +static void solve_RN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT a0, b0; + int i, j, k; + for (i=0; i7;m_count-=8){ + if(kk>0) GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); + a_ptr += k * 8; c_ptr += 8; + } + for(;m_count>3;m_count-=4){ + if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); + a_ptr += k * 4; c_ptr += 4; + } + for(;m_count>1;m_count-=2){ + if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); + a_ptr += k * 2; c_ptr += 2; + } + if(m_count>0){ + if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); + a_ptr += k * 1; c_ptr += 1; + } +} +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ + float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C; + float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; + float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; + uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)-offset, k_cnt = 0; + BLASLONG n_count = n; + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); b_ptr += 2*k; c_ptr += ldc*2; OFF+=2;} + if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); + return 0; +} diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c b/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c index ffcbfbbf06..9de3354de9 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c +++ 
b/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c @@ -1,281 +1,281 @@ -#include "common.h" -#include -#include "strsm_kernel_8x4_haswell_R_common.h" - -#define SOLVE_RT_m8n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ - SOLVE_rile_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ - SOLVE_le_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ - SAVE_SOLUTION_m8n2(6,7,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-48,4,5,%1)\ - SOLVE_le_m8n2(-64,4,5,%1)\ - SAVE_SOLUTION_m8n2(4,5,-128) - -#define SOLVE_RT_m8n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ - SOLVE_rile_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ - SOLVE_le_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ - SAVE_SOLUTION_m8n2(10,11,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ - SOLVE_le_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ - SAVE_SOLUTION_m8n2(8,9,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ - SOLVE_le_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ - SAVE_SOLUTION_m8n2(6,7,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-112,4,5,%1)\ - SOLVE_le_m8n2(-128,4,5,%1)\ - SAVE_SOLUTION_m8n2(4,5,-256) - -#define SOLVE_RT_m8n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ - SOLVE_rile_m8n2(-8,14,15,%1,%%r12,8) SUBTRACT_m8n2(-16,12,13,%1,%%r12,8) SUBTRACT_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ - SOLVE_le_m8n2(-24,14,15,%1,%%r12,8) 
SUBTRACT_m8n2(-32,12,13,%1,%%r12,8) SUBTRACT_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ - SAVE_SOLUTION_m8n2(14,15,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-48,12,13,%1,%%r12,8) SUBTRACT_m8n2(-40,10,11,%1,%%r12,4) SUBTRACT_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ - SOLVE_le_m8n2(-64,12,13,%1,%%r12,8) SUBTRACT_m8n2(-56,10,11,%1,%%r12,4) SUBTRACT_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ - SAVE_SOLUTION_m8n2(12,13,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-72,10,11,%1,%%r12,4) SUBTRACT_m8n2(-80,8,9,%1,%%r12,4) SUBTRACT_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ - SOLVE_le_m8n2(-88,10,11,%1,%%r12,4) SUBTRACT_m8n2(-96,8,9,%1,%%r12,4) SUBTRACT_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ - SAVE_SOLUTION_m8n2(10,11,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-112,8,9,%1,%%r12,4) SUBTRACT_m8n2(-104,6,7,%1) SUBTRACT_m8n2(-112,4,5,%1)\ - SOLVE_le_m8n2(-128,8,9,%1,%%r12,4) SUBTRACT_m8n2(-120,6,7,%1) SUBTRACT_m8n2(-128,4,5,%1)\ - SAVE_SOLUTION_m8n2(8,9,-256) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-136,6,7,%1) SUBTRACT_m8n2(-144,4,5,%1)\ - SOLVE_le_m8n2(-152,6,7,%1) SUBTRACT_m8n2(-160,4,5,%1)\ - SAVE_SOLUTION_m8n2(6,7,-320) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-176,4,5,%1)\ - SOLVE_le_m8n2(-192,4,5,%1)\ - SAVE_SOLUTION_m8n2(4,5,-384) - -#define SOLVE_RT_m4n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ - SOLVE_rile_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ - SOLVE_le_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ - SAVE_SOLUTION_m4n2(5,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-48,4,%1)\ - SOLVE_le_m4n2(-64,4,%1)\ - SAVE_SOLUTION_m4n2(4,-64) - -#define SOLVE_RT_m4n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "negq %4; 
leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ - SOLVE_rile_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ - SOLVE_le_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ - SAVE_SOLUTION_m4n2(7,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ - SOLVE_le_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ - SAVE_SOLUTION_m4n2(6,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ - SOLVE_le_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ - SAVE_SOLUTION_m4n2(5,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-112,4,%1)\ - SOLVE_le_m4n2(-128,4,%1)\ - SAVE_SOLUTION_m4n2(4,-128) - -#define SOLVE_RT_m4n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ - SOLVE_rile_m4n2(-8,9,%1,%%r12,8) SUBTRACT_m4n2(-16,8,%1,%%r12,8) SUBTRACT_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ - SOLVE_le_m4n2(-24,9,%1,%%r12,8) SUBTRACT_m4n2(-32,8,%1,%%r12,8) SUBTRACT_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ - SAVE_SOLUTION_m4n2(9,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-48,8,%1,%%r12,8) SUBTRACT_m4n2(-40,7,%1,%%r12,4) SUBTRACT_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ - SOLVE_le_m4n2(-64,8,%1,%%r12,8) SUBTRACT_m4n2(-56,7,%1,%%r12,4) SUBTRACT_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ - SAVE_SOLUTION_m4n2(8,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-72,7,%1,%%r12,4) SUBTRACT_m4n2(-80,6,%1,%%r12,4) SUBTRACT_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ - SOLVE_le_m4n2(-88,7,%1,%%r12,4) 
SUBTRACT_m4n2(-96,6,%1,%%r12,4) SUBTRACT_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ - SAVE_SOLUTION_m4n2(7,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-112,6,%1,%%r12,4) SUBTRACT_m4n2(-104,5,%1) SUBTRACT_m4n2(-112,4,%1)\ - SOLVE_le_m4n2(-128,6,%1,%%r12,4) SUBTRACT_m4n2(-120,5,%1) SUBTRACT_m4n2(-128,4,%1)\ - SAVE_SOLUTION_m4n2(6,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-136,5,%1) SUBTRACT_m4n2(-144,4,%1)\ - SOLVE_le_m4n2(-152,5,%1) SUBTRACT_m4n2(-160,4,%1)\ - SAVE_SOLUTION_m4n2(5,-160) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-176,4,%1)\ - SOLVE_le_m4n2(-192,4,%1)\ - SAVE_SOLUTION_m4n2(4,-192) - -#define SOLVE_RT_m2n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ - SOLVE_col4_rtol_m2n4(-16,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-32,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-48,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-64,4,5,%1)\ - SAVE_SOLUTION_m2n4(4,5,-32) - -#define SOLVE_RT_m2n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ - SOLVE_col4_rtol_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ - SAVE_SOLUTION_m2n4(6,7,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m2n4(-80,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-96,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-112,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-128,4,5,%1)\ - SAVE_SOLUTION_m2n4(4,5,-64) - -#define SOLVE_RT_m2n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ - SOLVE_col4_rtol_m2n4(-16,8,9,%1,%%r12,8) SUBTRACT_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-32,8,9,%1,%%r12,8) SUBTRACT_m2n4(-32,6,7,%1,%%r12,4) 
SUBTRACT_m2n4(-32,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-48,8,9,%1,%%r12,8) SUBTRACT_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-64,8,9,%1,%%r12,8) SUBTRACT_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ - SAVE_SOLUTION_m2n4(8,9,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m2n4(-80,6,7,%1,%%r12,4) SUBTRACT_m2n4(-80,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-96,6,7,%1,%%r12,4) SUBTRACT_m2n4(-96,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-112,6,7,%1,%%r12,4) SUBTRACT_m2n4(-112,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-128,6,7,%1,%%r12,4) SUBTRACT_m2n4(-128,4,5,%1)\ - SAVE_SOLUTION_m2n4(6,7,-64) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m2n4(-144,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-160,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-176,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-192,4,5,%1)\ - SAVE_SOLUTION_m2n4(4,5,-96) - -#define SOLVE_RT_m1n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ - SOLVE_col4_rtol_m1n4(-16,4,%1)\ - SOLVE_col3_rtol_m1n4(-32,4,%1)\ - SOLVE_col2_rtol_m1n4(-48,4,%1)\ - SOLVE_col1_rtol_m1n4(-64,4,%1)\ - SAVE_SOLUTION_m1n4(4,-16) - -#define SOLVE_RT_m1n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ - SOLVE_col4_rtol_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ - SOLVE_col3_rtol_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ - SOLVE_col2_rtol_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ - SOLVE_col1_rtol_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ - SAVE_SOLUTION_m1n4(5,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m1n4(-80,4,%1)\ - SOLVE_col3_rtol_m1n4(-96,4,%1)\ - SOLVE_col2_rtol_m1n4(-112,4,%1)\ - SOLVE_col1_rtol_m1n4(-128,4,%1)\ - SAVE_SOLUTION_m1n4(4,-32) - -#define SOLVE_RT_m1n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ - SOLVE_col4_rtol_m1n4(-16,6,%1,%%r12,8) 
SUBTRACT_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ - SOLVE_col3_rtol_m1n4(-32,6,%1,%%r12,8) SUBTRACT_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ - SOLVE_col2_rtol_m1n4(-48,6,%1,%%r12,8) SUBTRACT_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ - SOLVE_col1_rtol_m1n4(-64,6,%1,%%r12,8) SUBTRACT_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ - SAVE_SOLUTION_m1n4(6,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m1n4(-80,5,%1,%%r12,4) SUBTRACT_m1n4(-80,4,%1)\ - SOLVE_col3_rtol_m1n4(-96,5,%1,%%r12,4) SUBTRACT_m1n4(-96,4,%1)\ - SOLVE_col2_rtol_m1n4(-112,5,%1,%%r12,4) SUBTRACT_m1n4(-112,4,%1)\ - SOLVE_col1_rtol_m1n4(-128,5,%1,%%r12,4) SUBTRACT_m1n4(-128,4,%1)\ - SAVE_SOLUTION_m1n4(5,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m1n4(-144,4,%1)\ - SOLVE_col3_rtol_m1n4(-160,4,%1)\ - SOLVE_col2_rtol_m1n4(-176,4,%1)\ - SOLVE_col1_rtol_m1n4(-192,4,%1)\ - SAVE_SOLUTION_m1n4(4,-48) - -/* r14 = b_tail, r15 = a_tail, r13 = k-kk */ -#define GEMM_RT_SIMPLE(mdim,ndim) \ - "leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ - "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ - "1"#mdim""#ndim"1:\n\t"\ - "subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 1"#mdim""#ndim"1b;"\ - "1"#mdim""#ndim"2:\n\t" -#define GEMM_RT_m8n4 GEMM_RT_SIMPLE(8,4) -#define GEMM_RT_m8n8 GEMM_RT_SIMPLE(8,8) -#define GEMM_RT_m8n12 \ - "leaq (%%r15,%%r12,8),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ - "cmpq $8,%5; jb 18122f;"\ - "18121:\n\t"\ - "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - 
"subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ - "18122:\n\t"\ - "testq %5,%5; jz 18124f;"\ - "18123:\n\t"\ - "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 18123b;"\ - "18124:\n\t" -#define GEMM_RT_m4n4 GEMM_RT_SIMPLE(4,4) -#define GEMM_RT_m4n8 GEMM_RT_SIMPLE(4,8) -#define GEMM_RT_m4n12 GEMM_RT_SIMPLE(4,12) -#define GEMM_RT_m2n4 GEMM_RT_SIMPLE(2,4) -#define GEMM_RT_m2n8 GEMM_RT_SIMPLE(2,8) -#define GEMM_RT_m2n12 GEMM_RT_SIMPLE(2,12) -#define GEMM_RT_m1n4 GEMM_RT_SIMPLE(1,4) -#define GEMM_RT_m1n8 GEMM_RT_SIMPLE(1,8) -#define GEMM_RT_m1n12 GEMM_RT_SIMPLE(1,12) - -#define COMPUTE(ndim) {\ - b_ptr -= (ndim-4)*K; c_ptr -= ndim * ldc;\ - __asm__ __volatile__(\ - "movq %0,%%r15; movq %6,%%r13; subq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %1,%%r14; movq %10,%%r11;"\ - "cmpq $8,%%r11; jb "#ndim"772f;"\ - #ndim"771:\n\t"\ - GEMM_RT_m8n##ndim SOLVE_RT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ - #ndim"772:\n\t"\ - "testq $4,%%r11; jz "#ndim"773f;"\ - GEMM_RT_m4n##ndim SOLVE_RT_m4n##ndim "subq $4,%%r11;"\ - #ndim"773:\n\t"\ - "testq $2,%%r11; jz "#ndim"774f;"\ - GEMM_RT_m2n##ndim SOLVE_RT_m2n##ndim "subq $2,%%r11;"\ - #ndim"774:\n\t"\ - "testq $1,%%r11; jz "#ndim"775f;"\ - GEMM_RT_m1n##ndim SOLVE_RT_m1n##ndim "subq $1,%%r11;"\ - #ndim"775:\n\t"\ - "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ - :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ - :"r11","r12","r13","r14","r15","cc","memory",\ - "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ - a_ptr -= M * K; b_ptr -= 4 * K; c_ptr -= M; OFF -= ndim;\ -} - -static void solve_RT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc){ - FLOAT a0, b0; - int i, j, k; - for (i=n-1;i>=0;i--) { - b0 = b[i*n+i]; - for (j=0;j7;m_count-=8){ - if(k-kk>0) 
GEMM_KERNEL_N(8,n,k-kk,-1.0,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); - solve_RT(8,n,a_ptr+(kk-n)*8,sb+(kk-n)*n,c_ptr,ldc); - a_ptr += k * 8; c_ptr += 8; - } - for(;m_count>3;m_count-=4){ - if(k-kk>0) GEMM_KERNEL_N(4,n,k-kk,-1.0,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); - solve_RT(4,n,a_ptr+(kk-n)*4,sb+(kk-n)*n,c_ptr,ldc); - a_ptr += k * 4; c_ptr += 4; - } - for(;m_count>1;m_count-=2){ - if(k-kk>0) GEMM_KERNEL_N(2,n,k-kk,-1.0,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); - solve_RT(2,n,a_ptr+(kk-n)*2,sb+(kk-n)*n,c_ptr,ldc); - a_ptr += k * 2; c_ptr += 2; - } - if(m_count>0){ - if(k-kk>0) GEMM_KERNEL_N(1,n,k-kk,-1.0,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); - solve_RT(1,n,a_ptr+(kk-n)*1,sb+(kk-n)*n,c_ptr,ldc); - a_ptr += k * 1; c_ptr += 1; - } -} -int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ - float *a_ptr = sa, *b_ptr = sb+n*k, *c_ptr = C+n*ldc, *c_tmp = C; - float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; - float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; - uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)(n-offset), k_cnt = 0; - BLASLONG n_count = n; - if(n&1){b_ptr-=k; c_ptr-=ldc; COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF--; n_count--;} - if(n&2){b_ptr-=k*2; c_ptr-=ldc*2; COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF-=2; n_count-=2;} - for(;n_count>11;n_count-=12) COMPUTE(12) - for(;n_count>7;n_count-=8) COMPUTE(8) - for(;n_count>3;n_count-=4) COMPUTE(4) - return 0; -} +#include "common.h" +#include +#include "strsm_kernel_8x4_haswell_R_common.h" + +#define SOLVE_RT_m8n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ + SOLVE_rile_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ + SOLVE_le_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ + SAVE_SOLUTION_m8n2(6,7,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-48,4,5,%1)\ + SOLVE_le_m8n2(-64,4,5,%1)\ + SAVE_SOLUTION_m8n2(4,5,-128) + 
+#define SOLVE_RT_m8n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ + SOLVE_rile_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ + SOLVE_le_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ + SAVE_SOLUTION_m8n2(10,11,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ + SOLVE_le_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ + SAVE_SOLUTION_m8n2(8,9,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ + SOLVE_le_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ + SAVE_SOLUTION_m8n2(6,7,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-112,4,5,%1)\ + SOLVE_le_m8n2(-128,4,5,%1)\ + SAVE_SOLUTION_m8n2(4,5,-256) + +#define SOLVE_RT_m8n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ + SOLVE_rile_m8n2(-8,14,15,%1,%%r12,8) SUBTRACT_m8n2(-16,12,13,%1,%%r12,8) SUBTRACT_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ + SOLVE_le_m8n2(-24,14,15,%1,%%r12,8) SUBTRACT_m8n2(-32,12,13,%1,%%r12,8) SUBTRACT_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ + SAVE_SOLUTION_m8n2(14,15,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-48,12,13,%1,%%r12,8) SUBTRACT_m8n2(-40,10,11,%1,%%r12,4) SUBTRACT_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ + SOLVE_le_m8n2(-64,12,13,%1,%%r12,8) SUBTRACT_m8n2(-56,10,11,%1,%%r12,4) SUBTRACT_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) 
SUBTRACT_m8n2(-64,4,5,%1)\ + SAVE_SOLUTION_m8n2(12,13,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-72,10,11,%1,%%r12,4) SUBTRACT_m8n2(-80,8,9,%1,%%r12,4) SUBTRACT_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ + SOLVE_le_m8n2(-88,10,11,%1,%%r12,4) SUBTRACT_m8n2(-96,8,9,%1,%%r12,4) SUBTRACT_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ + SAVE_SOLUTION_m8n2(10,11,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-112,8,9,%1,%%r12,4) SUBTRACT_m8n2(-104,6,7,%1) SUBTRACT_m8n2(-112,4,5,%1)\ + SOLVE_le_m8n2(-128,8,9,%1,%%r12,4) SUBTRACT_m8n2(-120,6,7,%1) SUBTRACT_m8n2(-128,4,5,%1)\ + SAVE_SOLUTION_m8n2(8,9,-256) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-136,6,7,%1) SUBTRACT_m8n2(-144,4,5,%1)\ + SOLVE_le_m8n2(-152,6,7,%1) SUBTRACT_m8n2(-160,4,5,%1)\ + SAVE_SOLUTION_m8n2(6,7,-320) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-176,4,5,%1)\ + SOLVE_le_m8n2(-192,4,5,%1)\ + SAVE_SOLUTION_m8n2(4,5,-384) + +#define SOLVE_RT_m4n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ + SOLVE_rile_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ + SOLVE_le_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ + SAVE_SOLUTION_m4n2(5,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-48,4,%1)\ + SOLVE_le_m4n2(-64,4,%1)\ + SAVE_SOLUTION_m4n2(4,-64) + +#define SOLVE_RT_m4n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ + SOLVE_rile_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ + SOLVE_le_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ + SAVE_SOLUTION_m4n2(7,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ + SOLVE_le_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ + 
SAVE_SOLUTION_m4n2(6,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ + SOLVE_le_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ + SAVE_SOLUTION_m4n2(5,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-112,4,%1)\ + SOLVE_le_m4n2(-128,4,%1)\ + SAVE_SOLUTION_m4n2(4,-128) + +#define SOLVE_RT_m4n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ + SOLVE_rile_m4n2(-8,9,%1,%%r12,8) SUBTRACT_m4n2(-16,8,%1,%%r12,8) SUBTRACT_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ + SOLVE_le_m4n2(-24,9,%1,%%r12,8) SUBTRACT_m4n2(-32,8,%1,%%r12,8) SUBTRACT_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ + SAVE_SOLUTION_m4n2(9,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-48,8,%1,%%r12,8) SUBTRACT_m4n2(-40,7,%1,%%r12,4) SUBTRACT_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ + SOLVE_le_m4n2(-64,8,%1,%%r12,8) SUBTRACT_m4n2(-56,7,%1,%%r12,4) SUBTRACT_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ + SAVE_SOLUTION_m4n2(8,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-72,7,%1,%%r12,4) SUBTRACT_m4n2(-80,6,%1,%%r12,4) SUBTRACT_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ + SOLVE_le_m4n2(-88,7,%1,%%r12,4) SUBTRACT_m4n2(-96,6,%1,%%r12,4) SUBTRACT_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ + SAVE_SOLUTION_m4n2(7,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-112,6,%1,%%r12,4) SUBTRACT_m4n2(-104,5,%1) SUBTRACT_m4n2(-112,4,%1)\ + SOLVE_le_m4n2(-128,6,%1,%%r12,4) SUBTRACT_m4n2(-120,5,%1) SUBTRACT_m4n2(-128,4,%1)\ + SAVE_SOLUTION_m4n2(6,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-136,5,%1) SUBTRACT_m4n2(-144,4,%1)\ + SOLVE_le_m4n2(-152,5,%1) SUBTRACT_m4n2(-160,4,%1)\ + SAVE_SOLUTION_m4n2(5,-160) 
"negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-176,4,%1)\ + SOLVE_le_m4n2(-192,4,%1)\ + SAVE_SOLUTION_m4n2(4,-192) + +#define SOLVE_RT_m2n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ + SOLVE_col4_rtol_m2n4(-16,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-32,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-48,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-64,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,-32) + +#define SOLVE_RT_m2n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ + SOLVE_col4_rtol_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ + SAVE_SOLUTION_m2n4(6,7,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m2n4(-80,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-96,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-112,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-128,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,-64) + +#define SOLVE_RT_m2n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ + SOLVE_col4_rtol_m2n4(-16,8,9,%1,%%r12,8) SUBTRACT_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-32,8,9,%1,%%r12,8) SUBTRACT_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-48,8,9,%1,%%r12,8) SUBTRACT_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-64,8,9,%1,%%r12,8) SUBTRACT_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ + SAVE_SOLUTION_m2n4(8,9,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m2n4(-80,6,7,%1,%%r12,4) SUBTRACT_m2n4(-80,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-96,6,7,%1,%%r12,4) SUBTRACT_m2n4(-96,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-112,6,7,%1,%%r12,4) SUBTRACT_m2n4(-112,4,5,%1)\ + 
SOLVE_col1_rtol_m2n4(-128,6,7,%1,%%r12,4) SUBTRACT_m2n4(-128,4,5,%1)\ + SAVE_SOLUTION_m2n4(6,7,-64) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m2n4(-144,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-160,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-176,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-192,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,-96) + +#define SOLVE_RT_m1n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ + SOLVE_col4_rtol_m1n4(-16,4,%1)\ + SOLVE_col3_rtol_m1n4(-32,4,%1)\ + SOLVE_col2_rtol_m1n4(-48,4,%1)\ + SOLVE_col1_rtol_m1n4(-64,4,%1)\ + SAVE_SOLUTION_m1n4(4,-16) + +#define SOLVE_RT_m1n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ + SOLVE_col4_rtol_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ + SOLVE_col3_rtol_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ + SOLVE_col2_rtol_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ + SOLVE_col1_rtol_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ + SAVE_SOLUTION_m1n4(5,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m1n4(-80,4,%1)\ + SOLVE_col3_rtol_m1n4(-96,4,%1)\ + SOLVE_col2_rtol_m1n4(-112,4,%1)\ + SOLVE_col1_rtol_m1n4(-128,4,%1)\ + SAVE_SOLUTION_m1n4(4,-32) + +#define SOLVE_RT_m1n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ + SOLVE_col4_rtol_m1n4(-16,6,%1,%%r12,8) SUBTRACT_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ + SOLVE_col3_rtol_m1n4(-32,6,%1,%%r12,8) SUBTRACT_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ + SOLVE_col2_rtol_m1n4(-48,6,%1,%%r12,8) SUBTRACT_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ + SOLVE_col1_rtol_m1n4(-64,6,%1,%%r12,8) SUBTRACT_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ + SAVE_SOLUTION_m1n4(6,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m1n4(-80,5,%1,%%r12,4) SUBTRACT_m1n4(-80,4,%1)\ + SOLVE_col3_rtol_m1n4(-96,5,%1,%%r12,4) 
SUBTRACT_m1n4(-96,4,%1)\ + SOLVE_col2_rtol_m1n4(-112,5,%1,%%r12,4) SUBTRACT_m1n4(-112,4,%1)\ + SOLVE_col1_rtol_m1n4(-128,5,%1,%%r12,4) SUBTRACT_m1n4(-128,4,%1)\ + SAVE_SOLUTION_m1n4(5,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m1n4(-144,4,%1)\ + SOLVE_col3_rtol_m1n4(-160,4,%1)\ + SOLVE_col2_rtol_m1n4(-176,4,%1)\ + SOLVE_col1_rtol_m1n4(-192,4,%1)\ + SAVE_SOLUTION_m1n4(4,-48) + +/* r14 = b_tail, r15 = a_tail, r13 = k-kk */ +#define GEMM_RT_SIMPLE(mdim,ndim) \ + "leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ + "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ + "1"#mdim""#ndim"1:\n\t"\ + "subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 1"#mdim""#ndim"1b;"\ + "1"#mdim""#ndim"2:\n\t" +#define GEMM_RT_m8n4 GEMM_RT_SIMPLE(8,4) +#define GEMM_RT_m8n8 GEMM_RT_SIMPLE(8,8) +#define GEMM_RT_m8n12 \ + "leaq (%%r15,%%r12,8),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ + "cmpq $8,%5; jb 18122f;"\ + "18121:\n\t"\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ + "18122:\n\t"\ + "testq %5,%5; jz 18124f;"\ + "18123:\n\t"\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 18123b;"\ + "18124:\n\t" +#define GEMM_RT_m4n4 GEMM_RT_SIMPLE(4,4) +#define GEMM_RT_m4n8 GEMM_RT_SIMPLE(4,8) +#define GEMM_RT_m4n12 GEMM_RT_SIMPLE(4,12) +#define GEMM_RT_m2n4 GEMM_RT_SIMPLE(2,4) +#define GEMM_RT_m2n8 GEMM_RT_SIMPLE(2,8) +#define GEMM_RT_m2n12 GEMM_RT_SIMPLE(2,12) +#define GEMM_RT_m1n4 
GEMM_RT_SIMPLE(1,4) +#define GEMM_RT_m1n8 GEMM_RT_SIMPLE(1,8) +#define GEMM_RT_m1n12 GEMM_RT_SIMPLE(1,12) + +#define COMPUTE(ndim) {\ + b_ptr -= (ndim-4)*K; c_ptr -= ndim * ldc;\ + __asm__ __volatile__(\ + "movq %0,%%r15; movq %6,%%r13; subq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %1,%%r14; movq %10,%%r11;"\ + "cmpq $8,%%r11; jb "#ndim"772f;"\ + #ndim"771:\n\t"\ + GEMM_RT_m8n##ndim SOLVE_RT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ + #ndim"772:\n\t"\ + "testq $4,%%r11; jz "#ndim"773f;"\ + GEMM_RT_m4n##ndim SOLVE_RT_m4n##ndim "subq $4,%%r11;"\ + #ndim"773:\n\t"\ + "testq $2,%%r11; jz "#ndim"774f;"\ + GEMM_RT_m2n##ndim SOLVE_RT_m2n##ndim "subq $2,%%r11;"\ + #ndim"774:\n\t"\ + "testq $1,%%r11; jz "#ndim"775f;"\ + GEMM_RT_m1n##ndim SOLVE_RT_m1n##ndim "subq $1,%%r11;"\ + #ndim"775:\n\t"\ + "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ + :"r11","r12","r13","r14","r15","cc","memory",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr -= M * K; b_ptr -= 4 * K; c_ptr -= M; OFF -= ndim;\ +} + +static void solve_RT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc){ + FLOAT a0, b0; + int i, j, k; + for (i=n-1;i>=0;i--) { + b0 = b[i*n+i]; + for (j=0;j7;m_count-=8){ + if(k-kk>0) GEMM_KERNEL_N(8,n,k-kk,-1.0,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); + solve_RT(8,n,a_ptr+(kk-n)*8,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 8; c_ptr += 8; + } + for(;m_count>3;m_count-=4){ + if(k-kk>0) GEMM_KERNEL_N(4,n,k-kk,-1.0,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); + solve_RT(4,n,a_ptr+(kk-n)*4,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 4; c_ptr += 4; + } + for(;m_count>1;m_count-=2){ + if(k-kk>0) GEMM_KERNEL_N(2,n,k-kk,-1.0,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); + solve_RT(2,n,a_ptr+(kk-n)*2,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 2; c_ptr += 2; + } + if(m_count>0){ + if(k-kk>0) 
GEMM_KERNEL_N(1,n,k-kk,-1.0,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); + solve_RT(1,n,a_ptr+(kk-n)*1,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 1; c_ptr += 1; + } +} +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ + float *a_ptr = sa, *b_ptr = sb+n*k, *c_ptr = C+n*ldc, *c_tmp = C; + float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; + float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; + uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)(n-offset), k_cnt = 0; + BLASLONG n_count = n; + if(n&1){b_ptr-=k; c_ptr-=ldc; COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF--; n_count--;} + if(n&2){b_ptr-=k*2; c_ptr-=ldc*2; COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF-=2; n_count-=2;} + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + return 0; +} diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h b/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h index 36b7aa1a38..970d63578f 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h @@ -1,226 +1,226 @@ -/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ -/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ -/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ - -#define init_m8n4(c1,c2,c3,c4)\ - "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\ - "vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" -#define INIT_m8n4 init_m8n4(4,5,6,7) -#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) -#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) - -#define init_m4n4(c1,c2,c3,c4)\ - "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor 
%%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\ - "vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" -#define INIT_m4n4 init_m4n4(4,5,6,7) -#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) -#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) - -#define init_m2n4(c1,c2)\ - "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" -#define INIT_m2n4 init_m2n4(4,5) -#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) -#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) - -#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" -#define INIT_m1n4 init_m1n4(4) -#define INIT_m1n8 INIT_m1n4 init_m1n4(5) -#define INIT_m1n12 INIT_m1n8 init_m1n4(6) - -#define GEMM_KERNEL_k1m8n4 \ - "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ - "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ - "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" -#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ - "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ - "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" -#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ - "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ - "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" - -#define GEMM_KERNEL_k1m4n4 \ - "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\ - "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ - "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" -#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ - "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ - "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps 
%%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" -#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ - "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ - "vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" - -#define GEMM_KERNEL_k1m2n4 \ - "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ - "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" -#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ - "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" -#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ - "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" - -#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" -#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" -#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" - -#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ - "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ - "vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\ - "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ - "vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";" - -#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ - "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ - "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps 
%%xmm1,%%xmm0,%%xmm3;"\ - "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ - "vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\ - "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ - "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ - "vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\ - "vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\ - "vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";" - -#define GEMM_SUM_REORDER_2x4(c1,c2)\ - "vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\ - "vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\ - "vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\ - -#define GEMM_SUM_REORDER_1x4(c1)\ - "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ - "vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ - "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" - -#define SOLVE_le_m4n2(b_off,c1,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ - "vmovsldup %%ymm"#c1",%%ymm1;" - -#define SOLVE_le_m8n2(b_off,c1,c2,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ - "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" - -#define SOLVE_leri_m4n2(b_off,c1,...) SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\ - "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" - -#define SOLVE_leri_m8n2(b_off,c1,c2,...) 
SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" - -#define SOLVE_ri_m4n2(b_off,c1,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ - "vmovshdup %%ymm"#c1",%%ymm1;" - -#define SOLVE_ri_m8n2(b_off,c1,c2,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ - "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" - -#define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\ - "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" - -#define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" - -#define SOLVE_col1_rtol_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $0,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) 
SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col2_mul_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $85,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col2_ltor_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col3_mul_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $170,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col3_rtol_m1n4(b_off,c1,...) 
SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col4_ltor_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $255,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" - -#define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" - -#define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SUBTRACT_m2n4(b_off,c1,c2,...) 
SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SAVE_SOLUTION_m8n2(c1,c2,a_off)\ - "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ - "vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\ - "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\ - "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;" - -#define SAVE_SOLUTION_m4n2(c1,a_off)\ - "vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\ - "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" - -#define SAVE_SOLUTION_m2n4(c1,c2,a_off)\ - "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;" - -#define SAVE_SOLUTION_m1n4(c1,a_off)\ - "vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" +/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ +/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ +/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ + +#define init_m8n4(c1,c2,c3,c4)\ + "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\ + "vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" +#define INIT_m8n4 init_m8n4(4,5,6,7) +#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) +#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) + +#define init_m4n4(c1,c2,c3,c4)\ + "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor 
%%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\ + "vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" +#define INIT_m4n4 init_m4n4(4,5,6,7) +#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) +#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) + +#define init_m2n4(c1,c2)\ + "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" +#define INIT_m2n4 init_m2n4(4,5) +#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) +#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) + +#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" +#define INIT_m1n4 init_m1n4(4) +#define INIT_m1n8 INIT_m1n4 init_m1n4(5) +#define INIT_m1n12 INIT_m1n8 init_m1n4(6) + +#define GEMM_KERNEL_k1m8n4 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ + "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ + "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" +#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ + "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ + "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" +#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ + "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ + "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" + +#define GEMM_KERNEL_k1m4n4 \ + "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\ + "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ + "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" +#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ + "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ + "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps 
%%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" +#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ + "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ + "vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" + +#define GEMM_KERNEL_k1m2n4 \ + "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ + "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" +#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ + "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" +#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ + "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" + +#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" +#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" +#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" + +#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ + "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ + "vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\ + "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ + "vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";" + +#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ + "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ + "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps 
%%xmm1,%%xmm0,%%xmm3;"\ + "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ + "vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\ + "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ + "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ + "vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\ + "vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\ + "vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";" + +#define GEMM_SUM_REORDER_2x4(c1,c2)\ + "vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\ + "vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\ + "vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\ + +#define GEMM_SUM_REORDER_1x4(c1)\ + "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ + "vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ + "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" + +#define SOLVE_le_m4n2(b_off,c1,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ + "vmovsldup %%ymm"#c1",%%ymm1;" + +#define SOLVE_le_m8n2(b_off,c1,c2,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ + "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" + +#define SOLVE_leri_m4n2(b_off,c1,...) SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\ + "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SOLVE_leri_m8n2(b_off,c1,c2,...) 
SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SOLVE_ri_m4n2(b_off,c1,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ + "vmovshdup %%ymm"#c1",%%ymm1;" + +#define SOLVE_ri_m8n2(b_off,c1,c2,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ + "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" + +#define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SOLVE_col1_rtol_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $0,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) 
SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col2_mul_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $85,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col2_ltor_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col3_mul_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $170,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col3_rtol_m1n4(b_off,c1,...) 
SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col4_ltor_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $255,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SUBTRACT_m2n4(b_off,c1,c2,...) 
SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SAVE_SOLUTION_m8n2(c1,c2,a_off)\ + "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ + "vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\ + "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\ + "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m4n2(c1,a_off)\ + "vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\ + "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m2n4(c1,c2,a_off)\ + "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m1n4(c1,a_off)\ + "vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index c52575d078..27397ccfa1 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -25,9 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif #if defined(BULLDOZER) #include "zdot_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S b/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S index 94e2f61174..6c8b4c8722 100644 --- a/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S +++ b/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S @@ -1,1404 +1,1404 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl 
$0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfmaddpd -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfmaddpd -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfnmaddpd -#else -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfnmaddpd -#endif - - -#define A_PR1 384 -#define B_PR1 192 - -#define KERNEL2x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R 
%xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $16, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 
-5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $8 , %rax 
;\ - - -#define KERNEL1x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $2, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 3 * SIZE(BO, BI, 
SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $2, BI ;\ - addq $4, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $2, %rax ;\ - - 
-/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - 
leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L2_40 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - jmp 
.L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL2x2_SUB(xxx) - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - vshufpd $0x01, %xmm14, %xmm14, %xmm15 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, %xmm15 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - - 
-#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L2_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_46 - movq %rax, BI // Index 
for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, 
%xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L1_40 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - 
leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL2x1_SUB(xxx) - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , 
%xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13,%xmm12 , %xmm12 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - -#else - vaddsubpd %xmm8, %xmm9 , %xmm9 - vaddsubpd %xmm12,%xmm13, %xmm13 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm13, %xmm1, %xmm13 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13, %xmm12, %xmm12 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L1_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO 
// first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || 
defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8, %xmm8 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - - vmovapd %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - - vaddsubpd %xmm9 ,%xmm8, %xmm8 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfmaddpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfmaddpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfnmaddpd +#else +#define VFMADD_R vfnmaddpd +#define VFMADD_I 
vfnmaddpd +#endif + + +#define A_PR1 384 +#define B_PR1 192 + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define 
KERNEL2x2_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 
;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8 , %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * 
SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * 
SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq 
STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L2_40 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL2x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, 
%xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + vshufpd $0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_11 + ALIGN_4 + + 
+/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp 
.L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset 
+= 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L1_40 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of 
values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL2x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, 
%xmm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L1_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, 
SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number 
of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_2x2_piledriver.S b/kernel/x86_64/zgemm_kernel_2x2_piledriver.S index 848b6f2371..bffe5439db 100644 --- a/kernel/x86_64/zgemm_kernel_2x2_piledriver.S +++ b/kernel/x86_64/zgemm_kernel_2x2_piledriver.S @@ -1,1429 +1,1429 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/********************************************************************* -* -* 2014/06/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/10/30 Saar -* -* Parameter: -* UNROLL_M 2 -* UNROLL_N 2 -* ZGEMM_P 384 -* ZGEMM_Q 168 -* A_PR1 512 -* B_PR1 256 -* -* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): -* -* 3456x3456 82.4 GFLOPS with 8 threads on 4 modules (ACML: 76.3 ) (BULLDOZER: 81.0 ) -* 3456x3456 79.9 GFLOPS with 4 threads on 4 modules (ACML: 69.9 ) (BULLDOZER: 74.6 ) -* 3456x3456 40.4 GFLOPS with 2 threads on 2 modules (ACML: 35.8 ) (BULLDOZER: 37.9 ) -* 3456x3456 20.3 GFLOPS with 1 threads on 1 modules (ACML: 18.1 ) (BULLDOZER: 19.2 ) -* -* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): -* -* 6912x6912 227.5 GFLOPS with 32 threads on 16 modules (ACML: 166.3 ) (BULLDOZER: 228.5 ) -* 6912x6912 211.6 GFLOPS with 16 threads on 16 modules (ACML: 169.5 ) (BULLDOZER: 204.3 ) -* 6912x6912 123.5 GFLOPS with 8 threads on 8 modules (ACML: 92.7 ) (BULLDOZER: 117.0 ) -* 3456x3456 64.1 GFLOPS with 4 threads on 4 modules (ACML: 49.1 ) (BULLDOZER: 61.7 ) -* 3456x3456 33.4 GFLOPS with 2 threads on 2 modules 
(ACML: 28.1 ) (BULLDOZER: 30.9 ) -* 3456x3456 17.0 GFLOPS with 1 threads on 1 modules (ACML: 15.2 ) (BULLDOZER: 15.7 ) -* -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 256*8*4 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfmaddpd -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfmaddpd -#elif defined(NR) || defined(NC) || defined(TR) || 
defined(TC) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfnmaddpd -#else -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfnmaddpd -#endif - - -#define A_PR1 512 -#define B_PR1 256 - -#define KERNEL2x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 3 * SIZE(BO, 
BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $16, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_2(xx) \ - vmovups -6 * 
SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $8 , %rax ;\ - - -#define KERNEL1x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $2, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R 
%xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $2, BI ;\ - addq $4, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * 
SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $2, %rax ;\ - - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, 
LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L2_40 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - 
- vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL2x2_SUB(xxx) - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - 
defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - vshufpd $0x01, %xmm14, %xmm14, %xmm15 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, %xmm15 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq 
$4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L2_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 
B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if 
defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L1_40 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - 
je .L1_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL2x1_SUB(xxx) - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13,%xmm12 , %xmm12 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - -#else - vaddsubpd %xmm8, %xmm9 , %xmm9 - vaddsubpd %xmm12,%xmm13, %xmm13 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - 
vmulpd %xmm13, %xmm1, %xmm13 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13, %xmm12, %xmm12 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L1_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq 
$1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8, %xmm8 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - - vmovapd %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - - vaddsubpd %xmm9 ,%xmm8, %xmm8 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 
2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/********************************************************************* +* +* 2014/06/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/30 Saar +* +* Parameter: +* UNROLL_M 2 +* UNROLL_N 2 +* ZGEMM_P 384 +* ZGEMM_Q 168 +* A_PR1 512 +* B_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 3456x3456 82.4 GFLOPS with 8 threads on 4 modules (ACML: 76.3 ) (BULLDOZER: 81.0 ) +* 3456x3456 79.9 GFLOPS with 4 threads on 4 modules (ACML: 69.9 ) (BULLDOZER: 74.6 ) +* 3456x3456 40.4 GFLOPS with 2 threads on 2 modules (ACML: 35.8 ) (BULLDOZER: 37.9 ) +* 3456x3456 20.3 GFLOPS with 1 threads on 1 modules (ACML: 18.1 ) (BULLDOZER: 19.2 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 6912x6912 227.5 GFLOPS with 32 threads on 16 modules (ACML: 166.3 ) (BULLDOZER: 228.5 ) +* 6912x6912 211.6 GFLOPS with 16 threads on 16 modules (ACML: 169.5 ) (BULLDOZER: 204.3 ) +* 6912x6912 123.5 GFLOPS with 8 threads on 8 modules (ACML: 92.7 ) (BULLDOZER: 117.0 ) +* 3456x3456 64.1 GFLOPS with 4 threads on 4 modules (ACML: 49.1 ) (BULLDOZER: 61.7 ) +* 3456x3456 33.4 GFLOPS with 2 threads on 2 modules (ACML: 28.1 ) (BULLDOZER: 30.9 ) +* 3456x3456 17.0 GFLOPS with 1 threads on 1 modules (ACML: 15.2 ) (BULLDOZER: 15.7 ) +* +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define 
OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 256*8*4 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfmaddpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfmaddpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfnmaddpd +#else +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfnmaddpd +#endif + + +#define A_PR1 512 +#define B_PR1 256 + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + 
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R 
%xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I 
%xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8 , %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I 
%xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define 
KERNEL1x1_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 
+#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L2_40 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef 
LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL2x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $0x01, %xmm8 
, %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + vshufpd $0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M 
+***************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif 
+ + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK 
+#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L1_40 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + 
leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL2x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), 
%xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L1_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + 
KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && 
defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S index f91bfa89bb..29729b1017 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_haswell.S +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.S @@ -1,3881 +1,3881 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************************/ - -/******************************************************************************** -* 2014/07/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* 2013/10/28 Saar -* Parameter: -* ZGEMM_DEFAULT_UNROLL_N 2 -* ZGEMM_DEFAULT_UNROLL_M 4 -* ZGEMM_DEFAULT_P 256 -* ZGEMM_DEFAULT_Q 128 -* A_PR1 512 -* B_PR1 512 -* -* 2014/07/28 Saar -* Performance at 4608x4608x4608: -* 1 thread: 53 GFLOPS (SANDYBRIDGE: 29) (MKL: 53) -* 2 threads: 101 GFLOPS (SANDYBRIDGE: 59) (MKL: 100) -* 3 threads: 146 GFLOPS (SANDYBRIDGE: 86) (MKL: 138) -* 4 threads: 184 GFLOPS (SANDYBRIDGE: 108) (MKL: 172) -* -********************************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define 
Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $ 0, 4096 * 4(%rsp);\ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $ 0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(BULLDOZER) - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - -#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) - -#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) - -#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#else - -#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#endif - -#else - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - -#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) - -#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) - -#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) 
vfnmadd231pd y1,y2,y0 - -#else - -#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 - -#endif - -#endif - -#define A_PR1 512 -#define B_PR1 512 - - - -/***************************************************************************************************/ - -.macro KERNEL4x3_SUB - vmovups (AO), %ymm0 - vmovups 4 * SIZE(AO), %ymm1 - prefetcht0 A_PR1(AO) - - vbroadcastsd (BO), %ymm2 - vbroadcastsd 1 * SIZE(BO), %ymm3 - VFMADDPD_R( %ymm8 ,%ymm2,%ymm0 ) - VFMADDPD_R( %ymm12,%ymm2,%ymm1 ) - VFMADDPD_I( %ymm9 ,%ymm3,%ymm0 ) - VFMADDPD_I( %ymm13,%ymm3,%ymm1 ) - - vbroadcastsd 2 * SIZE(BO), %ymm2 - vbroadcastsd 3 * SIZE(BO), %ymm3 - VFMADDPD_R( %ymm10,%ymm2,%ymm0 ) - VFMADDPD_R( %ymm14,%ymm2,%ymm1 ) - VFMADDPD_I( %ymm11,%ymm3,%ymm0 ) - VFMADDPD_I( %ymm15,%ymm3,%ymm1 ) - - vbroadcastsd 4 * SIZE(BO), %ymm2 - vbroadcastsd 5 * SIZE(BO), %ymm3 - VFMADDPD_R( %ymm4 ,%ymm2,%ymm0 ) - VFMADDPD_R( %ymm6 ,%ymm2,%ymm1 ) - VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 ) - VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 ) - - addq $ 6*SIZE, BO - addq $ 8*SIZE, AO - decq %rax -.endm - -.macro SAVE4x3 - - vbroadcastsd ALPHA_R, %ymm0 - vbroadcastsd ALPHA_I, %ymm1 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 - vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - vaddsubpd %ymm5 ,%ymm4 , %ymm4 - vaddsubpd %ymm7 ,%ymm6 , %ymm6 - - vshufpd $ 0x05, %ymm8 , %ymm8 , %ymm9 - vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 - vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 - vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 - vshufpd $ 0x05, %ymm4 , %ymm4 , %ymm5 - 
vshufpd $ 0x05, %ymm6 , %ymm6 , %ymm7 - -#else - vaddsubpd %ymm8, %ymm9 ,%ymm9 - vaddsubpd %ymm10, %ymm11,%ymm11 - vaddsubpd %ymm12, %ymm13,%ymm13 - vaddsubpd %ymm14, %ymm15,%ymm15 - vaddsubpd %ymm4 , %ymm5 ,%ymm5 - vaddsubpd %ymm6 , %ymm7 ,%ymm7 - - vmovapd %ymm9, %ymm8 - vmovapd %ymm11, %ymm10 - vmovapd %ymm13, %ymm12 - vmovapd %ymm15, %ymm14 - vmovapd %ymm5 , %ymm4 - vmovapd %ymm7 , %ymm6 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 - vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 - -#endif - - // multiply with ALPHA_R - vmulpd %ymm8 , %ymm0, %ymm8 - vmulpd %ymm10, %ymm0, %ymm10 - vmulpd %ymm12, %ymm0, %ymm12 - vmulpd %ymm14, %ymm0, %ymm14 - vmulpd %ymm4 , %ymm0, %ymm4 - vmulpd %ymm6 , %ymm0, %ymm6 - - // multiply with ALPHA_I - vmulpd %ymm9 , %ymm1, %ymm9 - vmulpd %ymm11, %ymm1, %ymm11 - vmulpd %ymm13, %ymm1, %ymm13 - vmulpd %ymm15, %ymm1, %ymm15 - vmulpd %ymm5 , %ymm1, %ymm5 - vmulpd %ymm7 , %ymm1, %ymm7 - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - vaddsubpd %ymm5 ,%ymm4 , %ymm4 - vaddsubpd %ymm7 ,%ymm6 , %ymm6 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %ymm8 , %ymm8 - vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 - - vaddpd (CO1, LDC), %ymm10, %ymm10 - vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 - - vaddpd (CO1, LDC,2), %ymm4 , %ymm4 - vaddpd 4 * SIZE(CO1, LDC,2), %ymm6 , %ymm6 -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 4 * SIZE(CO1) - - vmovups %ymm10 , (CO1, LDC) - vmovups %ymm14 , 4 * SIZE(CO1, LDC) - - vmovups %ymm4 , (CO1, LDC, 2) - vmovups %ymm6 , 4 * SIZE(CO1, LDC, 2) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - -.endm - - - -/***************************************************************************************************/ - -.macro KERNEL2x3_SUB - vmovups (AO), %xmm0 - 
vmovups 2 * SIZE(AO), %xmm1 - vmovddup (BO), %xmm2 - vmovddup 1 * SIZE(BO), %xmm3 - - VFMADDPD_R( %xmm8 ,%xmm2,%xmm0 ) - VFMADDPD_R( %xmm12,%xmm2,%xmm1 ) - VFMADDPD_I( %xmm9 ,%xmm3,%xmm0 ) - VFMADDPD_I( %xmm13,%xmm3,%xmm1 ) - - vmovddup 2 * SIZE(BO), %xmm2 - vmovddup 3 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) - VFMADDPD_R( %xmm14,%xmm2,%xmm1 ) - VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) - VFMADDPD_I( %xmm15,%xmm3,%xmm1 ) - - vmovddup 4 * SIZE(BO), %xmm2 - vmovddup 5 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) - VFMADDPD_R( %xmm6 ,%xmm2,%xmm1 ) - VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) - VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 ) - - addq $ 6*SIZE, BO - addq $ 4*SIZE, AO - decq %rax -.endm - -.macro SAVE2x3 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - vshufpd $ 0x01, %xmm5 , %xmm5 , %xmm5 - vshufpd $ 0x01, %xmm7 , %xmm7 , %xmm7 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - vaddsubpd %xmm5, %xmm4 , %xmm4 - vaddsubpd %xmm7, %xmm6 , %xmm6 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 - vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 - vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 - vshufpd $ 0x01, %xmm6 , %xmm6, %xmm7 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - vaddsubpd %xmm4, %xmm5 ,%xmm5 - vaddsubpd %xmm6, %xmm7 ,%xmm7 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - vmovapd %xmm5, %xmm4 - vmovapd %xmm7, %xmm6 - - // swap high and low 64 
bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 - vshufpd $ 0x01, %xmm7 , %xmm7, %xmm7 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - vmulpd %xmm4 , %xmm0, %xmm4 - vmulpd %xmm6 , %xmm0, %xmm6 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, %xmm15 - vmulpd %xmm5 , %xmm1, %xmm5 - vmulpd %xmm7 , %xmm1, %xmm7 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - vaddsubpd %xmm5, %xmm4 , %xmm4 - vaddsubpd %xmm7, %xmm6 , %xmm6 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - - vaddpd (CO1, LDC,2), %xmm4 , %xmm4 - vaddpd 2 * SIZE(CO1, LDC,2), %xmm6 , %xmm6 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - - vmovups %xmm4 , (CO1, LDC,2) - vmovups %xmm6 , 2 * SIZE(CO1, LDC,2) - -.endm - - -/************************************************************************************************/ - - -.macro KERNEL1x3_SUB - vmovups (AO), %xmm0 - vmovddup (BO), %xmm2 - vmovddup 1 * SIZE(BO), %xmm3 - - VFMADDPD_R( %xmm8,%xmm2,%xmm0 ) - VFMADDPD_I( %xmm9,%xmm3,%xmm0 ) - - vmovddup 2 * SIZE(BO), %xmm2 - vmovddup 3 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) - VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) - - vmovddup 4 * SIZE(BO), %xmm2 - vmovddup 5 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) - VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) - - addq $ 6*SIZE, BO - addq $ 2*SIZE, AO - decq %rax -.endm - -.macro SAVE1x3 - - vmovddup 
ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm5, %xmm4 , %xmm4 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - vaddsubpd %xmm4, %xmm5, %xmm5 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm5, %xmm4 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm4 , %xmm0, %xmm4 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm5 , %xmm1, %xmm5 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm5, %xmm4 , %xmm4 - -#ifndef TRMMKERNEL - - vaddpd (CO1) , %xmm8 , %xmm8 - vaddpd (CO1, LDC) , %xmm10, %xmm10 - vaddpd (CO1, LDC,2) , %xmm4 , %xmm4 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm4 , (CO1, LDC,2) - -.endm - - - - -/***************************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 - - vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4 - vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) - VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) - VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) - vbroadcastsd -5 * SIZE(BO, BI, 
SIZE), %ymm7 - VFMADDPD_R( %ymm10,%ymm6,%ymm0 ) - VFMADDPD_R( %ymm14,%ymm6,%ymm1 ) - VFMADDPD_I( %ymm11,%ymm7,%ymm0 ) - VFMADDPD_I( %ymm15,%ymm7,%ymm1 ) - - addq $ 4, BI - addq $ 8, %rax -.endm - -.macro SAVE4x2 - - vbroadcastsd ALPHA_R, %ymm0 - vbroadcastsd ALPHA_I, %ymm1 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - - vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9 - vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 - vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 - vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 - -#else - vaddsubpd %ymm8, %ymm9 ,%ymm9 - vaddsubpd %ymm10, %ymm11,%ymm11 - vaddsubpd %ymm12, %ymm13,%ymm13 - vaddsubpd %ymm14, %ymm15,%ymm15 - - vmovapd %ymm9, %ymm8 - vmovapd %ymm11, %ymm10 - vmovapd %ymm13, %ymm12 - vmovapd %ymm15, %ymm14 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - -#endif - - // multiply with ALPHA_R - vmulpd %ymm8 , %ymm0, %ymm8 - vmulpd %ymm10, %ymm0, %ymm10 - vmulpd %ymm12, %ymm0, %ymm12 - vmulpd %ymm14, %ymm0, %ymm14 - - // multiply with ALPHA_I - vmulpd %ymm9 , %ymm1, %ymm9 - vmulpd %ymm11, %ymm1, %ymm11 - vmulpd %ymm13, %ymm1, %ymm13 - vmulpd %ymm15, %ymm1, %ymm15 - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %ymm8 , %ymm8 - vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 - - vaddpd (CO1, LDC), %ymm10, %ymm10 - vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 - -#endif - - vmovups %ymm8 , 
(CO1) - vmovups %ymm12 , 4 * SIZE(CO1) - - vmovups %ymm10 , (CO1, LDC) - vmovups %ymm14 , 4 * SIZE(CO1, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - -.endm - -/***************************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) - VFMADDPD_R( %xmm14,%xmm6,%xmm1 ) - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) - VFMADDPD_I( %xmm15,%xmm7,%xmm1 ) - addq $ 4, BI - addq $ 4, %rax -.endm - -.macro SAVE2x2 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 - vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, 
%xmm15 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, %xmm15 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - -.endm - -/************************************************************************************************/ - -/************************************************************************************************/ - - -.macro KERNEL1x2_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) - VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) - addq $ 4, BI - addq $ 2, %rax -.endm - -.macro SAVE1x2 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - - // 
swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - -.endm - - -/************************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4 - vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5 - VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) - VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) - VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) - VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) - - addq $ 2, BI - addq $ 8, %rax -.endm - -.macro SAVE4x1 - - vbroadcastsd ALPHA_R, %ymm0 - vbroadcastsd ALPHA_I, %ymm1 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm13,%ymm12 , %ymm12 - - vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9 - vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 - -#else - vaddsubpd %ymm8, %ymm9 , %ymm9 - vaddsubpd %ymm12,%ymm13, %ymm13 - - vmovapd %ymm9, %ymm8 - vmovapd %ymm13, %ymm12 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - -#endif - - // multiply with ALPHA_R - vmulpd %ymm8 , %ymm0, %ymm8 - vmulpd %ymm12, %ymm0, %ymm12 - - // multiply with ALPHA_I - vmulpd %ymm9 , %ymm1, %ymm9 - vmulpd %ymm13, %ymm1, %ymm13 - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm13, %ymm12, %ymm12 - - - 
-#ifndef TRMMKERNEL - - vaddpd (CO1), %ymm8 , %ymm8 - vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 ,4 * SIZE(CO1) - -.endm - - - -/************************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) - addq $ 2, BI - addq $ 4, %rax -.endm - -.macro SAVE2x1 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13,%xmm12 , %xmm12 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 - -#else - vaddsubpd %xmm8, %xmm9 , %xmm9 - vaddsubpd %xmm12,%xmm13, %xmm13 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm13, %xmm1, %xmm13 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13, %xmm12, %xmm12 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - -.endm - - -/************************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 - 
VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - addq $ 2, BI - addq $ 2, %rax -.endm - -.macro SAVE1x1 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8, %xmm8 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - - vmovapd %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - - vaddsubpd %xmm9 ,%xmm8, %xmm8 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -.endm - - -/************************************************************************************************/ - - - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $ STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $ 128 + L_BUFFER_SIZE, %rsp - andq $ -4096, 
%rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $ 6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - -/************************************************************************************************/ -.L6_00_0: - - movq Ndiv6, J - cmpq $ 0, J - je .L2_00_0 - ALIGN_4 - - - -.L6_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 2 * COMPSIZE - leaq (B, %rax,8), BO2 - movq BO2, B // next offset of B - movq K, %rax - ALIGN_4 - -.L6_00_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups (BO2), %xmm2 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - vmovups %xmm2, 4 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L6_00_02b - -.L6_00_02c: - - - -.L6_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - leaq (C, LDC, 1), C // c += 1 * ldc - - movq A, AO // aoffset = a - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L6_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L6_4_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L6_4_16 - ALIGN_4 - -.L6_4_12: - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L6_4_16 - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L6_4_16 - - jmp .L6_4_12 - ALIGN_4 - -.L6_4_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L6_4_19 - ALIGN_4 - -.L6_4_17: - - KERNEL4x3_SUB - - 
jnz .L6_4_17 - ALIGN_4 - - -.L6_4_19: - - SAVE4x3 - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L6_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L6_2_10: - testq $ 2, M - jz .L6_2_40 // to next 2 lines of N - -.L6_2_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L6_2_16 - ALIGN_4 - -.L6_2_12: - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L6_2_16 - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L6_2_16 - - jmp .L6_2_12 - ALIGN_4 - -.L6_2_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L6_2_19 - ALIGN_4 - -.L6_2_17: - - KERNEL2x3_SUB - - jnz .L6_2_17 - ALIGN_4 - - -.L6_2_19: - - SAVE2x3 - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_2_40: - testq $ 1, M - jz .L6_2_60 // to next 2 lines of N - - ALIGN_4 - -.L6_2_41: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L6_2_46 - - ALIGN_4 - -.L6_2_42: - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L6_2_46 - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L6_2_46 - - jmp .L6_2_42 - ALIGN_4 - -.L6_2_46: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je 
.L6_2_49 - - ALIGN_4 - -.L6_2_47: - - KERNEL1x3_SUB - - jnz .L6_2_47 - ALIGN_4 - - -.L6_2_49: - - SAVE1x3 - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L6_2_41 - ALIGN_4 - - - - -.L6_2_60: - - -/************************************************************************************************/ - -/************************************************************************************************/ - - -.L7_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 2 * COMPSIZE - leaq (B, %rax,8), BO2 - movq K, %rax - ALIGN_4 - -.L7_00_02b: - - vmovups 2 * SIZE(BO1), %xmm0 - vmovups (BO2), %xmm1 - vmovups 2 * SIZE(BO2), %xmm2 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - vmovups %xmm2, 4 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L7_00_02b - -.L7_00_02c: - - movq BO2, B // next offset of B - - -.L7_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - leaq (C, LDC, 1), C // c += 1 * ldc - - movq A, AO // aoffset = a - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L7_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L7_4_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L7_4_16 - ALIGN_4 - -.L7_4_12: - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L7_4_16 - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L7_4_16 - - jmp .L7_4_12 - ALIGN_4 - -.L7_4_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L7_4_19 - - ALIGN_4 - -.L7_4_17: - - KERNEL4x3_SUB - - jnz .L7_4_17 - ALIGN_4 - - -.L7_4_19: - - SAVE4x3 - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L7_4_11 - ALIGN_4 - - 
-/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L7_2_10: - testq $ 2, M - jz .L7_2_40 // to next 2 lines of N - -.L7_2_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L7_2_16 - ALIGN_4 - -.L7_2_12: - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L7_2_16 - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L7_2_16 - - jmp .L7_2_12 - ALIGN_4 - -.L7_2_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L7_2_19 - - ALIGN_4 - -.L7_2_17: - - KERNEL2x3_SUB - - jnz .L7_2_17 - ALIGN_4 - - -.L7_2_19: - - SAVE2x3 - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_2_40: - testq $ 1, M - jz .L7_2_60 // to next 2 lines of N - - ALIGN_4 - -.L7_2_41: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L7_2_46 - - ALIGN_4 - -.L7_2_42: - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L7_2_46 - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L7_2_46 - - jmp .L7_2_42 - ALIGN_4 - -.L7_2_46: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L7_2_49 - ALIGN_4 - -.L7_2_47: - - KERNEL1x3_SUB - - jnz .L7_2_47 - ALIGN_4 - - -.L7_2_49: - - SAVE1x3 - - addq $ 2 * SIZE, CO1 # 
coffset += 2 - decq I # i -- - jg .L7_2_41 - ALIGN_4 - - - - -.L7_2_60: - - decq J // j -- - jg .L6_00_01 // next 6 lines of N - -/************************************************************************************************/ - - - -/************************************************************************************************/ -.L2_00_0: - - movq Nmod6, J - sarq $1, J // j = j / 2 - cmpq $ 0, J - je .L1_2_0 - ALIGN_4 - - - -.L2_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_00_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L2_00_02b - -.L2_00_02c: - - movq BO1, B // next offset of B - - -.L2_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L2_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L2_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 2, %rax // 
number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - jmp .L2_4_12 - ALIGN_4 - -.L2_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_17: - - KERNEL4x2_SUB - - jl .L2_4_17 - ALIGN_4 - - -.L2_4_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L2_2_10: - testq $ 2, M - jz .L2_2_40 // to next 2 lines of N - -.L2_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 
B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - jmp .L2_2_12 - ALIGN_4 - -.L2_2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_17: - - KERNEL2x2_SUB - - jl .L2_2_17 - ALIGN_4 - - -.L2_2_19: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_2_40: - testq $ 1, M - jz .L2_2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_2_41: - -#if !defined(TRMMKERNEL) 
|| \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - jmp .L2_2_42 - ALIGN_4 - -.L2_2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), 
BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_47: - - KERNEL1x2_SUB - - jl .L2_2_47 - ALIGN_4 - - -.L2_2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L2_2_41 - ALIGN_4 - - - - -.L2_2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $ 2, KK -#endif - - decq J // j -- - jg .L2_00_01 // next 2 lines of N - - - -.L1_2_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $ 1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_00_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO - decq %rax - jnz .L1_00_02b - -.L1_00_02c: - - movq BO1, B // next offset of B - -.L1_00_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L1_2_10 - - ALIGN_4 - -/*******************************************************************************************************/ - - -.L1_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_12: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - jmp .L1_4_12 - ALIGN_4 - -.L1_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_17: - - KERNEL4x1_SUB - - jl .L1_4_17 - ALIGN_4 - - -.L1_4_19: - - SAVE4x1 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax 
- movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_4_11 - ALIGN_4 - - - - -/*******************************************************************************************************/ -.L1_2_10: - testq $ 2, M - jz .L1_2_40 - - -.L1_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - prefetcht0 
B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - jmp .L1_2_12 - ALIGN_4 - -.L1_2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_17: - - KERNEL2x1_SUB - - jl .L1_2_17 - ALIGN_4 - - -.L1_2_19: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_2_40: - testq $ 1, M - jz .L999 - - ALIGN_4 - -.L1_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq 
K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - jmp .L1_2_42 - ALIGN_4 - -.L1_2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_47: - - KERNEL1x1_SUB - - jl .L1_2_47 - ALIGN_4 - - -.L1_2_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if 
defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L1_2_41 - ALIGN_4 - - - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $ STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************************ - TRMM Kernel -************************************************************************************************/ - - PROLOGUE - PROFCODE - - subq $ STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $ 128 + L_BUFFER_SIZE, %rsp - andq $ -4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 
- - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $ 2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_00_0: - - movq Ndiv6, J - cmpq $ 0, J - je .L1_2_0 - ALIGN_4 - - - -.L2_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_00_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L2_00_02b - -.L2_00_02c: - - movq BO1, B // next offset of B - - -.L2_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L2_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L2_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of 
values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - jmp .L2_4_12 - ALIGN_4 - -.L2_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_17: - - KERNEL4x2_SUB - - jl .L2_4_17 - ALIGN_4 - - -.L2_4_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L2_2_10: - testq $ 2, M - jz .L2_2_40 // to next 2 lines of N - -.L2_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_12: - - prefetcht0 
A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - jmp .L2_2_12 - ALIGN_4 - -.L2_2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_17: - - KERNEL2x2_SUB - - jl .L2_2_17 - ALIGN_4 - - -.L2_2_19: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_2_40: - testq $ 1, M - jz .L2_2_60 // to next 2 lines of N - - ALIGN_4 - 
-.L2_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - jmp .L2_2_42 - ALIGN_4 - -.L2_2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_49 - - movq 
%rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_47: - - KERNEL1x2_SUB - - jl .L2_2_47 - ALIGN_4 - - -.L2_2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L2_2_41 - ALIGN_4 - - - - -.L2_2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $ 2, KK -#endif - - decq J // j -- - jg .L2_00_01 // next 2 lines of N - - - -.L1_2_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $ 1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_00_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO - decq %rax - jnz .L1_00_02b - -.L1_00_02c: - - movq BO1, B // next offset of B - -.L1_00_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L1_2_10 - - ALIGN_4 - -/*******************************************************************************************************/ - - -.L1_4_11: - -#if !defined(TRMMKERNEL) || \ - 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_12: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - jmp .L1_4_12 - ALIGN_4 - -.L1_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_17: - - KERNEL4x1_SUB - - jl .L1_4_17 - ALIGN_4 - - -.L1_4_19: - - SAVE4x1 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_4_11 - ALIGN_4 - - - - -/*******************************************************************************************************/ -.L1_2_10: - testq $ 2, M - jz .L1_2_40 - - -.L1_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - 
KERNEL2x1_SUB - - je .L1_2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - jmp .L1_2_12 - ALIGN_4 - -.L1_2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_17: - - KERNEL2x1_SUB - - jl .L1_2_17 - ALIGN_4 - - -.L1_2_19: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_2_40: - testq $ 1, M - jz .L999 - - ALIGN_4 - -.L1_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO 
-#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - jmp .L1_2_42 - ALIGN_4 - -.L1_2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_47: - - KERNEL1x1_SUB - - jl .L1_2_47 - ALIGN_4 - - -.L1_2_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - 
leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L1_2_41 - ALIGN_4 - - - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $ STACKSIZE, %rsp - ret - - EPILOGUE - -#endif - - +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/******************************************************************************** +* 2014/07/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* ZGEMM_DEFAULT_UNROLL_N 2 +* ZGEMM_DEFAULT_UNROLL_M 4 +* ZGEMM_DEFAULT_P 256 +* ZGEMM_DEFAULT_Q 128 +* A_PR1 512 +* B_PR1 512 +* +* 2014/07/28 Saar +* Performance at 4608x4608x4608: +* 1 thread: 53 GFLOPS (SANDYBRIDGE: 29) (MKL: 53) +* 2 threads: 101 GFLOPS (SANDYBRIDGE: 59) (MKL: 100) +* 3 threads: 146 GFLOPS (SANDYBRIDGE: 86) (MKL: 138) +* 4 threads: 184 GFLOPS (SANDYBRIDGE: 108) (MKL: 172) +* +********************************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define 
Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $ 0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(BULLDOZER) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#else + +#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#endif + +#else + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) 
vfnmadd231pd y1,y2,y0 + +#else + +#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#endif + +#endif + +#define A_PR1 512 +#define B_PR1 512 + + + +/***************************************************************************************************/ + +.macro KERNEL4x3_SUB + vmovups (AO), %ymm0 + vmovups 4 * SIZE(AO), %ymm1 + prefetcht0 A_PR1(AO) + + vbroadcastsd (BO), %ymm2 + vbroadcastsd 1 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm8 ,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm9 ,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm3,%ymm1 ) + + vbroadcastsd 2 * SIZE(BO), %ymm2 + vbroadcastsd 3 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm10,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm14,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm11,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm15,%ymm3,%ymm1 ) + + vbroadcastsd 4 * SIZE(BO), %ymm2 + vbroadcastsd 5 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm4 ,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm6 ,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 ) + + addq $ 6*SIZE, BO + addq $ 8*SIZE, AO + decq %rax +.endm + +.macro SAVE4x3 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 + vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + vaddsubpd %ymm5 ,%ymm4 , %ymm4 + vaddsubpd %ymm7 ,%ymm6 , %ymm6 + + vshufpd $ 0x05, %ymm8 , %ymm8 , %ymm9 + vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 + vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 + vshufpd $ 0x05, %ymm4 , %ymm4 , %ymm5 + 
vshufpd $ 0x05, %ymm6 , %ymm6 , %ymm7 + +#else + vaddsubpd %ymm8, %ymm9 ,%ymm9 + vaddsubpd %ymm10, %ymm11,%ymm11 + vaddsubpd %ymm12, %ymm13,%ymm13 + vaddsubpd %ymm14, %ymm15,%ymm15 + vaddsubpd %ymm4 , %ymm5 ,%ymm5 + vaddsubpd %ymm6 , %ymm7 ,%ymm7 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm11, %ymm10 + vmovapd %ymm13, %ymm12 + vmovapd %ymm15, %ymm14 + vmovapd %ymm5 , %ymm4 + vmovapd %ymm7 , %ymm6 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 + vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm10, %ymm0, %ymm10 + vmulpd %ymm12, %ymm0, %ymm12 + vmulpd %ymm14, %ymm0, %ymm14 + vmulpd %ymm4 , %ymm0, %ymm4 + vmulpd %ymm6 , %ymm0, %ymm6 + + // multiply with ALPHA_I + vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm11, %ymm1, %ymm11 + vmulpd %ymm13, %ymm1, %ymm13 + vmulpd %ymm15, %ymm1, %ymm15 + vmulpd %ymm5 , %ymm1, %ymm5 + vmulpd %ymm7 , %ymm1, %ymm7 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + vaddsubpd %ymm5 ,%ymm4 , %ymm4 + vaddsubpd %ymm7 ,%ymm6 , %ymm6 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + + vaddpd (CO1, LDC), %ymm10, %ymm10 + vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 + + vaddpd (CO1, LDC,2), %ymm4 , %ymm4 + vaddpd 4 * SIZE(CO1, LDC,2), %ymm6 , %ymm6 +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 4 * SIZE(CO1) + + vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 4 * SIZE(CO1, LDC) + + vmovups %ymm4 , (CO1, LDC, 2) + vmovups %ymm6 , 4 * SIZE(CO1, LDC, 2) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + + + +/***************************************************************************************************/ + +.macro KERNEL2x3_SUB + vmovups (AO), %xmm0 + 
vmovups 2 * SIZE(AO), %xmm1 + vmovddup (BO), %xmm2 + vmovddup 1 * SIZE(BO), %xmm3 + + VFMADDPD_R( %xmm8 ,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm12,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm9 ,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm3,%xmm1 ) + + vmovddup 2 * SIZE(BO), %xmm2 + vmovddup 3 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm14,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm15,%xmm3,%xmm1 ) + + vmovddup 4 * SIZE(BO), %xmm2 + vmovddup 5 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm6 ,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 ) + + addq $ 6*SIZE, BO + addq $ 4*SIZE, AO + decq %rax +.endm + +.macro SAVE2x3 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + vshufpd $ 0x01, %xmm5 , %xmm5 , %xmm5 + vshufpd $ 0x01, %xmm7 , %xmm7 , %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + vaddsubpd %xmm5, %xmm4 , %xmm4 + vaddsubpd %xmm7, %xmm6 , %xmm6 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 + vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 + vshufpd $ 0x01, %xmm6 , %xmm6, %xmm7 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + vaddsubpd %xmm4, %xmm5 ,%xmm5 + vaddsubpd %xmm6, %xmm7 ,%xmm7 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + vmovapd %xmm5, %xmm4 + vmovapd %xmm7, %xmm6 + + // swap high and low 64 
bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + vshufpd $ 0x01, %xmm7 , %xmm7, %xmm7 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + vmulpd %xmm4 , %xmm0, %xmm4 + vmulpd %xmm6 , %xmm0, %xmm6 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + vmulpd %xmm5 , %xmm1, %xmm5 + vmulpd %xmm7 , %xmm1, %xmm7 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + vaddsubpd %xmm5, %xmm4 , %xmm4 + vaddsubpd %xmm7, %xmm6 , %xmm6 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + + vaddpd (CO1, LDC,2), %xmm4 , %xmm4 + vaddpd 2 * SIZE(CO1, LDC,2), %xmm6 , %xmm6 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + + vmovups %xmm4 , (CO1, LDC,2) + vmovups %xmm6 , 2 * SIZE(CO1, LDC,2) + +.endm + + +/************************************************************************************************/ + + +.macro KERNEL1x3_SUB + vmovups (AO), %xmm0 + vmovddup (BO), %xmm2 + vmovddup 1 * SIZE(BO), %xmm3 + + VFMADDPD_R( %xmm8,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm9,%xmm3,%xmm0 ) + + vmovddup 2 * SIZE(BO), %xmm2 + vmovddup 3 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) + + vmovddup 4 * SIZE(BO), %xmm2 + vmovddup 5 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 2*SIZE, AO + decq %rax +.endm + +.macro SAVE1x3 + + vmovddup 
ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm5, %xmm4 , %xmm4 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + vaddsubpd %xmm4, %xmm5, %xmm5 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm5, %xmm4 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm4 , %xmm0, %xmm4 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm5 , %xmm1, %xmm5 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm5, %xmm4 , %xmm4 + +#ifndef TRMMKERNEL + + vaddpd (CO1) , %xmm8 , %xmm8 + vaddpd (CO1, LDC) , %xmm10, %xmm10 + vaddpd (CO1, LDC,2) , %xmm4 , %xmm4 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm4 , (CO1, LDC,2) + +.endm + + + + +/***************************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 + + vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4 + vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) + vbroadcastsd -5 * SIZE(BO, BI, 
SIZE), %ymm7 + VFMADDPD_R( %ymm10,%ymm6,%ymm0 ) + VFMADDPD_R( %ymm14,%ymm6,%ymm1 ) + VFMADDPD_I( %ymm11,%ymm7,%ymm0 ) + VFMADDPD_I( %ymm15,%ymm7,%ymm1 ) + + addq $ 4, BI + addq $ 8, %rax +.endm + +.macro SAVE4x2 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + + vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9 + vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 + vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 + +#else + vaddsubpd %ymm8, %ymm9 ,%ymm9 + vaddsubpd %ymm10, %ymm11,%ymm11 + vaddsubpd %ymm12, %ymm13,%ymm13 + vaddsubpd %ymm14, %ymm15,%ymm15 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm11, %ymm10 + vmovapd %ymm13, %ymm12 + vmovapd %ymm15, %ymm14 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm10, %ymm0, %ymm10 + vmulpd %ymm12, %ymm0, %ymm12 + vmulpd %ymm14, %ymm0, %ymm14 + + // multiply with ALPHA_I + vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm11, %ymm1, %ymm11 + vmulpd %ymm13, %ymm1, %ymm13 + vmulpd %ymm15, %ymm1, %ymm15 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + + vaddpd (CO1, LDC), %ymm10, %ymm10 + vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 + +#endif + + vmovups %ymm8 , 
(CO1) + vmovups %ymm12 , 4 * SIZE(CO1) + + vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 4 * SIZE(CO1, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + +/***************************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPD_R( %xmm14,%xmm6,%xmm1 ) + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) + VFMADDPD_I( %xmm15,%xmm7,%xmm1 ) + addq $ 4, BI + addq $ 4, %rax +.endm + +.macro SAVE2x2 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, 
%xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +.endm + +/************************************************************************************************/ + +/************************************************************************************************/ + + +.macro KERNEL1x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) + addq $ 4, BI + addq $ 2, %rax +.endm + +.macro SAVE1x2 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // 
swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4 + vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5 + VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) + VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) + + addq $ 2, BI + addq $ 8, %rax +.endm + +.macro SAVE4x1 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm13,%ymm12 , %ymm12 + + vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 + +#else + vaddsubpd %ymm8, %ymm9 , %ymm9 + vaddsubpd %ymm12,%ymm13, %ymm13 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm13, %ymm12 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm12, %ymm0, %ymm12 + + // multiply with ALPHA_I + vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm13, %ymm1, %ymm13 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm13, %ymm12, %ymm12 + + + 
+#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 ,4 * SIZE(CO1) + +.endm + + + +/************************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) + addq $ 2, BI + addq $ 4, %rax +.endm + +.macro SAVE2x1 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 + 
VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + addq $ 2, BI + addq $ 2, %rax +.endm + +.macro SAVE1x1 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +.endm + + +/************************************************************************************************/ + + + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $ STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, 
%rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +/************************************************************************************************/ +.L6_00_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L2_00_0 + ALIGN_4 + + + +.L6_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 2 * COMPSIZE + leaq (B, %rax,8), BO2 + movq BO2, B // next offset of B + movq K, %rax + ALIGN_4 + +.L6_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups (BO2), %xmm2 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + vmovups %xmm2, 4 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L6_00_02b + +.L6_00_02c: + + + +.L6_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L6_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L6_4_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_4_16 + ALIGN_4 + +.L6_4_12: + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L6_4_16 + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L6_4_16 + + jmp .L6_4_12 + ALIGN_4 + +.L6_4_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_4_19 + ALIGN_4 + +.L6_4_17: + + KERNEL4x3_SUB + + 
jnz .L6_4_17 + ALIGN_4 + + +.L6_4_19: + + SAVE4x3 + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L6_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L6_2_10: + testq $ 2, M + jz .L6_2_40 // to next 2 lines of N + +.L6_2_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_2_16 + ALIGN_4 + +.L6_2_12: + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L6_2_16 + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L6_2_16 + + jmp .L6_2_12 + ALIGN_4 + +.L6_2_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_2_19 + ALIGN_4 + +.L6_2_17: + + KERNEL2x3_SUB + + jnz .L6_2_17 + ALIGN_4 + + +.L6_2_19: + + SAVE2x3 + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_2_40: + testq $ 1, M + jz .L6_2_60 // to next 2 lines of N + + ALIGN_4 + +.L6_2_41: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_2_46 + + ALIGN_4 + +.L6_2_42: + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L6_2_46 + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L6_2_46 + + jmp .L6_2_42 + ALIGN_4 + +.L6_2_46: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je 
.L6_2_49 + + ALIGN_4 + +.L6_2_47: + + KERNEL1x3_SUB + + jnz .L6_2_47 + ALIGN_4 + + +.L6_2_49: + + SAVE1x3 + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L6_2_41 + ALIGN_4 + + + + +.L6_2_60: + + +/************************************************************************************************/ + +/************************************************************************************************/ + + +.L7_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 2 * COMPSIZE + leaq (B, %rax,8), BO2 + movq K, %rax + ALIGN_4 + +.L7_00_02b: + + vmovups 2 * SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + vmovups %xmm2, 4 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L7_00_02b + +.L7_00_02c: + + movq BO2, B // next offset of B + + +.L7_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L7_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L7_4_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_4_16 + ALIGN_4 + +.L7_4_12: + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L7_4_16 + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L7_4_16 + + jmp .L7_4_12 + ALIGN_4 + +.L7_4_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_4_19 + + ALIGN_4 + +.L7_4_17: + + KERNEL4x3_SUB + + jnz .L7_4_17 + ALIGN_4 + + +.L7_4_19: + + SAVE4x3 + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L7_4_11 + ALIGN_4 + + 
+/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L7_2_10: + testq $ 2, M + jz .L7_2_40 // to next 2 lines of N + +.L7_2_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_2_16 + ALIGN_4 + +.L7_2_12: + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L7_2_16 + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L7_2_16 + + jmp .L7_2_12 + ALIGN_4 + +.L7_2_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_2_19 + + ALIGN_4 + +.L7_2_17: + + KERNEL2x3_SUB + + jnz .L7_2_17 + ALIGN_4 + + +.L7_2_19: + + SAVE2x3 + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_2_40: + testq $ 1, M + jz .L7_2_60 // to next 2 lines of N + + ALIGN_4 + +.L7_2_41: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_2_46 + + ALIGN_4 + +.L7_2_42: + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L7_2_46 + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L7_2_46 + + jmp .L7_2_42 + ALIGN_4 + +.L7_2_46: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_2_49 + ALIGN_4 + +.L7_2_47: + + KERNEL1x3_SUB + + jnz .L7_2_47 + ALIGN_4 + + +.L7_2_49: + + SAVE1x3 + + addq $ 2 * SIZE, CO1 # 
coffset += 2 + decq I # i -- + jg .L7_2_41 + ALIGN_4 + + + + +.L7_2_60: + + decq J // j -- + jg .L6_00_01 // next 6 lines of N + +/************************************************************************************************/ + + + +/************************************************************************************************/ +.L2_00_0: + + movq Nmod6, J + sarq $1, J // j = j / 2 + cmpq $ 0, J + je .L1_2_0 + ALIGN_4 + + + +.L2_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_00_02b + +.L2_00_02c: + + movq BO1, B // next offset of B + + +.L2_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L2_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 2, %rax // 
number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L2_2_10: + testq $ 2, M + jz .L2_2_40 // to next 2 lines of N + +.L2_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 
B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + jmp .L2_2_12 + ALIGN_4 + +.L2_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_17: + + KERNEL2x2_SUB + + jl .L2_2_17 + ALIGN_4 + + +.L2_2_19: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_2_40: + testq $ 1, M + jz .L2_2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_2_41: + +#if !defined(TRMMKERNEL) 
|| \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + jmp .L2_2_42 + ALIGN_4 + +.L2_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), 
BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_47: + + KERNEL1x2_SUB + + jl .L2_2_47 + ALIGN_4 + + +.L2_2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_2_41 + ALIGN_4 + + + + +.L2_2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $ 2, KK +#endif + + decq J // j -- + jg .L2_00_01 // next 2 lines of N + + + +.L1_2_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $ 1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_00_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO + decq %rax + jnz .L1_00_02b + +.L1_00_02c: + + movq BO1, B // next offset of B + +.L1_00_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L1_2_10 + + ALIGN_4 + +/*******************************************************************************************************/ + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax 
+ movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_4_11 + ALIGN_4 + + + + +/*******************************************************************************************************/ +.L1_2_10: + testq $ 2, M + jz .L1_2_40 + + +.L1_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + prefetcht0 
B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + jmp .L1_2_12 + ALIGN_4 + +.L1_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_17: + + KERNEL2x1_SUB + + jl .L1_2_17 + ALIGN_4 + + +.L1_2_19: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_2_40: + testq $ 1, M + jz .L999 + + ALIGN_4 + +.L1_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq 
K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + jmp .L1_2_42 + ALIGN_4 + +.L1_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_47: + + KERNEL1x1_SUB + + jl .L1_2_47 + ALIGN_4 + + +.L1_2_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if 
defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L1_2_41 + ALIGN_4 + + + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $ STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************************ + TRMM Kernel +************************************************************************************************/ + + PROLOGUE + PROFCODE + + subq $ STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 
+ + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_00_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L1_2_0 + ALIGN_4 + + + +.L2_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_00_02b + +.L2_00_02c: + + movq BO1, B // next offset of B + + +.L2_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L2_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of 
values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L2_2_10: + testq $ 2, M + jz .L2_2_40 // to next 2 lines of N + +.L2_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_12: + + prefetcht0 
A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + jmp .L2_2_12 + ALIGN_4 + +.L2_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_17: + + KERNEL2x2_SUB + + jl .L2_2_17 + ALIGN_4 + + +.L2_2_19: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_2_40: + testq $ 1, M + jz .L2_2_60 // to next 2 lines of N + + ALIGN_4 + 
+.L2_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + jmp .L2_2_42 + ALIGN_4 + +.L2_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_49 + + movq 
%rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_47: + + KERNEL1x2_SUB + + jl .L2_2_47 + ALIGN_4 + + +.L2_2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_2_41 + ALIGN_4 + + + + +.L2_2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $ 2, KK +#endif + + decq J // j -- + jg .L2_00_01 // next 2 lines of N + + + +.L1_2_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $ 1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_00_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO + decq %rax + jnz .L1_00_02b + +.L1_00_02c: + + movq BO1, B // next offset of B + +.L1_00_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L1_2_10 + + ALIGN_4 + +/*******************************************************************************************************/ + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_4_11 + ALIGN_4 + + + + +/*******************************************************************************************************/ +.L1_2_10: + testq $ 2, M + jz .L1_2_40 + + +.L1_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + 
KERNEL2x1_SUB + + je .L1_2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + jmp .L1_2_12 + ALIGN_4 + +.L1_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_17: + + KERNEL2x1_SUB + + jl .L1_2_17 + ALIGN_4 + + +.L1_2_19: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_2_40: + testq $ 1, M + jz .L999 + + ALIGN_4 + +.L1_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO 
+#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + jmp .L1_2_42 + ALIGN_4 + +.L1_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_47: + + KERNEL1x1_SUB + + jl .L1_2_47 + ALIGN_4 + + +.L1_2_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + 
leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L1_2_41 + ALIGN_4 + + + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $ STACKSIZE, %rsp + ret + + EPILOGUE + +#endif + + diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index 2d6866a78b..8fc9606101 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -25,10 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif #if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zgemv_n_microk_haswell-4.c" @@ -231,10 +232,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT xbuffer[8],*ybuffer; -#if 0 -printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x,inc_y); -#endif - if ( m < 1 ) return(0); if ( n < 1 ) return(0); diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index c2791e0f3e..63c8b11a41 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -25,9 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_t_microk_bulldozer-4.c" diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 3744c98bb7..45e3531b8c 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -25,10 +25,25 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +/* + * Avoid contraction of floating point operations, specifically fused + * multiply-add, because they can cause unexpected results in complex + * multiplication. 
+ */ +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC optimize ("fp-contract=off") +#endif + +#if defined(__clang__) +#pragma clang fp contract(off) +#endif + #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#if defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#include "zscal_microk_skylakex-2.c" +#elif defined(HASWELL) || defined(ZEN) #include "zscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "zscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zscal_microk_skylakex-2.c b/kernel/x86_64/zscal_microk_skylakex-2.c new file mode 100644 index 0000000000..f9e05e333a --- /dev/null +++ b/kernel/x86_64/zscal_microk_skylakex-2.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#include + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + /* _mm512_addsub_pd does not exist so we flip signs for odd elements of da_i */ + __m512d da_r = _mm512_set1_pd(alpha[0]); + __m512d da_i = _mm512_set1_pd(alpha[1]) * _mm512_set4_pd(1, -1, 1, -1); + for (; i < n2; i += 16) { + __m512d x0 = _mm512_loadu_pd(&x[i + 0]); + __m512d x1 = _mm512_loadu_pd(&x[i + 8]); + __m512d y0 = _mm512_permute_pd(x0, 0x55); + __m512d y1 = _mm512_permute_pd(x1, 0x55); + _mm512_storeu_pd(&x[i + 0], _mm512_add_pd(da_r * x0, da_i * y0)); + _mm512_storeu_pd(&x[i + 8], _mm512_add_pd(da_r * x1, da_i * y1)); + } +#else + __m256d da_r = _mm256_set1_pd(alpha[0]); + __m256d da_i = _mm256_set1_pd(alpha[1]); + for (; i < n2; i += 16) { + __m256d x0 = _mm256_loadu_pd(&x[i + 0]); + __m256d x1 = _mm256_loadu_pd(&x[i + 4]); + __m256d x2 = _mm256_loadu_pd(&x[i + 8]); + __m256d x3 = _mm256_loadu_pd(&x[i + 12]); + __m256d y0 = _mm256_permute_pd(x0, 0x05); + __m256d y1 = _mm256_permute_pd(x1, 0x05); + __m256d y2 = _mm256_permute_pd(x2, 0x05); + __m256d y3 = _mm256_permute_pd(x3, 0x05); + _mm256_storeu_pd(&x[i + 0], _mm256_addsub_pd(da_r 
* x0, da_i * y0)); + _mm256_storeu_pd(&x[i + 4], _mm256_addsub_pd(da_r * x1, da_i * y1)); + _mm256_storeu_pd(&x[i + 8], _mm256_addsub_pd(da_r * x2, da_i * y2)); + _mm256_storeu_pd(&x[i + 12], _mm256_addsub_pd(da_r * x3, da_i * y3)); + } +#endif +} + + +static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + __m512d da_i = _mm512_set1_pd(alpha[1]) * _mm512_set4_pd(1, -1, 1, -1); + for (; i < n2; i += 16) { + __m512d y0 = _mm512_permute_pd(_mm512_loadu_pd(&x[i + 0]), 0x55); + __m512d y1 = _mm512_permute_pd(_mm512_loadu_pd(&x[i + 8]), 0x55); + _mm512_storeu_pd(&x[i + 0], da_i * y0); + _mm512_storeu_pd(&x[i + 8], da_i * y1); + } +#else + __m256d da_i = _mm256_set1_pd(alpha[1]) * _mm256_set_pd(1, -1, 1, -1); + for (; i < n2; i += 16) { + __m256d y0 = _mm256_permute_pd(_mm256_loadu_pd(&x[i + 0]), 0x05); + __m256d y1 = _mm256_permute_pd(_mm256_loadu_pd(&x[i + 8]), 0x05); + __m256d y2 = _mm256_permute_pd(_mm256_loadu_pd(&x[i + 16]), 0x05); + __m256d y3 = _mm256_permute_pd(_mm256_loadu_pd(&x[i + 24]), 0x05); + _mm256_storeu_pd(&x[i + 0], da_i * y0); + _mm256_storeu_pd(&x[i + 4], da_i * y1); + _mm256_storeu_pd(&x[i + 8], da_i * y2); + _mm256_storeu_pd(&x[i + 12], da_i * y3); + } +#endif +} + + +static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + __m512d da_r = _mm512_set1_pd(alpha[0]); + for (; i < n2; i += 16) { + _mm512_storeu_pd(&x[i + 0], da_r * _mm512_loadu_pd(&x[i + 0])); + _mm512_storeu_pd(&x[i + 8], da_r * _mm512_loadu_pd(&x[i + 8])); + } +#else + __m256d da_r = _mm256_set1_pd(alpha[0]); + for (; i < n2; i += 16) { + _mm256_storeu_pd(&x[i + 0], da_r * _mm256_loadu_pd(&x[i + 0])); + _mm256_storeu_pd(&x[i + 4], da_r * _mm256_loadu_pd(&x[i + 4])); + _mm256_storeu_pd(&x[i + 8], da_r * _mm256_loadu_pd(&x[i + 8])); + _mm256_storeu_pd(&x[i + 12], da_r * _mm256_loadu_pd(&x[i + 12])); + } +#endif +} + + 
+static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + + /* question to self: Why is this not just memset() */ + +#ifdef __AVX512CD__ + __m512d zero = _mm512_setzero_pd(); + for (; i < n2; i += 16) { + _mm512_storeu_pd(&x[i], zero); + _mm512_storeu_pd(&x[i + 8], zero); + } +#else + __m256d zero = _mm256_setzero_pd(); + for (; i < n2; i += 16) { + _mm256_storeu_pd(&x[i + 0], zero); + _mm256_storeu_pd(&x[i + 4], zero); + _mm256_storeu_pd(&x[i + 8], zero); + _mm256_storeu_pd(&x[i + 12], zero); + } +#endif + +} + +#else +#include "zscal_microk_haswell-2.c" +#endif diff --git a/lapack-netlib/INSTALL/ilaver.c b/lapack-netlib/INSTALL/ilaver.c index 83ef3e0d85..b274af2928 100644 --- a/lapack-netlib/INSTALL/ilaver.c +++ b/lapack-netlib/INSTALL/ilaver.c @@ -573,7 +573,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* ===================================================================== */ *vers_major__ = 3; - *vers_minor__ = 9; + *vers_minor__ = 11; *vers_patch__ = 0; /* ===================================================================== */ diff --git a/lapack-netlib/INSTALL/ilaver.f b/lapack-netlib/INSTALL/ilaver.f index 79fe597ae7..a246c37cbd 100644 --- a/lapack-netlib/INSTALL/ilaver.f +++ b/lapack-netlib/INSTALL/ilaver.f @@ -60,7 +60,7 @@ SUBROUTINE ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) INTEGER VERS_MAJOR, VERS_MINOR, VERS_PATCH * ===================================================================== VERS_MAJOR = 3 - VERS_MINOR = 9 + VERS_MINOR = 11 VERS_PATCH = 0 * ===================================================================== * diff --git a/lapack-netlib/LAPACKE/example/example_DGELS_rowmajor.c b/lapack-netlib/LAPACKE/example/example_DGELS_rowmajor.c index 1c027f8623..a174fcaf02 100644 --- a/lapack-netlib/LAPACKE/example/example_DGELS_rowmajor.c +++ b/lapack-netlib/LAPACKE/example/example_DGELS_rowmajor.c @@ -49,11 +49,9 @@ LAPACKE_dgels (row-major, 
high-level) Example Program Results - -- LAPACKE Example routine (version 3.7.0) -- + -- LAPACKE Example routine -- -- LAPACK is a software package provided by Univ. of Tennessee, -- -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- - December 2016 - */ /* Calling DGELS using row-major layout */ @@ -66,8 +64,8 @@ int main (int argc, const char * argv[]) { /* Locals */ - double A[5][3] = {1,1,1,2,3,4,3,5,2,4,2,5,5,4,3}; - double b[5][2] = {-10,-3,12,14,14,12,16,16,18,16}; + double A[5][3] = {{1,1,1},{2,3,4},{3,5,2},{4,2,5},{5,4,3}}; + double b[5][2] = {{-10,-3},{12,14},{14,12},{16,16},{18,16}}; lapack_int info,m,n,lda,ldb,nrhs; /* Initialization */ diff --git a/lapack-netlib/LAPACKE/example/example_DGESV_colmajor.c b/lapack-netlib/LAPACKE/example/example_DGESV_colmajor.c index c8bdd6e4e2..44a470d477 100644 --- a/lapack-netlib/LAPACKE/example/example_DGESV_colmajor.c +++ b/lapack-netlib/LAPACKE/example/example_DGESV_colmajor.c @@ -25,11 +25,9 @@ LAPACKE_dgesv (col-major, high-level) Example Program Results - -- LAPACKE Example routine (version 3.7.0) -- + -- LAPACKE Example routine -- -- LAPACK is a software package provided by Univ. of Tennessee, -- -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- - December 2016 - */ /* Includes */ #include @@ -94,7 +92,7 @@ int main(int argc, char **argv) { /* Check for the exact singularity */ if( info > 0 ) { printf( "The diagonal element of the triangular factor of A,\n" ); - printf( "U(%i,%i) is zero, so that A is singular;\n", info, info ); + printf( "U(%" LAPACK_IFMT ",%" LAPACK_IFMT ") is zero, so that A is singular;\n", info, info ); printf( "the solution could not be computed.\n" ); exit( 1 ); } diff --git a/lapack-netlib/LAPACKE/example/example_DGESV_rowmajor.c b/lapack-netlib/LAPACKE/example/example_DGESV_rowmajor.c index 35bdcbcae2..5411ef0490 100644 --- a/lapack-netlib/LAPACKE/example/example_DGESV_rowmajor.c +++ b/lapack-netlib/LAPACKE/example/example_DGESV_rowmajor.c @@ -25,11 +25,9 @@ LAPACKE_dgesv (row-major, high-level) Example Program Results - -- LAPACKE Example routine (version 3.7.0) -- + -- LAPACKE Example routine -- -- LAPACK is a software package provided by Univ. of Tennessee, -- -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- - December 2016 - */ #include #include @@ -91,7 +89,7 @@ int main(int argc, char **argv) { /* Check for the exact singularity */ if( info > 0 ) { printf( "The diagonal element of the triangular factor of A,\n" ); - printf( "U(%i,%i) is zero, so that A is singular;\n", info, info ); + printf( "U(%" LAPACK_IFMT ",%" LAPACK_IFMT ") is zero, so that A is singular;\n", info, info ); printf( "the solution could not be computed.\n" ); exit( 1 ); } diff --git a/lapack-netlib/LAPACKE/example/lapacke_example_aux.c b/lapack-netlib/LAPACKE/example/lapacke_example_aux.c index 9b72eb6209..19fff79055 100644 --- a/lapack-netlib/LAPACKE/example/lapacke_example_aux.c +++ b/lapack-netlib/LAPACKE/example/lapacke_example_aux.c @@ -28,6 +28,6 @@ void print_matrix_colmajor( char* desc, lapack_int m, lapack_int n, double* mat, void print_vector( char* desc, lapack_int n, lapack_int* vec ) { lapack_int j; printf( "\n %s\n", desc ); - for( j = 0; j < n; j++ ) printf( " %6i", vec[j] ); + for( j = 0; j < n; j++ ) printf( " %6" LAPACK_IFMT, vec[j] ); printf( "\n" ); } diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index e4c39e1eea..3e7f9de5b6 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -12,11 +12,14 @@ #include #include +#include /* It seems all current Fortran compilers put strlen at end. * Some historical compilers put strlen after the str argument * or make the str argument into a struct. */ +#ifndef __EMSCRIPTEN__ #define LAPACK_FORTRAN_STRLEN_END +#endif /* Complex types are structures equivalent to the * Fortran complex types COMPLEX(4) and COMPLEX(8). 
@@ -80,11 +83,26 @@ extern "C" { /*----------------------------------------------------------------------------*/ #ifndef lapack_int -#define lapack_int int +#if defined(LAPACK_ILP64) +#define lapack_int int64_t +#else +#define lapack_int int32_t +#endif +#endif + +/* + * Integer format string + */ +#ifndef LAPACK_IFMT +#if defined(LAPACK_ILP64) +#define LAPACK_IFMT PRId64 +#else +#define LAPACK_IFMT PRId32 +#endif #endif #ifndef lapack_logical -#define lapack_logical lapack_int +#define lapack_logical lapack_int #endif /* f2c, hence clapack and MacOS Accelerate, returns double instead of float @@ -115,8 +133,10 @@ typedef lapack_logical (*LAPACK_Z_SELECT2) ( const lapack_complex_double*, const lapack_complex_double* ); #define LAPACK_lsame_base LAPACK_GLOBAL(lsame,LSAME) -lapack_logical LAPACK_lsame_base( char* ca, char* cb, - lapack_int lca, lapack_int lcb +lapack_logical LAPACK_lsame_base( const char* ca, const char* cb +#ifndef __EMSCRIPTEN__ + , lapack_int lca, lapack_int lcb +#endif #ifdef LAPACK_FORTRAN_STRLEN_END , size_t, size_t #endif @@ -3304,28 +3324,28 @@ void LAPACK_zgesdd_base( #endif #define LAPACK_cgesv LAPACK_GLOBAL(cgesv,CGESV) -void LAPACK_cgesv( +lapack_int LAPACK_cgesv( lapack_int const* n, lapack_int const* nrhs, lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv, lapack_complex_float* B, lapack_int const* ldb, lapack_int* info ); #define LAPACK_dgesv LAPACK_GLOBAL(dgesv,DGESV) -void LAPACK_dgesv( +lapack_int LAPACK_dgesv( lapack_int const* n, lapack_int const* nrhs, double* A, lapack_int const* lda, lapack_int* ipiv, double* B, lapack_int const* ldb, lapack_int* info ); #define LAPACK_sgesv LAPACK_GLOBAL(sgesv,SGESV) -void LAPACK_sgesv( +lapack_int LAPACK_sgesv( lapack_int const* n, lapack_int const* nrhs, float* A, lapack_int const* lda, lapack_int* ipiv, float* B, lapack_int const* ldb, lapack_int* info ); #define LAPACK_zgesv LAPACK_GLOBAL(zgesv,ZGESV) -void LAPACK_zgesv( +lapack_int LAPACK_zgesv( lapack_int const* n, 
lapack_int const* nrhs, lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv, lapack_complex_double* B, lapack_int const* ldb, @@ -3924,49 +3944,49 @@ void LAPACK_zgesvxx_base( #endif #define LAPACK_cgetf2 LAPACK_GLOBAL(cgetf2,CGETF2) -void LAPACK_cgetf2( +lapack_int LAPACK_cgetf2( lapack_int const* m, lapack_int const* n, lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv, lapack_int* info ); #define LAPACK_dgetf2 LAPACK_GLOBAL(dgetf2,DGETF2) -void LAPACK_dgetf2( +lapack_int LAPACK_dgetf2( lapack_int const* m, lapack_int const* n, double* A, lapack_int const* lda, lapack_int* ipiv, lapack_int* info ); #define LAPACK_sgetf2 LAPACK_GLOBAL(sgetf2,SGETF2) -void LAPACK_sgetf2( +lapack_int LAPACK_sgetf2( lapack_int const* m, lapack_int const* n, float* A, lapack_int const* lda, lapack_int* ipiv, lapack_int* info ); #define LAPACK_zgetf2 LAPACK_GLOBAL(zgetf2,ZGETF2) -void LAPACK_zgetf2( +lapack_int LAPACK_zgetf2( lapack_int const* m, lapack_int const* n, lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv, lapack_int* info ); #define LAPACK_cgetrf LAPACK_GLOBAL(cgetrf,CGETRF) -void LAPACK_cgetrf( +lapack_int LAPACK_cgetrf( lapack_int const* m, lapack_int const* n, lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv, lapack_int* info ); #define LAPACK_dgetrf LAPACK_GLOBAL(dgetrf,DGETRF) -void LAPACK_dgetrf( +lapack_int LAPACK_dgetrf( lapack_int const* m, lapack_int const* n, double* A, lapack_int const* lda, lapack_int* ipiv, lapack_int* info ); #define LAPACK_sgetrf LAPACK_GLOBAL(sgetrf,SGETRF) -void LAPACK_sgetrf( +lapack_int LAPACK_sgetrf( lapack_int const* m, lapack_int const* n, float* A, lapack_int const* lda, lapack_int* ipiv, lapack_int* info ); #define LAPACK_zgetrf LAPACK_GLOBAL(zgetrf,ZGETRF) -void LAPACK_zgetrf( +lapack_int LAPACK_zgetrf( lapack_int const* m, lapack_int const* n, lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv, lapack_int* info ); @@ -4024,7 +4044,7 @@ void LAPACK_zgetri( 
lapack_int* info ); #define LAPACK_cgetrs_base LAPACK_GLOBAL(cgetrs,CGETRS) -void LAPACK_cgetrs_base( +lapack_int LAPACK_cgetrs_base( char const* trans, lapack_int const* n, lapack_int const* nrhs, lapack_complex_float const* A, lapack_int const* lda, lapack_int const* ipiv, @@ -4041,7 +4061,7 @@ void LAPACK_cgetrs_base( #endif #define LAPACK_dgetrs_base LAPACK_GLOBAL(dgetrs,DGETRS) -void LAPACK_dgetrs_base( +lapack_int LAPACK_dgetrs_base( char const* trans, lapack_int const* n, lapack_int const* nrhs, double const* A, lapack_int const* lda, lapack_int const* ipiv, @@ -4058,7 +4078,7 @@ void LAPACK_dgetrs_base( #endif #define LAPACK_sgetrs_base LAPACK_GLOBAL(sgetrs,SGETRS) -void LAPACK_sgetrs_base( +lapack_int LAPACK_sgetrs_base( char const* trans, lapack_int const* n, lapack_int const* nrhs, float const* A, lapack_int const* lda, lapack_int const* ipiv, @@ -4075,7 +4095,7 @@ void LAPACK_sgetrs_base( #endif #define LAPACK_zgetrs_base LAPACK_GLOBAL(zgetrs,ZGETRS) -void LAPACK_zgetrs_base( +lapack_int LAPACK_zgetrs_base( char const* trans, lapack_int const* n, lapack_int const* nrhs, lapack_complex_double const* A, lapack_int const* lda, lapack_int const* ipiv, @@ -5268,7 +5288,7 @@ void LAPACK_zggrqf( lapack_int* info ); #define LAPACK_cggsvd_base LAPACK_GLOBAL(cggsvd,CGGSVD) -lapack_int LAPACK_cggsvd_base( +void LAPACK_cggsvd_base( char const* jobu, char const* jobv, char const* jobq, lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l, @@ -5291,7 +5311,7 @@ lapack_int LAPACK_cggsvd_base( #endif #define LAPACK_sggsvd_base LAPACK_GLOBAL(sggsvd,SGGSVD) -lapack_int LAPACK_sggsvd_base( +void LAPACK_sggsvd_base( char const* jobu, char const* jobv, char const* jobq, lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l, @@ -5313,7 +5333,7 @@ lapack_int LAPACK_sggsvd_base( #endif #define LAPACK_dggsvd_base LAPACK_GLOBAL(dggsvd,DGGSVD) -lapack_int LAPACK_dggsvd_base( +void LAPACK_dggsvd_base( 
char const* jobu, char const* jobv, char const* jobq, lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l, @@ -5335,7 +5355,7 @@ lapack_int LAPACK_dggsvd_base( #endif #define LAPACK_zggsvd_base LAPACK_GLOBAL(zggsvd,ZGGSVD) -lapack_int LAPACK_zggsvd_base( +void LAPACK_zggsvd_base( char const* jobu, char const* jobv, char const* jobq, lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l, @@ -5456,7 +5476,7 @@ void LAPACK_zggsvd3_base( #endif #define LAPACK_sggsvp_base LAPACK_GLOBAL(sggsvp,SGGSVP) -lapack_int LAPACK_sggsvp_base( +void LAPACK_sggsvp_base( char const* jobu, char const* jobv, char const* jobq, lapack_int const* m, lapack_int const* p, lapack_int const* n, float* a, lapack_int const* lda, @@ -5479,7 +5499,7 @@ lapack_int LAPACK_sggsvp_base( #endif #define LAPACK_dggsvp_base LAPACK_GLOBAL(dggsvp,DGGSVP) -lapack_int LAPACK_dggsvp_base( +void LAPACK_dggsvp_base( char const* jobu, char const* jobv, char const* jobq, lapack_int const* m, lapack_int const* p, lapack_int const* n, double* a, lapack_int const* lda, @@ -5502,7 +5522,7 @@ lapack_int LAPACK_dggsvp_base( #endif #define LAPACK_cggsvp_base LAPACK_GLOBAL(cggsvp,CGGSVP) -lapack_int LAPACK_cggsvp_base( +void LAPACK_cggsvp_base( char const* jobu, char const* jobv, char const* jobq, lapack_int const* m, lapack_int const* p, lapack_int const* n, lapack_complex_float* a, lapack_int const* lda, @@ -5524,7 +5544,7 @@ lapack_int LAPACK_cggsvp_base( #endif #define LAPACK_zggsvp_base LAPACK_GLOBAL(zggsvp,ZGGSVP) -lapack_int LAPACK_zggsvp_base( +void LAPACK_zggsvp_base( char const* jobu, char const* jobv, char const* jobq, lapack_int const* m, lapack_int const* p, lapack_int const* n, lapack_complex_double* a, lapack_int const* lda, @@ -10926,22 +10946,22 @@ void LAPACK_zlassq( double* sumsq ); #define LAPACK_claswp LAPACK_GLOBAL(claswp,CLASWP) -void LAPACK_claswp( +lapack_int LAPACK_claswp( lapack_int const* n, lapack_complex_float* A, 
lapack_int const* lda, lapack_int const* k1, lapack_int const* k2, lapack_int const* ipiv, lapack_int const* incx ); #define LAPACK_dlaswp LAPACK_GLOBAL(dlaswp,DLASWP) -void LAPACK_dlaswp( +lapack_int LAPACK_dlaswp( lapack_int const* n, double* A, lapack_int const* lda, lapack_int const* k1, lapack_int const* k2, lapack_int const* ipiv, lapack_int const* incx ); #define LAPACK_slaswp LAPACK_GLOBAL(slaswp,SLASWP) -void LAPACK_slaswp( +lapack_int LAPACK_slaswp( lapack_int const* n, float* A, lapack_int const* lda, lapack_int const* k1, lapack_int const* k2, lapack_int const* ipiv, lapack_int const* incx ); #define LAPACK_zlaswp LAPACK_GLOBAL(zlaswp,ZLASWP) -void LAPACK_zlaswp( +lapack_int LAPACK_zlaswp( lapack_int const* n, lapack_complex_double* A, lapack_int const* lda, lapack_int const* k1, lapack_int const* k2, lapack_int const* ipiv, lapack_int const* incx ); @@ -11034,7 +11054,7 @@ void LAPACK_zlatms_base( #endif #define LAPACK_clauum_base LAPACK_GLOBAL(clauum,CLAUUM) -void LAPACK_clauum_base( +lapack_int LAPACK_clauum_base( char const* uplo, lapack_int const* n, lapack_complex_float* A, lapack_int const* lda, @@ -11050,7 +11070,7 @@ void LAPACK_clauum_base( #endif #define LAPACK_dlauum_base LAPACK_GLOBAL(dlauum,DLAUUM) -void LAPACK_dlauum_base( +lapack_int LAPACK_dlauum_base( char const* uplo, lapack_int const* n, double* A, lapack_int const* lda, @@ -11066,7 +11086,7 @@ void LAPACK_dlauum_base( #endif #define LAPACK_slauum_base LAPACK_GLOBAL(slauum,SLAUUM) -void LAPACK_slauum_base( +lapack_int LAPACK_slauum_base( char const* uplo, lapack_int const* n, float* A, lapack_int const* lda, @@ -11082,7 +11102,7 @@ void LAPACK_slauum_base( #endif #define LAPACK_zlauum_base LAPACK_GLOBAL(zlauum,ZLAUUM) -void LAPACK_zlauum_base( +lapack_int LAPACK_zlauum_base( char const* uplo, lapack_int const* n, lapack_complex_double* A, lapack_int const* lda, @@ -11098,7 +11118,7 @@ void LAPACK_zlauum_base( #endif #define LAPACK_ilaver LAPACK_GLOBAL(ilaver,ILAVER) -void 
LAPACK_ilaver( +lapack_int LAPACK_ilaver( lapack_int* vers_major, lapack_int* vers_minor, lapack_int* vers_patch ); #define LAPACK_dopgtr_base LAPACK_GLOBAL(dopgtr,DOPGTR) @@ -13359,7 +13379,7 @@ void LAPACK_zpotf2_base( #endif #define LAPACK_cpotrf_base LAPACK_GLOBAL(cpotrf,CPOTRF) -void LAPACK_cpotrf_base( +lapack_int LAPACK_cpotrf_base( char const* uplo, lapack_int const* n, lapack_complex_float* A, lapack_int const* lda, @@ -13375,7 +13395,7 @@ void LAPACK_cpotrf_base( #endif #define LAPACK_dpotrf_base LAPACK_GLOBAL(dpotrf,DPOTRF) -void LAPACK_dpotrf_base( +lapack_int LAPACK_dpotrf_base( char const* uplo, lapack_int const* n, double* A, lapack_int const* lda, @@ -13391,7 +13411,7 @@ void LAPACK_dpotrf_base( #endif #define LAPACK_spotrf_base LAPACK_GLOBAL(spotrf,SPOTRF) -void LAPACK_spotrf_base( +lapack_int LAPACK_spotrf_base( char const* uplo, lapack_int const* n, float* A, lapack_int const* lda, @@ -13407,7 +13427,7 @@ void LAPACK_spotrf_base( #endif #define LAPACK_zpotrf_base LAPACK_GLOBAL(zpotrf,ZPOTRF) -void LAPACK_zpotrf_base( +lapack_int LAPACK_zpotrf_base( char const* uplo, lapack_int const* n, lapack_complex_double* A, lapack_int const* lda, @@ -21986,8 +22006,86 @@ void LAPACK_ztrsyl_base( #define LAPACK_ztrsyl(...) LAPACK_ztrsyl_base(__VA_ARGS__) #endif +#define LAPACK_ctrsyl3_base LAPACK_GLOBAL(ctrsyl3,CTRSYL3) +void LAPACK_ctrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* C, lapack_int const* ldc, float* scale, + float* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_ctrsyl3(...) LAPACK_ctrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_ctrsyl3(...) 
LAPACK_ctrsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_dtrsyl3_base LAPACK_GLOBAL(dtrsyl3,DTRSYL3) +void LAPACK_dtrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + double const* A, lapack_int const* lda, + double const* B, lapack_int const* ldb, + double* C, lapack_int const* ldc, double* scale, + lapack_int* iwork, lapack_int const* liwork, + double* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_dtrsyl3(...) LAPACK_dtrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_dtrsyl3(...) LAPACK_dtrsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_strsyl3_base LAPACK_GLOBAL(strsyl3,STRSYL3) +void LAPACK_strsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + float const* A, lapack_int const* lda, + float const* B, lapack_int const* ldb, + float* C, lapack_int const* ldc, float* scale, + lapack_int* iwork, lapack_int const* liwork, + float* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_strsyl3(...) LAPACK_strsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_strsyl3(...) LAPACK_strsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_ztrsyl3_base LAPACK_GLOBAL(ztrsyl3,ZTRSYL3) +void LAPACK_ztrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* C, lapack_int const* ldc, double* scale, + double* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_ztrsyl3(...) 
LAPACK_ztrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_ztrsyl3(...) LAPACK_ztrsyl3_base(__VA_ARGS__) +#endif + #define LAPACK_ctrtri_base LAPACK_GLOBAL(ctrtri,CTRTRI) -void LAPACK_ctrtri_base( +lapack_int LAPACK_ctrtri_base( char const* uplo, char const* diag, lapack_int const* n, lapack_complex_float* A, lapack_int const* lda, @@ -22003,7 +22101,7 @@ void LAPACK_ctrtri_base( #endif #define LAPACK_dtrtri_base LAPACK_GLOBAL(dtrtri,DTRTRI) -void LAPACK_dtrtri_base( +lapack_int LAPACK_dtrtri_base( char const* uplo, char const* diag, lapack_int const* n, double* A, lapack_int const* lda, @@ -22019,7 +22117,7 @@ void LAPACK_dtrtri_base( #endif #define LAPACK_strtri_base LAPACK_GLOBAL(strtri,STRTRI) -void LAPACK_strtri_base( +lapack_int LAPACK_strtri_base( char const* uplo, char const* diag, lapack_int const* n, float* A, lapack_int const* lda, @@ -22035,7 +22133,7 @@ void LAPACK_strtri_base( #endif #define LAPACK_ztrtri_base LAPACK_GLOBAL(ztrtri,ZTRTRI) -void LAPACK_ztrtri_base( +lapack_int LAPACK_ztrtri_base( char const* uplo, char const* diag, lapack_int const* n, lapack_complex_double* A, lapack_int const* lda, @@ -22051,7 +22149,7 @@ void LAPACK_ztrtri_base( #endif #define LAPACK_ctrtrs_base LAPACK_GLOBAL(ctrtrs,CTRTRS) -void LAPACK_ctrtrs_base( +lapack_int LAPACK_ctrtrs_base( char const* uplo, char const* trans, char const* diag, lapack_int const* n, lapack_int const* nrhs, lapack_complex_float const* A, lapack_int const* lda, @@ -22068,7 +22166,7 @@ void LAPACK_ctrtrs_base( #endif #define LAPACK_dtrtrs_base LAPACK_GLOBAL(dtrtrs,DTRTRS) -void LAPACK_dtrtrs_base( +lapack_int LAPACK_dtrtrs_base( char const* uplo, char const* trans, char const* diag, lapack_int const* n, lapack_int const* nrhs, double const* A, lapack_int const* lda, @@ -22085,7 +22183,7 @@ void LAPACK_dtrtrs_base( #endif #define LAPACK_strtrs_base LAPACK_GLOBAL(strtrs,STRTRS) -void LAPACK_strtrs_base( +lapack_int LAPACK_strtrs_base( char const* uplo, char const* trans, char const* diag, 
lapack_int const* n, lapack_int const* nrhs, float const* A, lapack_int const* lda, @@ -22102,7 +22200,7 @@ void LAPACK_strtrs_base( #endif #define LAPACK_ztrtrs_base LAPACK_GLOBAL(ztrtrs,ZTRTRS) -void LAPACK_ztrtrs_base( +lapack_int LAPACK_ztrtrs_base( char const* uplo, char const* trans, char const* diag, lapack_int const* n, lapack_int const* nrhs, lapack_complex_double const* A, lapack_int const* lda, diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index f6fbfcc33b..9998b15047 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -2313,6 +2313,19 @@ lapack_int LAPACKE_zlagge( int matrix_layout, lapack_int m, lapack_int n, float LAPACKE_slamch( char cmach ); double LAPACKE_dlamch( char cmach ); +float LAPACKE_slangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const float* ab, + lapack_int ldab ); +double LAPACKE_dlangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const double* ab, + lapack_int ldab ); +float LAPACKE_clangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_float* ab, lapack_int ldab ); +double LAPACKE_zlangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_double* ab, lapack_int ldab ); + float LAPACKE_slange( int matrix_layout, char norm, lapack_int m, lapack_int n, const float* a, lapack_int lda ); double LAPACKE_dlange( int matrix_layout, char norm, lapack_int m, @@ -4477,6 +4490,23 @@ lapack_int LAPACKE_ztrsyl( int matrix_layout, char trana, char tranb, lapack_complex_double* c, lapack_int ldc, double* scale ); +lapack_int LAPACKE_strsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, const float* b, + lapack_int ldb, float* c, lapack_int ldc, + float* scale ); +lapack_int LAPACKE_dtrsyl3( int matrix_layout, 
char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, const double* b, + lapack_int ldb, double* c, lapack_int ldc, + double* scale ); +lapack_int LAPACKE_ztrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale ); + lapack_int LAPACKE_strtri( int matrix_layout, char uplo, char diag, lapack_int n, float* a, lapack_int lda ); lapack_int LAPACKE_dtrtri( int matrix_layout, char uplo, char diag, lapack_int n, @@ -7576,6 +7606,21 @@ double LAPACKE_dlapy3_work( double x, double y, double z ); float LAPACKE_slamch_work( char cmach ); double LAPACKE_dlamch_work( char cmach ); +float LAPACKE_slangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const float* ab, + lapack_int ldab, float* work ); +double LAPACKE_dlangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const double* ab, + lapack_int ldab, double* work ); +float LAPACKE_clangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_float* ab, lapack_int ldab, + float* work ); +double LAPACKE_zlangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_double* ab, lapack_int ldab, + double* work ); + float LAPACKE_slange_work( int matrix_layout, char norm, lapack_int m, lapack_int n, const float* a, lapack_int lda, float* work ); @@ -10174,6 +10219,35 @@ lapack_int LAPACKE_ztrsyl_work( int matrix_layout, char trana, char tranb, lapack_complex_double* c, lapack_int ldc, double* scale ); +lapack_int LAPACKE_strsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, + const float* b, lapack_int ldb, + float* c, lapack_int 
ldc, float* scale, + lapack_int* iwork, lapack_int liwork, + float* swork, lapack_int ldswork ); +lapack_int LAPACKE_dtrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, + const double* b, lapack_int ldb, + double* c, lapack_int ldc, double* scale, + lapack_int* iwork, lapack_int liwork, + double* swork, lapack_int ldswork ); +lapack_int LAPACKE_ctrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* c, lapack_int ldc, + float* scale, float* swork, + lapack_int ldswork ); +lapack_int LAPACKE_ztrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale, double* swork, + lapack_int ldswork ); + lapack_int LAPACKE_strtri_work( int matrix_layout, char uplo, char diag, lapack_int n, float* a, lapack_int lda ); lapack_int LAPACKE_dtrtri_work( int matrix_layout, char uplo, char diag, diff --git a/lapack-netlib/LAPACKE/include/lapacke_config.h b/lapack-netlib/LAPACKE/include/lapacke_config.h index 4a7d15760c..c64fc4416a 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_config.h +++ b/lapack-netlib/LAPACKE/include/lapacke_config.h @@ -42,17 +42,29 @@ extern "C" { #include #include +#include #ifndef lapack_int #if defined(LAPACK_ILP64) -#define lapack_int int64_t +#define lapack_int int64_t #else -#define lapack_int int32_t +#define lapack_int int32_t +#endif +#endif + +/* + * Integer format string + */ +#ifndef LAPACK_IFMT +#if defined(LAPACK_ILP64) +#define LAPACK_IFMT PRId64 +#else +#define LAPACK_IFMT PRId32 #endif #endif #ifndef lapack_logical -#define lapack_logical lapack_int +#define lapack_logical lapack_int #endif 
#ifndef LAPACK_COMPLEX_CUSTOM diff --git a/lapack-netlib/LAPACKE/include/lapacke_utils.h b/lapack-netlib/LAPACKE/include/lapacke_utils.h index f84604e8a3..332a5024fb 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_utils.h +++ b/lapack-netlib/LAPACKE/include/lapacke_utils.h @@ -68,7 +68,7 @@ void LAPACKE_xerbla( const char *name, lapack_int info ); /* Compare two chars (case-insensitive) */ lapack_logical LAPACKE_lsame( char ca, char cb ) #if defined __GNUC__ - __attribute__((const)) + __attribute__((const)) #endif ; @@ -128,6 +128,10 @@ void LAPACKE_ctp_trans( int matrix_layout, char uplo, char diag, void LAPACKE_ctr_trans( int matrix_layout, char uplo, char diag, lapack_int n, const lapack_complex_float *in, lapack_int ldin, lapack_complex_float *out, lapack_int ldout ); +void LAPACKE_ctz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_float *in, lapack_int ldin, + lapack_complex_float *out, lapack_int ldout ); void LAPACKE_dgb_trans( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, @@ -178,6 +182,10 @@ void LAPACKE_dtp_trans( int matrix_layout, char uplo, char diag, void LAPACKE_dtr_trans( int matrix_layout, char uplo, char diag, lapack_int n, const double *in, lapack_int ldin, double *out, lapack_int ldout ); +void LAPACKE_dtz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const double *in, lapack_int ldin, + double *out, lapack_int ldout ); void LAPACKE_sgb_trans( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, @@ -228,6 +236,10 @@ void LAPACKE_stp_trans( int matrix_layout, char uplo, char diag, void LAPACKE_str_trans( int matrix_layout, char uplo, char diag, lapack_int n, const float *in, lapack_int ldin, float *out, lapack_int ldout ); +void LAPACKE_stz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const float *in, lapack_int ldin, + float 
*out, lapack_int ldout ); void LAPACKE_zgb_trans( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, @@ -284,6 +296,10 @@ void LAPACKE_ztp_trans( int matrix_layout, char uplo, char diag, void LAPACKE_ztr_trans( int matrix_layout, char uplo, char diag, lapack_int n, const lapack_complex_double *in, lapack_int ldin, lapack_complex_double *out, lapack_int ldout ); +void LAPACKE_ztz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_double *in, lapack_int ldin, + lapack_complex_double *out, lapack_int ldout ); /* NaN checkers */ #define LAPACK_SISNAN( x ) ( x != x ) @@ -376,6 +392,10 @@ lapack_logical LAPACKE_ctr_nancheck( int matrix_layout, char uplo, char diag, lapack_int n, const lapack_complex_float *a, lapack_int lda ); +lapack_logical LAPACKE_ctz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_float *a, + lapack_int lda ); lapack_logical LAPACKE_dgb_nancheck( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, @@ -440,6 +460,9 @@ lapack_logical LAPACKE_dtr_nancheck( int matrix_layout, char uplo, char diag, lapack_int n, const double *a, lapack_int lda ); +lapack_logical LAPACKE_dtz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const double *a, lapack_int lda ); lapack_logical LAPACKE_sgb_nancheck( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, @@ -504,6 +527,9 @@ lapack_logical LAPACKE_str_nancheck( int matrix_layout, char uplo, char diag, lapack_int n, const float *a, lapack_int lda ); +lapack_logical LAPACKE_stz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const float *a, lapack_int lda ); lapack_logical LAPACKE_zgb_nancheck( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, @@ -574,6 +600,10 @@ lapack_logical LAPACKE_ztr_nancheck( int matrix_layout, char uplo, char 
diag, lapack_int n, const lapack_complex_double *a, lapack_int lda ); +lapack_logical LAPACKE_ztz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_double *a, + lapack_int lda ); #ifdef __cplusplus } diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index 7f827e1c9c..9c02c14458 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -358,6 +358,8 @@ lapacke_clacrm.o \ lapacke_clacrm_work.o \ lapacke_clag2z.o \ lapacke_clag2z_work.o \ +lapacke_clangb.o \ +lapacke_clangb_work.o \ lapacke_clange.o \ lapacke_clange_work.o \ lapacke_clanhe.o \ @@ -842,6 +844,8 @@ lapacke_dlag2s.o \ lapacke_dlag2s_work.o \ lapacke_dlamch.o \ lapacke_dlamch_work.o \ +lapacke_dlangb.o \ +lapacke_dlangb_work.o \ lapacke_dlange.o \ lapacke_dlange_work.o \ lapacke_dlansy.o \ @@ -1414,6 +1418,8 @@ lapacke_slacpy.o \ lapacke_slacpy_work.o \ lapacke_slamch.o \ lapacke_slamch_work.o \ +lapacke_slangb.o \ +lapacke_slangb_work.o \ lapacke_slange.o \ lapacke_slange_work.o \ lapacke_slansy.o \ @@ -2116,6 +2122,8 @@ lapacke_zlacrm.o \ lapacke_zlacrm_work.o \ lapacke_zlag2c.o \ lapacke_zlag2c_work.o \ +lapacke_zlangb.o \ +lapacke_zlangb_work.o \ lapacke_zlange.o \ lapacke_zlange_work.o \ lapacke_zlanhe.o \ diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgeev_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgeev_work.c index 081f5b1298..af6a247edd 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgeev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgeev_work.c @@ -61,12 +61,12 @@ lapack_int LAPACKE_cgeev_work( int matrix_layout, char jobvl, char jobvr, LAPACKE_xerbla( "LAPACKE_cgeev_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -9; LAPACKE_xerbla( "LAPACKE_cgeev_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -11; 
LAPACKE_xerbla( "LAPACKE_cgeev_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgeevx_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgeevx_work.c index 2257c64df5..632ddd6619 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgeevx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgeevx_work.c @@ -65,12 +65,12 @@ lapack_int LAPACKE_cgeevx_work( int matrix_layout, char balanc, char jobvl, LAPACKE_xerbla( "LAPACKE_cgeevx_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -11; LAPACKE_xerbla( "LAPACKE_cgeevx_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -13; LAPACKE_xerbla( "LAPACKE_cgeevx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c index 8406635e99..05ff8d57f5 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; float* rwork = NULL; float rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_cgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgges_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgges_work.c index ff74939a3a..be0b8347f1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgges_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgges_work.c @@ -72,12 +72,12 @@ lapack_int LAPACKE_cgges_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_cgges_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -15; LAPACKE_xerbla( "LAPACKE_cgges_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -17; 
LAPACKE_xerbla( "LAPACKE_cgges_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_cggesx_work.c b/lapack-netlib/LAPACKE/src/lapacke_cggesx_work.c index 7edb1fa9bf..311fe6e0a6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cggesx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cggesx_work.c @@ -76,12 +76,12 @@ lapack_int LAPACKE_cggesx_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_cggesx_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -16; LAPACKE_xerbla( "LAPACKE_cggesx_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -18; LAPACKE_xerbla( "LAPACKE_cggesx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_clangb.c b/lapack-netlib/LAPACKE/src/lapacke_clangb.c new file mode 100644 index 0000000000..0d61575aaa --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_clangb.c @@ -0,0 +1,73 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function clangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +float LAPACKE_clangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_float* ab, lapack_int ldab ) +{ + lapack_int info = 0; + float res = 0.; + float* work = NULL; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_clangb", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cgb_nancheck( matrix_layout, n, n, kl, ku, ab, ldab ) ) { + return -6; + } + } +#endif + /* Allocate memory for working array(s) */ + if( LAPACKE_lsame( norm, 'i' ) ) { + work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call middle-level interface */ + res = LAPACKE_clangb_work( matrix_layout, norm, n, kl, ku, ab, ldab, work ); + /* Release 
memory and exit */ + if( LAPACKE_lsame( norm, 'i' ) ) { + LAPACKE_free( work ); + } +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_clangb", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_clangb_work.c b/lapack-netlib/LAPACKE/src/lapacke_clangb_work.c new file mode 100644 index 0000000000..b5b2cf8163 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_clangb_work.c @@ -0,0 +1,84 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function clangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +float LAPACKE_clangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_float* ab, lapack_int ldab, + float* work ) +{ + lapack_int info = 0; + float res = 0.; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + res = LAPACK_clangb( &norm, &n, &kl, &ku, ab, &ldab, work ); + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + char norm_lapack; + float* work_lapack = NULL; + /* Check leading dimension(s) */ + if( ldab < kl+ku+1 ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_clangb_work", info ); + return info; + } + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + /* Allocate memory for work array(s) */ + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); + if( work_lapack == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call LAPACK function */ + res = LAPACK_clangb( &norm, &n, &ku, &kl, ab, &ldab, work ); + /* Release memory and exit */ + if( 
work_lapack ) { + LAPACKE_free( work_lapack ); + } +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_clangb_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_clangb_work", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_clantr.c b/lapack-netlib/LAPACKE/src/lapacke_clantr.c index 88e765f2b8..e00b6c5788 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clantr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clantr.c @@ -33,8 +33,8 @@ #include "lapacke_utils.h" float LAPACKE_clantr( int matrix_layout, char norm, char uplo, char diag, - lapack_int m, lapack_int n, const lapack_complex_float* a, - lapack_int lda ) + lapack_int m, lapack_int n, const lapack_complex_float* a, + lapack_int lda ) { lapack_int info = 0; float res = 0.; @@ -46,7 +46,7 @@ float LAPACKE_clantr( int matrix_layout, char norm, char uplo, char diag, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_ctr_nancheck( matrix_layout, uplo, diag, MIN(m,n), a, lda ) ) { + if( LAPACKE_ctz_nancheck( matrix_layout, 'f', uplo, diag, m, n, a, lda ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c index ccd34cecdf..ed12b476eb 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c @@ -42,7 +42,9 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct lapack_int info = 0; lapack_int ldwork; lapack_complex_float* work = NULL; - lapack_int ncols_v, nrows_v; + lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_clarfb", -1 ); return -1; @@ -50,59 +52,27 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( 
LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - lapack_int lrv, lcv; /* row, column stride */ - if( matrix_layout == LAPACK_COL_MAJOR ) { - lrv = 1; - lcv = ldv; - } else { - lrv = ldv; - lcv = 1; - } - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { - return -13; + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 
'l' : 'u'; + + if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) { + LAPACKE_xerbla( "LAPACKE_clarfb", -8 ); + return -8; + } + if( LAPACKE_ctz_nancheck( matrix_layout, direct, uplo, 'u', + nrows_v, ncols_v, v, ldv ) ) { + return -9; } if( LAPACKE_cge_nancheck( matrix_layout, k, k, t, ldt ) ) { return -11; } - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_clarfb", -8 ); - return -8; - } - if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*lrv], ldv ) ) - return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_clarfb", -8 ); - return -8; - } - if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, - &v[(ncols_v-k)*lcv], ldv ) ) - return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) - return -9; + if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -13; } } #endif diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c index 3ad97c22d0..545769b83c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c @@ -42,6 +42,8 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans, { lapack_int info = 0; lapack_int nrows_v, ncols_v; + 
lapack_logical left, col, forward; + char uplo; lapack_int ldc_t, ldt_t, ldv_t; lapack_complex_float *v_t = NULL, *t_t = NULL, *c_t = NULL; if( matrix_layout == LAPACK_COL_MAJOR ) { @@ -52,16 +54,14 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); + + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 
'l' : 'u'; + ldc_t = MAX(1,m); ldt_t = MAX(1,k); ldv_t = MAX(1,nrows_v); @@ -81,6 +81,11 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_clarfb_work", info ); return info; } + if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_clarfb_work", info ); + return info; + } /* Allocate memory for temporary array(s) */ v_t = (lapack_complex_float*) LAPACKE_malloc( sizeof(lapack_complex_float) * @@ -102,36 +107,8 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans, goto exit_level_2; } /* Transpose input matrices */ - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_ctr_trans( matrix_layout, 'l', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_cge_trans( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], ldv, - &v_t[k], ldv_t ); - } else if( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_clarfb_work", -8 ); - return -8; - } - LAPACKE_ctr_trans( matrix_layout, 'u', 'u', k, &v[(nrows_v-k)*ldv], - ldv, &v_t[nrows_v-k], ldv_t ); - LAPACKE_cge_trans( matrix_layout, nrows_v-k, ncols_v, v, ldv, v_t, - ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_ctr_trans( matrix_layout, 'u', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_cge_trans( matrix_layout, nrows_v, ncols_v-k, &v[k], ldv, - &v_t[k*ldv_t], ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_clarfb_work", -8 ); - return -8; - } - LAPACKE_ctr_trans( matrix_layout, 'l', 'u', k, &v[ncols_v-k], ldv, - &v_t[(ncols_v-k)*ldv_t], ldv_t ); - LAPACKE_cge_trans( matrix_layout, nrows_v, ncols_v-k, v, ldv, v_t, - ldv_t ); - } + LAPACKE_ctz_trans( matrix_layout, direct, uplo, 'u', nrows_v, ncols_v, + v, ldv, v_t, ldv_t ); LAPACKE_cge_trans( matrix_layout, k, k, t, ldt, t_t, ldt_t ); LAPACKE_cge_trans( 
matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctgsen.c b/lapack-netlib/LAPACKE/src/lapacke_ctgsen.c index c543cd44f9..b69c08bb44 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ctgsen.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ctgsen.c @@ -86,12 +86,10 @@ lapack_int LAPACKE_ctgsen( int matrix_layout, lapack_int ijob, liwork = iwork_query; lwork = LAPACK_C2INT( work_query ); /* Allocate memory for work arrays */ - if( ijob != 0 ) { - iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); - if( iwork == NULL ) { - info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_0; - } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; } work = (lapack_complex_float*) LAPACKE_malloc( sizeof(lapack_complex_float) * lwork ); @@ -106,9 +104,7 @@ lapack_int LAPACKE_ctgsen( int matrix_layout, lapack_int ijob, /* Release memory and exit */ LAPACKE_free( work ); exit_level_1: - if( ijob != 0 ) { - LAPACKE_free( iwork ); - } + LAPACKE_free( iwork ); exit_level_0: if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_ctgsen", info ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctpmqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_ctpmqrt_work.c index 5ec948e7bb..e01664bdf8 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ctpmqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ctpmqrt_work.c @@ -50,16 +50,24 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,k); + lapack_int nrowsA, ncolsA, nrowsV; + if ( side == LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } + else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } + else { + info = -2; + LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info ); + return info; + } + lapack_int 
lda_t = MAX(1,nrowsA); lapack_int ldb_t = MAX(1,m); - lapack_int ldt_t = MAX(1,ldt); - lapack_int ldv_t = MAX(1,ldv); + lapack_int ldt_t = MAX(1,nb); + lapack_int ldv_t = MAX(1,nrowsV); lapack_complex_float* v_t = NULL; lapack_complex_float* t_t = NULL; lapack_complex_float* a_t = NULL; lapack_complex_float* b_t = NULL; /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < ncolsA ) { info = -14; LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info ); return info; @@ -69,7 +77,7 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info ); return info; } - if( ldt < nb ) { + if( ldt < k ) { info = -12; LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info ); return info; @@ -87,13 +95,13 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_0; } t_t = (lapack_complex_float*) - LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,nb) ); + LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,k) ); if( t_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_1; } a_t = (lapack_complex_float*) - LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,m) ); + LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,ncolsA) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_2; @@ -105,10 +113,10 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_3; } /* Transpose input matrices */ - LAPACKE_cge_trans( matrix_layout, ldv, k, v, ldv, v_t, ldv_t ); - LAPACKE_cge_trans( matrix_layout, ldt, nb, t, ldt, t_t, ldt_t ); - LAPACKE_cge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); - LAPACKE_cge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_cge_trans( LAPACK_ROW_MAJOR, nrowsV, k, v, ldv, v_t, ldv_t ); + LAPACKE_cge_trans( LAPACK_ROW_MAJOR, nb, k, t, ldt, t_t, ldt_t ); + LAPACKE_cge_trans( LAPACK_ROW_MAJOR, nrowsA, ncolsA, a, lda, a_t, lda_t ); + LAPACKE_cge_trans( 
LAPACK_ROW_MAJOR, m, n, b, ldb, b_t, ldb_t ); /* Call LAPACK function and adjust info */ LAPACK_ctpmqrt( &side, &trans, &m, &n, &k, &l, &nb, v_t, &ldv_t, t_t, &ldt_t, a_t, &lda_t, b_t, &ldb_t, work, &info ); @@ -116,7 +124,7 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } /* Transpose output matrices */ - LAPACKE_cge_trans( LAPACK_COL_MAJOR, k, m, a_t, lda_t, a, lda ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, nrowsA, ncolsA, a_t, lda_t, a, lda ); LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); /* Release memory and exit */ LAPACKE_free( b_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3.c b/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3.c new file mode 100644 index 0000000000..c931aac488 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3.c @@ -0,0 +1,56 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ctrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* c, lapack_int ldc, + float* scale ) +{ + lapack_int info = 0; + float swork_query[2]; + float* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_ctrsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_ctrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, swork_query, ldswork ); + if( info != 0 ) { + goto 
exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (float*)LAPACKE_malloc( sizeof(float) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_ctrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ctrsyl3", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3_work.c b/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3_work.c new file mode 100644 index 0000000000..09c08d92aa --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3_work.c @@ -0,0 +1,88 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ctrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* c, lapack_int ldc, + float* scale, float* swork, + lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_ctrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + lapack_complex_float* a_t = NULL; + lapack_complex_float* b_t = NULL; + lapack_complex_float* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( 
"LAPACKE_ctrsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_cge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_cge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_cge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_ctrsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgeev_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgeev_work.c index c4de72394e..424f5d1766 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgeev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgeev_work.c @@ -59,12 +59,12 @@ lapack_int LAPACKE_dgeev_work( int matrix_layout, char jobvl, char jobvr, LAPACKE_xerbla( "LAPACKE_dgeev_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( 
LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -10; LAPACKE_xerbla( "LAPACKE_dgeev_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -12; LAPACKE_xerbla( "LAPACKE_dgeev_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgeevx_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgeevx_work.c index 9efb49ed3a..7f4c6881d5 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgeevx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgeevx_work.c @@ -63,12 +63,12 @@ lapack_int LAPACKE_dgeevx_work( int matrix_layout, char balanc, char jobvl, LAPACKE_xerbla( "LAPACKE_dgeevx_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -12; LAPACKE_xerbla( "LAPACKE_dgeevx_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -14; LAPACKE_xerbla( "LAPACKE_dgeevx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c index 4e1b876810..4a0d427b33 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_dgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; double* rwork = NULL; double rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_dgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgges_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgges_work.c index effa1b3f5a..bc6bf47d9f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgges_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgges_work.c @@ -70,12 +70,12 @@ lapack_int LAPACKE_dgges_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_dgges_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( 
LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -16; LAPACKE_xerbla( "LAPACKE_dgges_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -18; LAPACKE_xerbla( "LAPACKE_dgges_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dggesx_work.c b/lapack-netlib/LAPACKE/src/lapacke_dggesx_work.c index ace40a32ab..bde1321d7b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dggesx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dggesx_work.c @@ -73,12 +73,12 @@ lapack_int LAPACKE_dggesx_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_dggesx_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -17; LAPACKE_xerbla( "LAPACKE_dggesx_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -19; LAPACKE_xerbla( "LAPACKE_dggesx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlangb.c b/lapack-netlib/LAPACKE/src/lapacke_dlangb.c new file mode 100644 index 0000000000..ca16ea7f46 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dlangb.c @@ -0,0 +1,73 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function dlangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +double LAPACKE_dlangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const double* ab, + lapack_int ldab ) +{ + lapack_int info = 0; + double res = 0.; + double* work = NULL; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dlangb", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dgb_nancheck( matrix_layout, n, n, kl, ku, ab, ldab ) ) { + return -6; + } + } +#endif + /* Allocate memory for working array(s) */ + if( LAPACKE_lsame( norm, 'i' ) ) { + work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); + if( work == NULL ) { + info = 
LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call middle-level interface */ + res = LAPACKE_dlangb_work( matrix_layout, norm, n, kl, ku, ab, ldab, work ); + /* Release memory and exit */ + if( LAPACKE_lsame( norm, 'i' ) ) { + LAPACKE_free( work ); + } +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dlangb", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlangb_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlangb_work.c new file mode 100644 index 0000000000..ba04c2b628 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dlangb_work.c @@ -0,0 +1,83 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function dlangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +double LAPACKE_dlangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const double* ab, + lapack_int ldab, double* work ) +{ + lapack_int info = 0; + double res = 0.; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + res = LAPACK_dlangb( &norm, &n, &kl, &ku, ab, &ldab, work ); + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + char norm_lapack; + double* work_lapack = NULL; + /* Check leading dimension(s) */ + if( ldab < kl+ku+1 ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_dlangb_work", info ); + return info; + } + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + /* Allocate memory for work array(s) */ + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); + if( work_lapack == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call LAPACK function */ + res = LAPACK_dlangb( &norm_lapack, &n, &ku, &kl, ab, &ldab, work_lapack ); + /* Release memory and exit */ + if( work_lapack ) { + 
LAPACKE_free( work_lapack ); + } +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dlangb_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dlangb_work", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlantr.c b/lapack-netlib/LAPACKE/src/lapacke_dlantr.c index 4d1be93d73..b20af0eb46 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlantr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlantr.c @@ -46,7 +46,7 @@ double LAPACKE_dlantr( int matrix_layout, char norm, char uplo, char diag, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dtr_nancheck( matrix_layout, uplo, diag, MIN(m,n), a, lda ) ) { + if( LAPACKE_dtz_nancheck( matrix_layout, 'f', uplo, diag, m, n, a, lda ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c index 3c3c24c54b..f4ddc62a58 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c @@ -41,7 +41,9 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct lapack_int info = 0; lapack_int ldwork; double* work = NULL; - lapack_int ncols_v, nrows_v; + lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_dlarfb", -1 ); return -1; @@ -49,59 +51,27 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - lapack_int lrv, lcv; /* row, column stride */ - if( matrix_layout == LAPACK_COL_MAJOR ) { - lrv = 1; - lcv = ldv; - } else { - lrv = ldv; - lcv = 1; - } - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? 
m : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { - return -13; + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + + if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) { + LAPACKE_xerbla( "LAPACKE_dlarfb", -8 ); + return -8; + } + if( LAPACKE_dtz_nancheck( matrix_layout, direct, uplo, 'u', + nrows_v, ncols_v, v, ldv ) ) { + return -9; } if( LAPACKE_dge_nancheck( matrix_layout, k, k, t, ldt ) ) { return -11; } - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_dlarfb", -8 ); - return -8; - } - if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*lrv], ldv ) ) - return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_dlarfb", -8 ); - return 
-8; - } - if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, - &v[(ncols_v-k)*lcv], ldv ) ) - return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) - return -9; + if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -13; } } #endif diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c index 57c53bae31..de444c1466 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c @@ -41,6 +41,8 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans, { lapack_int info = 0; lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; lapack_int ldc_t, ldt_t, ldv_t; double *v_t = NULL, *t_t = NULL, *c_t = NULL; if( matrix_layout == LAPACK_COL_MAJOR ) { @@ -51,16 +53,14 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); + + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 
'l' : 'u'; + ldc_t = MAX(1,m); ldt_t = MAX(1,k); ldv_t = MAX(1,nrows_v); @@ -80,6 +80,11 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_dlarfb_work", info ); return info; } + if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_dlarfb_work", info ); + return info; + } /* Allocate memory for temporary array(s) */ v_t = (double*) LAPACKE_malloc( sizeof(double) * ldv_t * MAX(1,ncols_v) ); @@ -98,36 +103,8 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans, goto exit_level_2; } /* Transpose input matrices */ - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_dtr_trans( matrix_layout, 'l', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_dge_trans( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], ldv, - &v_t[k], ldv_t ); - } else if( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_dlarfb_work", -8 ); - return -8; - } - LAPACKE_dtr_trans( matrix_layout, 'u', 'u', k, &v[(nrows_v-k)*ldv], - ldv, &v_t[nrows_v-k], ldv_t ); - LAPACKE_dge_trans( matrix_layout, nrows_v-k, ncols_v, v, ldv, v_t, - ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_dtr_trans( matrix_layout, 'u', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_dge_trans( matrix_layout, nrows_v, ncols_v-k, &v[k], ldv, - &v_t[k*ldv_t], ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_dlarfb_work", -8 ); - return -8; - } - LAPACKE_dtr_trans( matrix_layout, 'l', 'u', k, &v[ncols_v-k], ldv, - &v_t[(ncols_v-k)*ldv_t], ldv_t ); - LAPACKE_dge_trans( matrix_layout, nrows_v, ncols_v-k, v, ldv, v_t, - ldv_t ); - } + LAPACKE_dtz_trans( matrix_layout, direct, uplo, 'u', nrows_v, ncols_v, + v, ldv, v_t, ldv_t ); LAPACKE_dge_trans( matrix_layout, k, k, t, ldt, t_t, ldt_t ); LAPACKE_dge_trans( 
matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtgsen.c b/lapack-netlib/LAPACKE/src/lapacke_dtgsen.c index 11496c1c0f..883a795798 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dtgsen.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dtgsen.c @@ -83,12 +83,10 @@ lapack_int LAPACKE_dtgsen( int matrix_layout, lapack_int ijob, liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ - if( ijob != 0 ) { - iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); - if( iwork == NULL ) { - info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_0; - } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; } work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); if( work == NULL ) { @@ -103,9 +101,7 @@ lapack_int LAPACKE_dtgsen( int matrix_layout, lapack_int ijob, /* Release memory and exit */ LAPACKE_free( work ); exit_level_1: - if( ijob != 0 ) { - LAPACKE_free( iwork ); - } + LAPACKE_free( iwork ); exit_level_0: if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_dtgsen", info ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtpmqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_dtpmqrt_work.c index d9ee6226be..366acd3690 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dtpmqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dtpmqrt_work.c @@ -48,16 +48,24 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,k); + lapack_int nrowsA, ncolsA, nrowsV; + if ( LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } + else if ( LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } + else { + info = -2; + LAPACKE_xerbla( "LAPACKE_dtpmqrt_work", info ); + return info; + } + lapack_int lda_t = 
MAX(1,nrowsA); lapack_int ldb_t = MAX(1,m); - lapack_int ldt_t = MAX(1,ldt); - lapack_int ldv_t = MAX(1,ldv); + lapack_int ldt_t = MAX(1,nb); + lapack_int ldv_t = MAX(1,nrowsV); double* v_t = NULL; double* t_t = NULL; double* a_t = NULL; double* b_t = NULL; /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < ncolsA ) { info = -14; LAPACKE_xerbla( "LAPACKE_dtpmqrt_work", info ); return info; @@ -67,7 +75,7 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_dtpmqrt_work", info ); return info; } - if( ldt < nb ) { + if( ldt < k ) { info = -12; LAPACKE_xerbla( "LAPACKE_dtpmqrt_work", info ); return info; @@ -83,12 +91,12 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; } - t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,nb) ); + t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,k) ); if( t_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_1; } - a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,m) ); + a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,ncolsA) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_2; @@ -99,10 +107,10 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_3; } /* Transpose input matrices */ - LAPACKE_dge_trans( matrix_layout, ldv, k, v, ldv, v_t, ldv_t ); - LAPACKE_dge_trans( matrix_layout, ldt, nb, t, ldt, t_t, ldt_t ); - LAPACKE_dge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); - LAPACKE_dge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_dge_trans( LAPACK_ROW_MAJOR, nrowsV, k, v, ldv, v_t, ldv_t ); + LAPACKE_dge_trans( LAPACK_ROW_MAJOR, nb, k, t, ldt, t_t, ldt_t ); + LAPACKE_dge_trans( LAPACK_ROW_MAJOR, nrowsA, ncolsA, a, lda, a_t, lda_t ); + LAPACKE_dge_trans( LAPACK_ROW_MAJOR, m, n, b, ldb, b_t, ldb_t ); /* Call LAPACK function and adjust info 
*/ LAPACK_dtpmqrt( &side, &trans, &m, &n, &k, &l, &nb, v_t, &ldv_t, t_t, &ldt_t, a_t, &lda_t, b_t, &ldb_t, work, &info ); @@ -110,7 +118,7 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } /* Transpose output matrices */ - LAPACKE_dge_trans( LAPACK_COL_MAJOR, k, m, a_t, lda_t, a, lda ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, nrowsA, ncolsA, a_t, lda_t, a, lda ); LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); /* Release memory and exit */ LAPACKE_free( b_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3.c b/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3.c new file mode 100644 index 0000000000..c95a772deb --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3.c @@ -0,0 +1,68 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_dtrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, const double* b, + lapack_int ldb, double* c, lapack_int ldc, + double* scale ) +{ + lapack_int info = 0; + double swork_query[2]; + double* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + lapack_int iwork_query; + lapack_int* iwork = NULL; + lapack_int liwork = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dtrsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_dtrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, &iwork_query, liwork, + swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + ldswork = 
swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (double*)LAPACKE_malloc( sizeof(double) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + liwork = iwork_query; + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if ( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_dtrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, iwork, liwork, + swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dtrsyl3", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3_work.c b/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3_work.c new file mode 100644 index 0000000000..272c35b384 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3_work.c @@ -0,0 +1,86 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_dtrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, + const double* b, lapack_int ldb, double* c, + lapack_int ldc, double* scale, + lapack_int* iwork, lapack_int liwork, + double* swork, lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_dtrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, iwork, &liwork, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + double* a_t = NULL; + double* b_t = NULL; + double* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + return 
info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (double*)LAPACKE_malloc( sizeof(double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (double*)LAPACKE_malloc( sizeof(double) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_dge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_dge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_dge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_dtrsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, iwork, &liwork, swork, &ldswork, + &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgeev_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgeev_work.c index 0f5a8e0048..af6dbedf0c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgeev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgeev_work.c @@ -59,12 +59,12 @@ lapack_int LAPACKE_sgeev_work( int matrix_layout, char jobvl, char jobvr, LAPACKE_xerbla( 
"LAPACKE_sgeev_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -10; LAPACKE_xerbla( "LAPACKE_sgeev_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -12; LAPACKE_xerbla( "LAPACKE_sgeev_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgeevx_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgeevx_work.c index d05ea16e9e..67f4982bf2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgeevx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgeevx_work.c @@ -63,12 +63,12 @@ lapack_int LAPACKE_sgeevx_work( int matrix_layout, char balanc, char jobvl, LAPACKE_xerbla( "LAPACKE_sgeevx_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -12; LAPACKE_xerbla( "LAPACKE_sgeevx_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -14; LAPACKE_xerbla( "LAPACKE_sgeevx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c index 0b6406dec6..627d2406cb 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_sgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; float* rwork = NULL; float rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_sgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgges_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgges_work.c index a3b09de300..1bd3eacf48 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgges_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgges_work.c @@ -70,12 +70,12 @@ lapack_int LAPACKE_sgges_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( 
"LAPACKE_sgges_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -16; LAPACKE_xerbla( "LAPACKE_sgges_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -18; LAPACKE_xerbla( "LAPACKE_sgges_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sggesx_work.c b/lapack-netlib/LAPACKE/src/lapacke_sggesx_work.c index d3927e525d..b1fbe19025 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sggesx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sggesx_work.c @@ -73,12 +73,12 @@ lapack_int LAPACKE_sggesx_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_sggesx_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -17; LAPACKE_xerbla( "LAPACKE_sggesx_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -19; LAPACKE_xerbla( "LAPACKE_sggesx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_slangb.c b/lapack-netlib/LAPACKE/src/lapacke_slangb.c new file mode 100644 index 0000000000..9ba3f30d8d --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_slangb.c @@ -0,0 +1,73 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function slangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +float LAPACKE_slangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const float* ab, + lapack_int ldab ) +{ + lapack_int info = 0; + float res = 0.; + float* work = NULL; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_slangb", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sgb_nancheck( matrix_layout, n, n, kl, ku, ab, ldab ) ) { + return -6; + } + } +#endif + /* Allocate memory for working array(s) */ + if( LAPACKE_lsame( norm, 'i' ) ) { + work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); + if( work == NULL ) { + info = 
LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call middle-level interface */ + res = LAPACKE_slangb_work( matrix_layout, norm, n, kl, ku, ab, ldab, work ); + /* Release memory and exit */ + if( LAPACKE_lsame( norm, 'i' ) ) { + LAPACKE_free( work ); + } +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_slangb", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_slangb_work.c b/lapack-netlib/LAPACKE/src/lapacke_slangb_work.c new file mode 100644 index 0000000000..7ef86e9d90 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_slangb_work.c @@ -0,0 +1,83 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function slangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +float LAPACKE_slangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const float* ab, + lapack_int ldab, float* work ) +{ + lapack_int info = 0; + float res = 0.; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + res = LAPACK_slangb( &norm, &n, &kl, &ku, ab, &ldab, work ); + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + char norm_lapack; + float* work_lapack = NULL; + /* Check leading dimension(s) */ + if( ldab < kl+ku+1 ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_slangb_work", info ); + return info; + } + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + /* Allocate memory for work array(s) */ + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); + if( work_lapack == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call LAPACK function */ + res = LAPACK_slangb( &norm_lapack, &n, &ku, &kl, ab, &ldab, work_lapack ); + /* Release memory and exit */ + if( work_lapack ) { + 
LAPACKE_free( work_lapack ); + } +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_slangb_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_slangb_work", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_slantr.c b/lapack-netlib/LAPACKE/src/lapacke_slantr.c index 2f4c65889a..e2f67cfd6c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slantr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slantr.c @@ -46,7 +46,7 @@ float LAPACKE_slantr( int matrix_layout, char norm, char uplo, char diag, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_str_nancheck( matrix_layout, uplo, diag, MIN(m,n), a, lda ) ) { + if( LAPACKE_stz_nancheck( matrix_layout, 'f', uplo, diag, m, n, a, lda ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c index 37d51dee58..d36958f93d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c @@ -41,7 +41,9 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct lapack_int info = 0; lapack_int ldwork; float* work = NULL; - lapack_int ncols_v, nrows_v; + lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_slarfb", -1 ); return -1; @@ -49,59 +51,27 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - lapack_int lrv, lcv; /* row, column stride */ - if( matrix_layout == LAPACK_COL_MAJOR ) { - lrv = 1; - lcv = ldv; - } else { - lrv = ldv; - lcv = 1; - } - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? 
m : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { - return -13; + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + + if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) { + LAPACKE_xerbla( "LAPACKE_slarfb", -8 ); + return -8; + } + if( LAPACKE_stz_nancheck( matrix_layout, direct, uplo, 'u', + nrows_v, ncols_v, v, ldv ) ) { + return -9; } if( LAPACKE_sge_nancheck( matrix_layout, k, k, t, ldt ) ) { return -11; } - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_slarfb", -8 ); - return -8; - } - if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*lrv], ldv ) ) - return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_slarfb", -8 ); - return 
-8; - } - if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, - &v[(ncols_v-k)*lcv], ldv ) ) - return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) - return -9; + if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -13; } } #endif diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c index 2f5d616767..8b61276337 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c @@ -41,6 +41,8 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans, { lapack_int info = 0; lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; lapack_int ldc_t, ldt_t, ldv_t; float *v_t = NULL, *t_t = NULL, *c_t = NULL; if( matrix_layout == LAPACK_COL_MAJOR ) { @@ -51,16 +53,14 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); + + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( forward && col ) || !( forward || col ) ) ? 
'l' : 'u'; + ldc_t = MAX(1,m); ldt_t = MAX(1,k); ldv_t = MAX(1,nrows_v); @@ -80,6 +80,11 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_slarfb_work", info ); return info; } + if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_slarfb_work", info ); + return info; + } /* Allocate memory for temporary array(s) */ v_t = (float*)LAPACKE_malloc( sizeof(float) * ldv_t * MAX(1,ncols_v) ); if( v_t == NULL ) { @@ -97,36 +102,8 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans, goto exit_level_2; } /* Transpose input matrices */ - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_str_trans( matrix_layout, 'l', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_sge_trans( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], ldv, - &v_t[k], ldv_t ); - } else if( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_slarfb_work", -8 ); - return -8; - } - LAPACKE_str_trans( matrix_layout, 'u', 'u', k, &v[(nrows_v-k)*ldv], - ldv, &v_t[nrows_v-k], ldv_t ); - LAPACKE_sge_trans( matrix_layout, nrows_v-k, ncols_v, v, ldv, v_t, - ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_str_trans( matrix_layout, 'u', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_sge_trans( matrix_layout, nrows_v, ncols_v-k, &v[k], ldv, - &v_t[k*ldv_t], ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_slarfb_work", -8 ); - return -8; - } - LAPACKE_str_trans( matrix_layout, 'l', 'u', k, &v[ncols_v-k], ldv, - &v_t[(ncols_v-k)*ldv_t], ldv_t ); - LAPACKE_sge_trans( matrix_layout, nrows_v, ncols_v-k, v, ldv, v_t, - ldv_t ); - } + LAPACKE_stz_trans( matrix_layout, direct, uplo, 'u', nrows_v, ncols_v, + v, ldv, v_t, ldv_t ); LAPACKE_sge_trans( matrix_layout, k, k, t, ldt, t_t, ldt_t ); 
LAPACKE_sge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_stgsen.c b/lapack-netlib/LAPACKE/src/lapacke_stgsen.c index c3b0c4bf82..db5b7e91c1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_stgsen.c +++ b/lapack-netlib/LAPACKE/src/lapacke_stgsen.c @@ -83,12 +83,10 @@ lapack_int LAPACKE_stgsen( int matrix_layout, lapack_int ijob, liwork = iwork_query; lwork = (lapack_int)work_query; /* Allocate memory for work arrays */ - if( ijob != 0 ) { - iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); - if( iwork == NULL ) { - info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_0; - } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; } work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); if( work == NULL ) { @@ -103,9 +101,7 @@ lapack_int LAPACKE_stgsen( int matrix_layout, lapack_int ijob, /* Release memory and exit */ LAPACKE_free( work ); exit_level_1: - if( ijob != 0 ) { - LAPACKE_free( iwork ); - } + LAPACKE_free( iwork ); exit_level_0: if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_stgsen", info ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_stpmqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_stpmqrt_work.c index 095fbdcd9f..c5a3a14965 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_stpmqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_stpmqrt_work.c @@ -48,16 +48,24 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,k); + lapack_int nrowsA, ncolsA, nrowsV; + if ( LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } + else if ( LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } + else { + info = -2; + LAPACKE_xerbla( "LAPACKE_stpmqrt_work", info ); + return info; + } + 
lapack_int lda_t = MAX(1,nrowsA); lapack_int ldb_t = MAX(1,m); - lapack_int ldt_t = MAX(1,ldt); - lapack_int ldv_t = MAX(1,ldv); + lapack_int ldt_t = MAX(1,nb); + lapack_int ldv_t = MAX(1,nrowsV); float* v_t = NULL; float* t_t = NULL; float* a_t = NULL; float* b_t = NULL; /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < ncolsA ) { info = -14; LAPACKE_xerbla( "LAPACKE_stpmqrt_work", info ); return info; @@ -67,7 +75,7 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_stpmqrt_work", info ); return info; } - if( ldt < nb ) { + if( ldt < k ) { info = -12; LAPACKE_xerbla( "LAPACKE_stpmqrt_work", info ); return info; @@ -83,12 +91,12 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; } - t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,nb) ); + t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,k) ); if( t_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_1; } - a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,m) ); + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,ncolsA) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_2; @@ -99,10 +107,10 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_3; } /* Transpose input matrices */ - LAPACKE_sge_trans( matrix_layout, ldv, k, v, ldv, v_t, ldv_t ); - LAPACKE_sge_trans( matrix_layout, ldt, nb, t, ldt, t_t, ldt_t ); - LAPACKE_sge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); - LAPACKE_sge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_sge_trans( LAPACK_ROW_MAJOR, nrowsV, k, v, ldv, v_t, ldv_t ); + LAPACKE_sge_trans( LAPACK_ROW_MAJOR, nb, k, t, ldt, t_t, ldt_t ); + LAPACKE_sge_trans( LAPACK_ROW_MAJOR, nrowsA, ncolsA, a, lda, a_t, lda_t ); + LAPACKE_sge_trans( LAPACK_ROW_MAJOR, m, n, b, ldb, b_t, ldb_t ); /* Call LAPACK function and 
adjust info */ LAPACK_stpmqrt( &side, &trans, &m, &n, &k, &l, &nb, v_t, &ldv_t, t_t, &ldt_t, a_t, &lda_t, b_t, &ldb_t, work, &info ); @@ -110,7 +118,7 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } /* Transpose output matrices */ - LAPACKE_sge_trans( LAPACK_COL_MAJOR, k, m, a_t, lda_t, a, lda ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, nrowsA, ncolsA, a_t, lda_t, a, lda ); LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); /* Release memory and exit */ LAPACKE_free( b_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_strsyl3.c b/lapack-netlib/LAPACKE/src/lapacke_strsyl3.c new file mode 100644 index 0000000000..1cfc626c22 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_strsyl3.c @@ -0,0 +1,68 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_strsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, const float* b, + lapack_int ldb, float* c, lapack_int ldc, + float* scale ) +{ + lapack_int info = 0; + float swork_query[2]; + float* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + lapack_int iwork_query; + lapack_int* iwork = NULL; + lapack_int liwork = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_strsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_strsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, &iwork_query, liwork, + swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + 
ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (float*)LAPACKE_malloc( sizeof(float) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + liwork = iwork_query; + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if ( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_strsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, iwork, liwork, + swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_strsyl3", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_strsyl3_work.c b/lapack-netlib/LAPACKE/src/lapacke_strsyl3_work.c new file mode 100644 index 0000000000..3c50e4a451 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_strsyl3_work.c @@ -0,0 +1,86 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_strsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, + const float* b, lapack_int ldb, float* c, + lapack_int ldc, float* scale, + lapack_int* iwork, lapack_int liwork, + float* swork, lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_strsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, iwork, &liwork, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + float* a_t = NULL; + float* b_t = NULL; + float* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + return 
info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (float*)LAPACKE_malloc( sizeof(float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (float*)LAPACKE_malloc( sizeof(float) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_sge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_sge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_strsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, iwork, &liwork, swork, &ldswork, + &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgeev_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgeev_work.c index 9393f825a0..445b9dc1c9 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgeev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgeev_work.c @@ -61,12 +61,12 @@ lapack_int LAPACKE_zgeev_work( int matrix_layout, char jobvl, char jobvr, LAPACKE_xerbla( 
"LAPACKE_zgeev_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -9; LAPACKE_xerbla( "LAPACKE_zgeev_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -11; LAPACKE_xerbla( "LAPACKE_zgeev_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgeevx_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgeevx_work.c index e34112c09f..29dbf06f0d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgeevx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgeevx_work.c @@ -65,12 +65,12 @@ lapack_int LAPACKE_zgeevx_work( int matrix_layout, char balanc, char jobvl, LAPACKE_xerbla( "LAPACKE_zgeevx_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -11; LAPACKE_xerbla( "LAPACKE_zgeevx_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -13; LAPACKE_xerbla( "LAPACKE_zgeevx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c index 528b94a47e..1d318e5713 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; double* rwork = NULL; double rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_zgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgges_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgges_work.c index 2694c65306..13e2455c64 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgges_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgges_work.c @@ -72,12 +72,12 @@ lapack_int LAPACKE_zgges_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( 
"LAPACKE_zgges_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -15; LAPACKE_xerbla( "LAPACKE_zgges_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -17; LAPACKE_xerbla( "LAPACKE_zgges_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zggesx_work.c b/lapack-netlib/LAPACKE/src/lapacke_zggesx_work.c index f9f1ccee10..fe99949b70 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zggesx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zggesx_work.c @@ -76,12 +76,12 @@ lapack_int LAPACKE_zggesx_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_zggesx_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -16; LAPACKE_xerbla( "LAPACKE_zggesx_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -18; LAPACKE_xerbla( "LAPACKE_zggesx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlangb.c b/lapack-netlib/LAPACKE/src/lapacke_zlangb.c new file mode 100644 index 0000000000..3a22ad9822 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zlangb.c @@ -0,0 +1,73 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function zlangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +double LAPACKE_zlangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_double* ab, lapack_int ldab ) +{ + lapack_int info = 0; + double res = 0.; + double* work = NULL; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zlangb", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zgb_nancheck( matrix_layout, n, n, kl, ku, ab, ldab ) ) { + return -6; + } + } +#endif + /* Allocate memory for working array(s) */ + if( LAPACKE_lsame( norm, 'i' ) ) { + work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); + if( work == NULL 
) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call middle-level interface */ + res = LAPACKE_zlangb_work( matrix_layout, norm, n, kl, ku, ab, ldab, work ); + /* Release memory and exit */ + if( LAPACKE_lsame( norm, 'i' ) ) { + LAPACKE_free( work ); + } +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zlangb", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlangb_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlangb_work.c new file mode 100644 index 0000000000..d64fb482d5 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zlangb_work.c @@ -0,0 +1,84 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function zlangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +double LAPACKE_zlangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_double* ab, lapack_int ldab, + double* work ) +{ + lapack_int info = 0; + double res = 0.; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + res = LAPACK_zlangb( &norm, &n, &kl, &ku, ab, &ldab, work ); + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + char norm_lapack; + double* work_lapack = NULL; + /* Check leading dimension(s) */ + if( ldab < kl+ku+1 ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_zlangb_work", info ); + return info; + } + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + /* Allocate memory for work array(s) */ + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); + if( work_lapack == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call LAPACK function */ + res = LAPACK_zlangb( &norm_lapack, &n, &ku, &kl, ab, &ldab, work_lapack ); + /* Release memory and exit */ + if( 
work_lapack ) { + LAPACKE_free( work_lapack ); + } +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zlangb_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_zlangb_work", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlantr.c b/lapack-netlib/LAPACKE/src/lapacke_zlantr.c index f6656d84da..4c078b9b0f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlantr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlantr.c @@ -46,7 +46,7 @@ double LAPACKE_zlantr( int matrix_layout, char norm, char uplo, char diag, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_ztr_nancheck( matrix_layout, uplo, diag, MIN(m,n), a, lda ) ) { + if( LAPACKE_ztz_nancheck( matrix_layout, 'f', uplo, diag, m, n, a, lda ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c index 7cd23dde8f..85355b202d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c @@ -42,7 +42,9 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct lapack_int info = 0; lapack_int ldwork; lapack_complex_double* work = NULL; - lapack_int ncols_v, nrows_v; + lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_zlarfb", -1 ); return -1; @@ -50,59 +52,27 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - lapack_int lrv, lcv; /* row, column stride */ - if( matrix_layout == LAPACK_COL_MAJOR ) { - lrv = 1; - lcv = ldv; - } else { - lrv = ldv; - lcv = 1; - } - ncols_v = LAPACKE_lsame( storev, 'c' ) ? 
k : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { - return -13; + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u'; + + if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) { + LAPACKE_xerbla( "LAPACKE_zlarfb", -8 ); + return -8; + } + if( LAPACKE_ztz_nancheck( matrix_layout, direct, uplo, 'u', + nrows_v, ncols_v, v, ldv ) ) { + return -9; } if( LAPACKE_zge_nancheck( matrix_layout, k, k, t, ldt ) ) { return -11; } - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_zlarfb", -8 ); - return -8; - } - if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*lrv], ldv ) ) - return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { 
- if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_zlarfb", -8 ); - return -8; - } - if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, - &v[(ncols_v-k)*lcv], ldv ) ) - return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) - return -9; + if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -13; } } #endif diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c index 1b4f892a17..72d85ec82a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c @@ -42,6 +42,8 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans, { lapack_int info = 0; lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; lapack_int ldc_t, ldt_t, ldv_t; lapack_complex_double *v_t = NULL, *t_t = NULL, *c_t = NULL; if( matrix_layout == LAPACK_COL_MAJOR ) { @@ -52,16 +54,14 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); + + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( forward && col ) || !( forward || col ) ) ? 
'l' : 'u'; + ldc_t = MAX(1,m); ldt_t = MAX(1,k); ldv_t = MAX(1,nrows_v); @@ -81,6 +81,11 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_zlarfb_work", info ); return info; } + if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_zlarfb_work", info ); + return info; + } /* Allocate memory for temporary array(s) */ v_t = (lapack_complex_double*) LAPACKE_malloc( sizeof(lapack_complex_double) * @@ -102,36 +107,8 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans, goto exit_level_2; } /* Transpose input matrices */ - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_ztr_trans( matrix_layout, 'l', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_zge_trans( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], ldv, - &v_t[k], ldv_t ); - } else if( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_zlarfb_work", -8 ); - return -8; - } - LAPACKE_ztr_trans( matrix_layout, 'u', 'u', k, &v[(nrows_v-k)*ldv], - ldv, &v_t[nrows_v-k], ldv_t ); - LAPACKE_zge_trans( matrix_layout, nrows_v-k, ncols_v, v, ldv, v_t, - ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_ztr_trans( matrix_layout, 'u', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_zge_trans( matrix_layout, nrows_v, ncols_v-k, &v[k], ldv, - &v_t[k*ldv_t], ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_zlarfb_work", -8 ); - return -8; - } - LAPACKE_ztr_trans( matrix_layout, 'l', 'u', k, &v[ncols_v-k], ldv, - &v_t[(ncols_v-k)*ldv_t], ldv_t ); - LAPACKE_zge_trans( matrix_layout, nrows_v, ncols_v-k, v, ldv, v_t, - ldv_t ); - } + LAPACKE_ztz_trans( matrix_layout, direct, uplo, 'u', nrows_v, ncols_v, + v, ldv, v_t, ldv_t ); LAPACKE_zge_trans( matrix_layout, k, k, t, ldt, t_t, ldt_t ); 
LAPACKE_zge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztgsen.c b/lapack-netlib/LAPACKE/src/lapacke_ztgsen.c index 8c86d5e00e..039da18b48 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ztgsen.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ztgsen.c @@ -86,12 +86,10 @@ lapack_int LAPACKE_ztgsen( int matrix_layout, lapack_int ijob, liwork = iwork_query; lwork = LAPACK_Z2INT( work_query ); /* Allocate memory for work arrays */ - if( ijob != 0 ) { - iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); - if( iwork == NULL ) { - info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_0; - } + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; } work = (lapack_complex_double*) LAPACKE_malloc( sizeof(lapack_complex_double) * lwork ); @@ -106,9 +104,7 @@ lapack_int LAPACKE_ztgsen( int matrix_layout, lapack_int ijob, /* Release memory and exit */ LAPACKE_free( work ); exit_level_1: - if( ijob != 0 ) { - LAPACKE_free( iwork ); - } + LAPACKE_free( iwork ); exit_level_0: if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_ztgsen", info ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztpmqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_ztpmqrt_work.c index 643ae1d9d5..104efa8f3c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ztpmqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ztpmqrt_work.c @@ -50,16 +50,24 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,k); + lapack_int nrowsA, ncolsA, nrowsV; + if ( LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } + else if ( LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } + else { + info = -2; + LAPACKE_xerbla( "LAPACKE_ztpmqrt_work", info ); + return info; 
+ } + lapack_int lda_t = MAX(1,nrowsA); lapack_int ldb_t = MAX(1,m); - lapack_int ldt_t = MAX(1,ldt); - lapack_int ldv_t = MAX(1,ldv); + lapack_int ldt_t = MAX(1,nb); + lapack_int ldv_t = MAX(1,nrowsV); lapack_complex_double* v_t = NULL; lapack_complex_double* t_t = NULL; lapack_complex_double* a_t = NULL; lapack_complex_double* b_t = NULL; /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < ncolsA ) { info = -14; LAPACKE_xerbla( "LAPACKE_ztpmqrt_work", info ); return info; @@ -69,7 +77,7 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_ztpmqrt_work", info ); return info; } - if( ldt < nb ) { + if( ldt < k ) { info = -12; LAPACKE_xerbla( "LAPACKE_ztpmqrt_work", info ); return info; @@ -87,13 +95,13 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_0; } t_t = (lapack_complex_double*) - LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,nb) ); + LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,k) ); if( t_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_1; } a_t = (lapack_complex_double*) - LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,m) ); + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,ncolsA) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_2; @@ -105,10 +113,10 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_3; } /* Transpose input matrices */ - LAPACKE_zge_trans( matrix_layout, ldv, k, v, ldv, v_t, ldv_t ); - LAPACKE_zge_trans( matrix_layout, ldt, nb, t, ldt, t_t, ldt_t ); - LAPACKE_zge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); - LAPACKE_zge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_zge_trans( LAPACK_ROW_MAJOR, nrowsV, k, v, ldv, v_t, ldv_t ); + LAPACKE_zge_trans( LAPACK_ROW_MAJOR, nb, k, t, ldt, t_t, ldt_t ); + LAPACKE_zge_trans( LAPACK_ROW_MAJOR, nrowsA, ncolsA, a, lda, a_t, lda_t 
); + LAPACKE_zge_trans( LAPACK_ROW_MAJOR, m, n, b, ldb, b_t, ldb_t ); /* Call LAPACK function and adjust info */ LAPACK_ztpmqrt( &side, &trans, &m, &n, &k, &l, &nb, v_t, &ldv_t, t_t, &ldt_t, a_t, &lda_t, b_t, &ldb_t, work, &info ); @@ -116,7 +124,7 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } /* Transpose output matrices */ - LAPACKE_zge_trans( LAPACK_COL_MAJOR, k, m, a_t, lda_t, a, lda ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, nrowsA, ncolsA, a_t, lda_t, a, lda ); LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); /* Release memory and exit */ LAPACKE_free( b_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3.c b/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3.c new file mode 100644 index 0000000000..dbc9bcf9f7 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3.c @@ -0,0 +1,56 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ztrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale ) +{ + lapack_int info = 0; + double swork_query[2]; + double* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_ztrsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_ztrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, swork_query, ldswork ); + if( 
info != 0 ) { + goto exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (double*)LAPACKE_malloc( sizeof(double) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_ztrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ztrsyl3", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3_work.c b/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3_work.c new file mode 100644 index 0000000000..a7ebd5da60 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3_work.c @@ -0,0 +1,88 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ztrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale, double* swork, + lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_ztrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + lapack_complex_double* a_t = NULL; + lapack_complex_double* b_t = NULL; + lapack_complex_double* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + 
LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_zge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_zge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_ztrsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/utils/CMakeLists.txt b/lapack-netlib/LAPACKE/utils/CMakeLists.txt index dd36ee33e7..dfb9aa3702 100644 --- a/lapack-netlib/LAPACKE/utils/CMakeLists.txt +++ b/lapack-netlib/LAPACKE/utils/CMakeLists.txt @@ -1,39 +1,46 @@ set(UTILS -lapacke_c_nancheck.c lapacke_ctr_trans.c lapacke_make_complex_float.c lapacke_zgb_nancheck.c -lapacke_cgb_nancheck.c lapacke_d_nancheck.c lapacke_s_nancheck.c 
lapacke_zgb_trans.c -lapacke_cgb_trans.c lapacke_dgb_nancheck.c lapacke_sgb_nancheck.c lapacke_zge_nancheck.c -lapacke_cge_nancheck.c lapacke_dgb_trans.c lapacke_sgb_trans.c lapacke_zge_trans.c -lapacke_cge_trans.c lapacke_dge_nancheck.c lapacke_sge_nancheck.c lapacke_zgg_nancheck.c -lapacke_cgg_nancheck.c lapacke_dge_trans.c lapacke_sge_trans.c lapacke_zgg_trans.c -lapacke_cgg_trans.c lapacke_dgg_nancheck.c lapacke_sgg_nancheck.c lapacke_zgt_nancheck.c -lapacke_cgt_nancheck.c lapacke_dgg_trans.c lapacke_sgg_trans.c lapacke_zhb_nancheck.c -lapacke_chb_nancheck.c lapacke_dgt_nancheck.c lapacke_sgt_nancheck.c lapacke_zhb_trans.c -lapacke_chb_trans.c lapacke_dhs_nancheck.c lapacke_shs_nancheck.c lapacke_zhe_nancheck.c -lapacke_che_nancheck.c lapacke_dhs_trans.c lapacke_shs_trans.c lapacke_zhe_trans.c -lapacke_che_trans.c lapacke_dpb_nancheck.c lapacke_spb_nancheck.c lapacke_zhp_nancheck.c -lapacke_chp_nancheck.c lapacke_dpb_trans.c lapacke_spb_trans.c lapacke_zhp_trans.c -lapacke_chp_trans.c lapacke_dpf_nancheck.c lapacke_spf_nancheck.c lapacke_zhs_nancheck.c -lapacke_chs_nancheck.c lapacke_dpf_trans.c lapacke_spf_trans.c lapacke_zhs_trans.c -lapacke_chs_trans.c lapacke_dpo_nancheck.c lapacke_spo_nancheck.c lapacke_zpb_nancheck.c -lapacke_cpb_nancheck.c lapacke_dpo_trans.c lapacke_spo_trans.c lapacke_zpb_trans.c -lapacke_cpb_trans.c lapacke_dpp_nancheck.c lapacke_spp_nancheck.c lapacke_zpf_nancheck.c -lapacke_cpf_nancheck.c lapacke_dpp_trans.c lapacke_spp_trans.c lapacke_zpf_trans.c -lapacke_cpf_trans.c lapacke_dpt_nancheck.c lapacke_spt_nancheck.c lapacke_zpo_nancheck.c -lapacke_cpo_nancheck.c lapacke_dsb_nancheck.c lapacke_ssb_nancheck.c lapacke_zpo_trans.c -lapacke_cpo_trans.c lapacke_dsb_trans.c lapacke_ssb_trans.c lapacke_zpp_nancheck.c -lapacke_cpp_nancheck.c lapacke_dsp_nancheck.c lapacke_ssp_nancheck.c lapacke_zpp_trans.c -lapacke_cpp_trans.c lapacke_dsp_trans.c lapacke_ssp_trans.c lapacke_zpt_nancheck.c -lapacke_cpt_nancheck.c lapacke_dst_nancheck.c 
lapacke_sst_nancheck.c lapacke_zsp_nancheck.c -lapacke_csp_nancheck.c lapacke_dsy_nancheck.c lapacke_ssy_nancheck.c lapacke_zsp_trans.c -lapacke_csp_trans.c lapacke_dsy_trans.c lapacke_ssy_trans.c lapacke_zst_nancheck.c -lapacke_cst_nancheck.c lapacke_dtb_nancheck.c lapacke_stb_nancheck.c lapacke_zsy_nancheck.c -lapacke_csy_nancheck.c lapacke_dtb_trans.c lapacke_stb_trans.c lapacke_zsy_trans.c -lapacke_csy_trans.c lapacke_dtf_nancheck.c lapacke_stf_nancheck.c lapacke_ztb_nancheck.c -lapacke_ctb_nancheck.c lapacke_dtf_trans.c lapacke_stf_trans.c lapacke_ztb_trans.c -lapacke_ctb_trans.c lapacke_dtp_nancheck.c lapacke_stp_nancheck.c lapacke_ztf_nancheck.c -lapacke_ctf_nancheck.c lapacke_dtp_trans.c lapacke_stp_trans.c lapacke_ztf_trans.c -lapacke_ctf_trans.c lapacke_dtr_nancheck.c lapacke_str_nancheck.c lapacke_ztp_nancheck.c -lapacke_ctp_nancheck.c lapacke_dtr_trans.c lapacke_str_trans.c lapacke_ztp_trans.c -lapacke_ctp_trans.c lapacke_lsame.c lapacke_xerbla.c lapacke_ztr_nancheck.c -lapacke_ctr_nancheck.c lapacke_make_complex_double.c lapacke_z_nancheck.c lapacke_ztr_trans.c +lapacke_c_nancheck.c lapacke_d_nancheck.c lapacke_s_nancheck.c lapacke_z_nancheck.c +lapacke_cgb_nancheck.c lapacke_dgb_nancheck.c lapacke_sgb_nancheck.c lapacke_zgb_trans.c +lapacke_cgb_trans.c lapacke_dgb_trans.c lapacke_sgb_trans.c lapacke_zgb_nancheck.c +lapacke_cge_nancheck.c lapacke_dge_nancheck.c lapacke_sge_nancheck.c lapacke_zge_nancheck.c +lapacke_cge_trans.c lapacke_dge_trans.c lapacke_sge_trans.c lapacke_zge_trans.c +lapacke_cgg_nancheck.c lapacke_dgg_nancheck.c lapacke_sgg_nancheck.c lapacke_zgg_nancheck.c +lapacke_cgg_trans.c lapacke_dgg_trans.c lapacke_sgg_trans.c lapacke_zgg_trans.c +lapacke_cgt_nancheck.c lapacke_dgt_nancheck.c lapacke_sgt_nancheck.c lapacke_zgt_nancheck.c +lapacke_chb_nancheck.c lapacke_dsb_nancheck.c lapacke_ssb_nancheck.c lapacke_zhb_nancheck.c +lapacke_chb_trans.c lapacke_dsb_trans.c lapacke_ssb_trans.c lapacke_zhb_trans.c +lapacke_che_nancheck.c 
lapacke_zhe_nancheck.c +lapacke_che_trans.c lapacke_zhe_trans.c +lapacke_chp_nancheck.c lapacke_zhp_nancheck.c +lapacke_chp_trans.c lapacke_zhp_trans.c +lapacke_chs_nancheck.c lapacke_dhs_nancheck.c lapacke_shs_nancheck.c lapacke_zhs_nancheck.c +lapacke_chs_trans.c lapacke_dhs_trans.c lapacke_shs_trans.c lapacke_zhs_trans.c +lapacke_cpb_nancheck.c lapacke_dpb_nancheck.c lapacke_spb_nancheck.c lapacke_zpb_nancheck.c +lapacke_cpb_trans.c lapacke_dpb_trans.c lapacke_spb_trans.c lapacke_zpb_trans.c +lapacke_cpf_nancheck.c lapacke_dpf_nancheck.c lapacke_spf_nancheck.c lapacke_zpf_nancheck.c +lapacke_cpf_trans.c lapacke_dpf_trans.c lapacke_spf_trans.c lapacke_zpf_trans.c +lapacke_cpo_nancheck.c lapacke_dpo_nancheck.c lapacke_spo_nancheck.c lapacke_zpo_nancheck.c +lapacke_cpo_trans.c lapacke_dpo_trans.c lapacke_spo_trans.c lapacke_zpo_trans.c +lapacke_cpp_nancheck.c lapacke_dpp_nancheck.c lapacke_spp_nancheck.c lapacke_zpp_nancheck.c +lapacke_cpp_trans.c lapacke_dpp_trans.c lapacke_spp_trans.c lapacke_zpp_trans.c +lapacke_cpt_nancheck.c lapacke_dpt_nancheck.c lapacke_spt_nancheck.c lapacke_zpt_nancheck.c +lapacke_csp_nancheck.c lapacke_dsp_nancheck.c lapacke_ssp_nancheck.c lapacke_zsp_nancheck.c +lapacke_csp_trans.c lapacke_dsp_trans.c lapacke_ssp_trans.c lapacke_zsp_trans.c +lapacke_cst_nancheck.c lapacke_dst_nancheck.c lapacke_sst_nancheck.c lapacke_zst_nancheck.c +lapacke_csy_nancheck.c lapacke_dsy_nancheck.c lapacke_ssy_nancheck.c lapacke_zsy_nancheck.c +lapacke_csy_trans.c lapacke_dsy_trans.c lapacke_ssy_trans.c lapacke_zsy_trans.c +lapacke_ctb_nancheck.c lapacke_dtb_nancheck.c lapacke_stb_nancheck.c lapacke_ztb_nancheck.c +lapacke_ctb_trans.c lapacke_dtb_trans.c lapacke_stb_trans.c lapacke_ztb_trans.c +lapacke_ctf_nancheck.c lapacke_dtf_nancheck.c lapacke_stf_nancheck.c lapacke_ztf_nancheck.c +lapacke_ctf_trans.c lapacke_dtf_trans.c lapacke_stf_trans.c lapacke_ztf_trans.c +lapacke_ctp_nancheck.c lapacke_dtp_nancheck.c lapacke_stp_nancheck.c lapacke_ztp_nancheck.c 
+lapacke_ctp_trans.c lapacke_dtp_trans.c lapacke_stp_trans.c lapacke_ztp_trans.c +lapacke_ctr_nancheck.c lapacke_dtr_nancheck.c lapacke_str_nancheck.c lapacke_ztr_nancheck.c +lapacke_ctr_trans.c lapacke_dtr_trans.c lapacke_str_trans.c lapacke_ztr_trans.c +lapacke_ctz_nancheck.c lapacke_dtz_nancheck.c lapacke_stz_nancheck.c lapacke_ztz_nancheck.c +lapacke_ctz_trans.c lapacke_dtz_trans.c lapacke_stz_trans.c lapacke_ztz_trans.c + +lapacke_make_complex_float.c lapacke_make_complex_double.c +lapacke_lsame.c +lapacke_xerbla.c ) diff --git a/lapack-netlib/LAPACKE/utils/Makefile b/lapack-netlib/LAPACKE/utils/Makefile index adc5736507..a1f8631071 100644 --- a/lapack-netlib/LAPACKE/utils/Makefile +++ b/lapack-netlib/LAPACKE/utils/Makefile @@ -76,6 +76,8 @@ OBJ = lapacke_cgb_nancheck.o \ lapacke_ctp_trans.o \ lapacke_ctr_nancheck.o \ lapacke_ctr_trans.o \ + lapacke_ctz_nancheck.o \ + lapacke_ctz_trans.o \ lapacke_dgb_nancheck.o \ lapacke_dgb_trans.o \ lapacke_dge_nancheck.o \ @@ -110,6 +112,8 @@ OBJ = lapacke_cgb_nancheck.o \ lapacke_dtp_trans.o \ lapacke_dtr_nancheck.o \ lapacke_dtr_trans.o \ + lapacke_dtz_nancheck.o \ + lapacke_dtz_trans.o \ lapacke_lsame.o \ lapacke_sgb_nancheck.o \ lapacke_sgb_trans.o \ @@ -145,6 +149,8 @@ OBJ = lapacke_cgb_nancheck.o \ lapacke_stp_trans.o \ lapacke_str_nancheck.o \ lapacke_str_trans.o \ + lapacke_stz_nancheck.o \ + lapacke_stz_trans.o \ lapacke_xerbla.o \ lapacke_zgb_nancheck.o \ lapacke_zgb_trans.o \ @@ -184,6 +190,8 @@ OBJ = lapacke_cgb_nancheck.o \ lapacke_ztp_trans.o \ lapacke_ztr_nancheck.o \ lapacke_ztr_trans.o \ + lapacke_ztz_nancheck.o \ + lapacke_ztz_trans.o \ lapacke_make_complex_float.o \ lapacke_make_complex_double.o diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ctz_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_ctz_nancheck.c new file mode 100644 index 0000000000..bea9567811 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_ctz_nancheck.c @@ -0,0 +1,144 @@ 
+/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Check a trapezoidal matrix for NaN entries. The shape of the trapezoidal + matrix is determined by the arguments `direct` and `uplo`. `Direct` chooses + the diagonal which shall be considered and `uplo` tells us whether we use the + upper or lower part of the matrix with respect to the chosen diagonal. + + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +lapack_logical LAPACKE_ctz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_float *a, + lapack_int lda ) +{ + lapack_logical colmaj, front, lower, unit; + + if( a == NULL ) return (lapack_logical) 0; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { 
+ /* Just exit if any of input parameters are wrong */ + return (lapack_logical) 0; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_offset = tri_n * ( !colmaj ? lda : 1 ); + } else if( !lower && n > m ) { + rect_offset = tri_n * ( colmaj ? lda : 1 ); + } + } else { + if( m > n ) { + tri_offset = rect_m * ( !colmaj ? lda : 1 ); + if( !lower ) { + rect_offset = 0; + } + } else if( n > m ) { + tri_offset = rect_n * ( colmaj ? lda : 1 ); + if( lower ) { + rect_offset = 0; + } + } + } + + /* Check rectangular part */ + if( rect_offset >= 0 ) { + if( LAPACKE_cge_nancheck( matrix_layout, rect_m, rect_n, + &a[rect_offset], lda) ) { + return (lapack_logical) 1; + } + } + + /* Check triangular part */ + return LAPACKE_ctr_nancheck( matrix_layout, uplo, diag, tri_n, + &a[tri_offset], lda ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c new file mode 100644 index 0000000000..48d346611c --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c @@ -0,0 +1,153 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Converts input triangular matrix from row-major(C) to column-major(Fortran) + layout or vice versa. The shape of the trapezoidal matrix is determined by + the arguments `direct` and `uplo`. `Direct` chooses the diagonal which shall + be considered and `uplo` tells us whether we use the upper or lower part of + the matrix with respect to the chosen diagonal. 
+ + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +void LAPACKE_ctz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_float *in, lapack_int ldin, + lapack_complex_float *out, lapack_int ldout ) +{ + lapack_logical colmaj, front, lower, unit; + + if( in == NULL || out == NULL ) return ; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_in_offset = 0; + lapack_int tri_out_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_in_offset = -1; + lapack_int rect_out_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_in_offset = tri_n * ( !colmaj ? ldin : 1 ); + rect_out_offset = tri_n * ( colmaj ? 
ldout : 1 ); + } else if( !lower && n > m ) { + rect_in_offset = tri_n * ( colmaj ? ldin : 1 ); + rect_out_offset = tri_n * ( !colmaj ? ldout : 1 ); + } + } else { + if( m > n ) { + tri_in_offset = rect_m * ( !colmaj ? ldin : 1 ); + tri_out_offset = rect_m * ( colmaj ? ldout : 1 ); + if( !lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } else if( n > m ) { + tri_in_offset = rect_n * ( colmaj ? ldin : 1 ); + tri_out_offset = rect_n * ( !colmaj ? ldout : 1 ); + if( lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } + } + + /* Copy & transpose rectangular part */ + if( rect_in_offset >= 0 && rect_out_offset >= 0 ) { + LAPACKE_cge_trans( matrix_layout, rect_m, rect_n, + &in[rect_in_offset], ldin, + &out[rect_out_offset], ldout ); + } + + /* Copy & transpose triangular part */ + LAPACKE_ctr_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dtz_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_dtz_nancheck.c new file mode 100644 index 0000000000..cd2ae6731a --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_dtz_nancheck.c @@ -0,0 +1,143 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Check a trapezoidal matrix for NaN entries. The shape of the trapezoidal + matrix is determined by the arguments `direct` and `uplo`. `Direct` chooses + the diagonal which shall be considered and `uplo` tells us whether we use the + upper or lower part of the matrix with respect to the chosen diagonal. 
+ + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +lapack_logical LAPACKE_dtz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const double *a, lapack_int lda ) +{ + lapack_logical colmaj, front, lower, unit; + + if( a == NULL ) return (lapack_logical) 0; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return (lapack_logical) 0; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_offset = tri_n * ( !colmaj ? lda : 1 ); + } else if( !lower && n > m ) { + rect_offset = tri_n * ( colmaj ? lda : 1 ); + } + } else { + if( m > n ) { + tri_offset = rect_m * ( !colmaj ? 
lda : 1 ); + if( !lower ) { + rect_offset = 0; + } + } else if( n > m ) { + tri_offset = rect_n * ( colmaj ? lda : 1 ); + if( lower ) { + rect_offset = 0; + } + } + } + + /* Check rectangular part */ + if( rect_offset >= 0 ) { + if( LAPACKE_dge_nancheck( matrix_layout, rect_m, rect_n, + &a[rect_offset], lda ) ) { + return (lapack_logical) 1; + } + } + + /* Check triangular part */ + return LAPACKE_dtr_nancheck( matrix_layout, uplo, diag, tri_n, + &a[tri_offset], lda ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c new file mode 100644 index 0000000000..b39000d42a --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c @@ -0,0 +1,153 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Converts input triangular matrix from row-major(C) to column-major(Fortran) + layout or vice versa. The shape of the trapezoidal matrix is determined by + the arguments `direct` and `uplo`. `Direct` chooses the diagonal which shall + be considered and `uplo` tells us whether we use the upper or lower part of + the matrix with respect to the chosen diagonal. 
+ + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +void LAPACKE_dtz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const double *in, lapack_int ldin, + double *out, lapack_int ldout ) +{ + lapack_logical colmaj, front, lower, unit; + + if( in == NULL || out == NULL ) return ; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_in_offset = 0; + lapack_int tri_out_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_in_offset = -1; + lapack_int rect_out_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_in_offset = tri_n * ( !colmaj ? ldin : 1 ); + rect_out_offset = tri_n * ( colmaj ? 
ldout : 1 ); + } else if( !lower && n > m ) { + rect_in_offset = tri_n * ( colmaj ? ldin : 1 ); + rect_out_offset = tri_n * ( !colmaj ? ldout : 1 ); + } + } else { + if( m > n ) { + tri_in_offset = rect_m * ( !colmaj ? ldin : 1 ); + tri_out_offset = rect_m * ( colmaj ? ldout : 1 ); + if( !lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } else if( n > m ) { + tri_in_offset = rect_n * ( colmaj ? ldin : 1 ); + tri_out_offset = rect_n * ( !colmaj ? ldout : 1 ); + if( lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } + } + + /* Copy & transpose rectangular part */ + if( rect_in_offset >= 0 && rect_out_offset >= 0 ) { + LAPACKE_dge_trans( matrix_layout, rect_m, rect_n, + &in[rect_in_offset], ldin, + &out[rect_out_offset], ldout ); + } + + /* Copy & transpose triangular part */ + LAPACKE_dtr_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_lsame.c b/lapack-netlib/LAPACKE/utils/lapacke_lsame.c index e4592ce114..3657e465e8 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_lsame.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_lsame.c @@ -32,9 +32,16 @@ #include "lapacke_utils.h" +#ifdef __EMSCRIPTEN__ +lapack_logical LAPACKE_lsame( char ca, char cb ) +{ + return (lapack_logical) LAPACK_lsame( &ca, &cb ); +} +#else lapack_logical LAPACKE_lsame( char ca, char cb ) { return (lapack_logical) LAPACK_lsame( &ca, &cb, 1, 1 ); } +#endif diff --git a/lapack-netlib/LAPACKE/utils/lapacke_stz_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_stz_nancheck.c new file mode 100644 index 0000000000..7d7c30f96c --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_stz_nancheck.c @@ -0,0 +1,143 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Check a trapezoidal matrix for NaN entries. The shape of the trapezoidal + matrix is determined by the arguments `direct` and `uplo`. 
`Direct` chooses + the diagonal which shall be considered and `uplo` tells us whether we use the + upper or lower part of the matrix with respect to the chosen diagonal. + + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +lapack_logical LAPACKE_stz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const float *a, lapack_int lda ) +{ + lapack_logical colmaj, front, lower, unit; + + if( a == NULL ) return (lapack_logical) 0; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return (lapack_logical) 0; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_offset = tri_n * ( !colmaj ? 
lda : 1 ); + } else if( !lower && n > m ) { + rect_offset = tri_n * ( colmaj ? lda : 1 ); + } + } else { + if( m > n ) { + tri_offset = rect_m * ( !colmaj ? lda : 1 ); + if( !lower ) { + rect_offset = 0; + } + } else if( n > m ) { + tri_offset = rect_n * ( colmaj ? lda : 1 ); + if( lower ) { + rect_offset = 0; + } + } + } + + /* Check rectangular part */ + if( rect_offset >= 0 ) { + if( LAPACKE_sge_nancheck( matrix_layout, rect_m, rect_n, + &a[rect_offset], lda) ) { + return (lapack_logical) 1; + } + } + + /* Check triangular part */ + return LAPACKE_str_nancheck( matrix_layout, uplo, diag, tri_n, + &a[tri_offset], lda ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c new file mode 100644 index 0000000000..cffee6c986 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c @@ -0,0 +1,153 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Converts input triangular matrix from row-major(C) to column-major(Fortran) + layout or vice versa. The shape of the trapezoidal matrix is determined by + the arguments `direct` and `uplo`. `Direct` chooses the diagonal which shall + be considered and `uplo` tells us whether we use the upper or lower part of + the matrix with respect to the chosen diagonal. 
+ + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +void LAPACKE_stz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const float *in, lapack_int ldin, + float *out, lapack_int ldout ) +{ + lapack_logical colmaj, front, lower, unit; + + if( in == NULL || out == NULL ) return ; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_in_offset = 0; + lapack_int tri_out_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_in_offset = -1; + lapack_int rect_out_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_in_offset = tri_n * ( !colmaj ? ldin : 1 ); + rect_out_offset = tri_n * ( colmaj ? 
ldout : 1 ); + } else if( !lower && n > m ) { + rect_in_offset = tri_n * ( colmaj ? ldin : 1 ); + rect_out_offset = tri_n * ( !colmaj ? ldout : 1 ); + } + } else { + if( m > n ) { + tri_in_offset = rect_m * ( !colmaj ? ldin : 1 ); + tri_out_offset = rect_m * ( colmaj ? ldout : 1 ); + if( !lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } else if( n > m ) { + tri_in_offset = rect_n * ( colmaj ? ldin : 1 ); + tri_out_offset = rect_n * ( !colmaj ? ldout : 1 ); + if( lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } + } + + /* Copy & transpose rectangular part */ + if( rect_in_offset >= 0 && rect_out_offset >= 0 ) { + LAPACKE_sge_trans( matrix_layout, rect_m, rect_n, + &in[rect_in_offset], ldin, + &out[rect_out_offset], ldout ); + } + + /* Copy & transpose triangular part */ + LAPACKE_str_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ztz_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_ztz_nancheck.c new file mode 100644 index 0000000000..481fa4c033 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_ztz_nancheck.c @@ -0,0 +1,144 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Check a trapezoidal matrix for NaN entries. The shape of the trapezoidal + matrix is determined by the arguments `direct` and `uplo`. `Direct` chooses + the diagonal which shall be considered and `uplo` tells us whether we use the + upper or lower part of the matrix with respect to the chosen diagonal. 
+ + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +lapack_logical LAPACKE_ztz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_double *a, + lapack_int lda ) +{ + lapack_logical colmaj, front, lower, unit; + + if( a == NULL ) return (lapack_logical) 0; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return (lapack_logical) 0; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_offset = tri_n * ( !colmaj ? lda : 1 ); + } else if( !lower && n > m ) { + rect_offset = tri_n * ( colmaj ? lda : 1 ); + } + } else { + if( m > n ) { + tri_offset = rect_m * ( !colmaj ? 
lda : 1 ); + if( !lower ) { + rect_offset = 0; + } + } else if( n > m ) { + tri_offset = rect_n * ( colmaj ? lda : 1 ); + if( lower ) { + rect_offset = 0; + } + } + } + + /* Check rectangular part */ + if( rect_offset >= 0 ) { + if( LAPACKE_zge_nancheck( matrix_layout, rect_m, rect_n, + &a[rect_offset], lda) ) { + return (lapack_logical) 1; + } + } + + /* Check triangular part */ + return LAPACKE_ztr_nancheck( matrix_layout, uplo, diag, tri_n, + &a[tri_offset], lda ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c new file mode 100644 index 0000000000..faef6da504 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c @@ -0,0 +1,153 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Converts input triangular matrix from row-major(C) to column-major(Fortran) + layout or vice versa. The shape of the trapezoidal matrix is determined by + the arguments `direct` and `uplo`. `Direct` chooses the diagonal which shall + be considered and `uplo` tells us whether we use the upper or lower part of + the matrix with respect to the chosen diagonal. 
+ + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +void LAPACKE_ztz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_double *in, lapack_int ldin, + lapack_complex_double *out, lapack_int ldout ) +{ + lapack_logical colmaj, front, lower, unit; + + if( in == NULL || out == NULL ) return ; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_in_offset = 0; + lapack_int tri_out_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_in_offset = -1; + lapack_int rect_out_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_in_offset = tri_n * ( !colmaj ? ldin : 1 ); + rect_out_offset = tri_n * ( colmaj ? 
ldout : 1 ); + } else if( !lower && n > m ) { + rect_in_offset = tri_n * ( colmaj ? ldin : 1 ); + rect_out_offset = tri_n * ( !colmaj ? ldout : 1 ); + } + } else { + if( m > n ) { + tri_in_offset = rect_m * ( !colmaj ? ldin : 1 ); + tri_out_offset = rect_m * ( colmaj ? ldout : 1 ); + if( !lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } else if( n > m ) { + tri_in_offset = rect_n * ( colmaj ? ldin : 1 ); + tri_out_offset = rect_n * ( !colmaj ? ldout : 1 ); + if( lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } + } + + /* Copy & transpose rectangular part */ + if( rect_in_offset >= 0 && rect_out_offset >= 0 ) { + LAPACKE_zge_trans( matrix_layout, rect_m, rect_n, + &in[rect_in_offset], ldin, + &out[rect_out_offset], ldout ); + } + + /* Copy & transpose triangular part */ + LAPACKE_ztr_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); +} diff --git a/lapack-netlib/README.md b/lapack-netlib/README.md index f0aed6c185..142aa7b720 100644 --- a/lapack-netlib/README.md +++ b/lapack-netlib/README.md @@ -1,6 +1,8 @@ # LAPACK [![Build Status](https://travis-ci.org/Reference-LAPACK/lapack.svg?branch=master)](https://travis-ci.org/Reference-LAPACK/lapack) +![CMake](/~https://github.com/Reference-LAPACK/lapack/actions/workflows/cmake.yml/badge.svg) +![Makefile](/~https://github.com/Reference-LAPACK/lapack/actions/workflows/makefile.yml/badge.svg) [![Appveyor](https://ci.appveyor.com/api/projects/status/bh38iin398msrbtr?svg=true)](https://ci.appveyor.com/project/langou/lapack/) [![codecov](https://codecov.io/gh/Reference-LAPACK/lapack/branch/master/graph/badge.svg)](https://codecov.io/gh/Reference-LAPACK/lapack) [![Packaging status](https://repology.org/badge/tiny-repos/lapack.svg)](https://repology.org/metapackage/lapack/versions) @@ -31,6 +33,10 @@ * VERSION 3.7.1 : June 2017 * VERSION 3.8.0 : November 2017 * VERSION 3.9.0 : November 2019 +* VERSION 3.9.1 : April 2021 +* VERSION 3.10.0 : June 2021 
+* VERSION 3.10.1 : April 2022 +* VERSION 3.11.0 : November 2022 LAPACK is a library of Fortran subroutines for solving the most commonly occurring problems in numerical linear algebra. @@ -38,8 +44,7 @@ occurring problems in numerical linear algebra. LAPACK is a freely-available software package. It can be included in commercial software packages (and has been). We only ask that that proper credit be given to the authors, for example by citing the LAPACK Users' Guide. The license used -for the software is the modified BSD license, see: -/~https://github.com/Reference-LAPACK/lapack/blob/master/LICENSE +for the software is the [modified BSD license](/~https://github.com/Reference-LAPACK/lapack/blob/master/LICENSE). Like all software, it is copyrighted. It is not trademarked, but we do ask the following: if you modify the source for these routines we ask that you change @@ -49,11 +54,8 @@ We will gladly answer any questions regarding the software. If a modification is done, however, it is the responsibility of the person who modified the routine to provide support. -LAPACK is available from github at: -/~https://github.com/reference-lapack/lapack - -LAPACK releases are also available on netlib at: -http://www.netlib.org/lapack/ +LAPACK is [available from GitHub](/~https://github.com/Reference-LAPACK/lapack). +LAPACK releases are also [available on netlib](http://www.netlib.org/lapack/). The distribution contains (1) the Fortran source for LAPACK, and (2) its testing programs. It also contains (3) the Fortran reference implementation of @@ -65,24 +67,29 @@ CBLAS, a C interface to the BLAS, and (5) LAPACKE, a C interface to LAPACK. ## Installation - - LAPACK can be installed with `make`. The configuration have to be set in the + - LAPACK can be installed with `make`. The configuration must be set in the `make.inc` file. A `make.inc.example` for a Linux machine running GNU compilers is given in the main directory. 
Some specific `make.inc` are also available in the `INSTALL` directory. - - LAPACK includes also the CMake build. You will need to have CMake installed - on your machine (CMake is available at http://www.cmake.org/). CMake will - allow an easy installation on a Windows Machine. - An example CMake build is: + - LAPACK includes also the [CMake](https://cmake.org/) build. You will need + to have CMake installed on your machine. CMake will allow an easy + installation on a Windows Machine. An example CMake build to install the + LAPACK library under `$HOME/.local/lapack/` is: ```sh mkdir build cd build cmake -DCMAKE_INSTALL_LIBDIR=$HOME/.local/lapack .. - cmake --build -j . --target install + cmake --build . -j --target install ``` - That installs the LAPACK library under $HOME/.local/lapack/ - - Specific information to run LAPACK under Windows is available at - http://icl.cs.utk.edu/lapack-for-windows/lapack/. - + - LAPACK can be built and installed using [vcpkg](/~https://github.com/Microsoft/vcpkg/) dependency manager: + ```sh + git clone /~https://github.com/Microsoft/vcpkg.git + cd vcpkg + ./bootstrap-vcpkg.sh # ./bootstrap-vcpkg.bat for Windows + ./vcpkg integrate install + ./vcpkg install lapack + ``` + The lapack port in vcpkg is kept up to date by Microsoft team members and community contributors. If the version is out of date, please [create an issue or pull request](/~https://github.com/Microsoft/vcpkg) on the vcpkg repository. ## User Support @@ -90,20 +97,17 @@ LAPACK has been thoroughly tested, on many different types of computers. The LAPACK project supports the package in the sense that reports of errors or poor performance will gain immediate attention from the developers. Such reports, descriptions of interesting applications, and other comments should be sent by -electronic mail to lapack@icl.utk.edu. - -For further information on LAPACK please read our FAQ at -http://www.netlib.org/lapack/#_faq. 
+email to [the LAPACK team](mailto:lapack@icl.utk.edu). A list of known problems, bugs, and compiler errors for LAPACK is -maintained on netlib -http://www.netlib.org/lapack/release_notes.html. -Please see as well -/~https://github.com/Reference-LAPACK/lapack/issues. +[maintained on netlib](http://www.netlib.org/lapack/release_notes.html). +Please see as well the [GitHub issue tracker](/~https://github.com/Reference-LAPACK/lapack/issues). -A User forum is also available to help you with the LAPACK library at -http://icl.cs.utk.edu/lapack-forum/. -You can also contact directly the LAPACK team at lapack@icl.utk.edu. +For further information on LAPACK please read our [FAQ](http://www.netlib.org/lapack/faq.html) +and [Users' Guide](http://www.netlib.org/lapack/lug/lapack_lug.html). +A [user forum](http://icl.cs.utk.edu/lapack-forum/) and specific information for +[running LAPACK under Windows](http://icl.cs.utk.edu/lapack-for-windows/lapack/). +is also available to help you with the LAPACK library. ## Testing @@ -114,14 +118,9 @@ you run the test suite. For complete information on the LAPACK Testing please consult LAPACK Working Note 41 "Installation Guide for LAPACK". -## User Guide - -To view an HTML version of the Users' Guide please refer to the URL - http://www.netlib.org/lapack/lug/lapack_lug.html. ## LAPACKE -LAPACK now includes the LAPACKE package. LAPACKE is a Standard C language API -for LAPACK This was born from a collaboration of the LAPACK and INTEL Math -Kernel Library teams. See: -http://www.netlib.org/lapack/#_standard_c_language_apis_for_lapack. +LAPACK now includes the [LAPACKE](http://www.netlib.org/lapack/lapacke.html) +package. LAPACKE is a Standard C language API for LAPACK that was born from a +collaboration of the LAPACK and INTEL Math Kernel Library teams. 
diff --git a/lapack-netlib/SRC/DEPRECATED/cgegs.c b/lapack-netlib/SRC/DEPRECATED/cgegs.c index 45e2f5e63d..35b59e683c 100644 --- a/lapack-netlib/SRC/DEPRECATED/cgegs.c +++ b/lapack-netlib/SRC/DEPRECATED/cgegs.c @@ -740,7 +740,7 @@ rices */ /* > \ingroup complexGEeigen */ /* ===================================================================== */ -/* Subroutine */ int cgegs_(char *jobvsl, char *jobvsr, integer *n, complex * +/* Subroutine */ void cgegs_(char *jobvsl, char *jobvsr, integer *n, complex * a, integer *lda, complex *b, integer *ldb, complex *alpha, complex * beta, complex *vsl, integer *ldvsl, complex *vsr, integer *ldvsr, complex *work, integer *lwork, real *rwork, integer *info) @@ -758,7 +758,7 @@ rices */ integer iwork; logical ilvsr; integer irows; - extern /* Subroutine */ int cggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cggbak_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, complex *, integer *, integer *), cggbal_(char *, integer *, complex *, integer *, complex *, integer *, integer *, integer *, real *, @@ -766,24 +766,24 @@ rices */ integer nb; extern real clange_(char *, integer *, integer *, complex *, integer *, real *); - extern /* Subroutine */ int cgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgghrd_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *), clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *); logical ilascl, ilbscl; - extern /* Subroutine */ int cgeqrf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqrf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, 
complex *, integer *, complex *, integer *), claset_(char *, integer *, integer *, complex *, complex *, complex *, integer *); real safmin; - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); real bignum; - extern /* Subroutine */ int chgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void chgeqz_(char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, complex *, complex *, integer *, complex *, integer *, complex *, integer *, real *, integer *); @@ -791,7 +791,7 @@ rices */ real anrmto; integer lwkmin, nb1, nb2, nb3; real bnrmto; - extern /* Subroutine */ int cungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void cungqr_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), cunmqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer @@ -898,16 +898,16 @@ rices */ if (*info != 0) { i__1 = -(*info); - xerbla_("CGEGS ", &i__1); - return 0; + xerbla_("CGEGS ", &i__1, 6); + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -934,7 +934,7 @@ rices */ iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } } @@ -955,7 +955,7 @@ rices */ iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } } @@ -1092,13 +1092,13 @@ rices */ iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } clascl_("G", &c_n1, &c_n1, &anrmto, &anrm, n, &c__1, &alpha[1], n, & iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } } @@ -1107,20 +1107,20 @@ rices */ iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } clascl_("G", &c_n1, &c_n1, &bnrmto, &bnrm, 
n, &c__1, &beta[1], n, & iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } } L10: work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CGEGS */ diff --git a/lapack-netlib/SRC/DEPRECATED/cgegv.c b/lapack-netlib/SRC/DEPRECATED/cgegv.c index 06b85adc43..5a5ea919fd 100644 --- a/lapack-netlib/SRC/DEPRECATED/cgegv.c +++ b/lapack-netlib/SRC/DEPRECATED/cgegv.c @@ -799,7 +799,7 @@ rices */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgegv_(char *jobvl, char *jobvr, integer *n, complex *a, +/* Subroutine */ void cgegv_(char *jobvl, char *jobvr, integer *n, complex *a, integer *lda, complex *b, integer *ldb, complex *alpha, complex *beta, complex *vl, integer *ldvl, complex *vr, integer *ldvr, complex * work, integer *lwork, real *rwork, integer *info) @@ -819,7 +819,7 @@ rices */ real anrm1, anrm2, bnrm1, bnrm2, absai, scale, absar, sbeta; extern logical lsame_(char *, char *); integer ileft, iinfo, icols, iwork, irows, jc; - extern /* Subroutine */ int cggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cggbak_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, complex *, integer *, integer *), cggbal_(char *, integer *, complex *, integer *, complex *, integer *, integer *, integer *, real *, @@ -828,40 +828,40 @@ rices */ extern real clange_(char *, integer *, integer *, complex *, integer *, real *); integer jr; - extern /* Subroutine */ int cgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgghrd_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); real salfai; - extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *), 
cgeqrf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); real salfar; extern real slamch_(char *); - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, integer *, integer *, complex *, complex *, complex *, integer *); real safmin; - extern /* Subroutine */ int ctgevc_(char *, char *, logical *, integer *, + extern /* Subroutine */ void ctgevc_(char *, char *, logical *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *, integer *, complex *, real *, integer *); real safmax; char chtemp[1]; logical ldumma[1]; - extern /* Subroutine */ int chgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void chgeqz_(char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, complex *, complex *, integer *, complex *, integer *, - complex *, integer *, real *, integer *), - xerbla_(char *, integer *); + complex *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ijobvl, iright; logical ilimit; integer ijobvr; - extern /* Subroutine */ int cungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void cungqr_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); integer lwkmin, nb1, nb2, nb3; - extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); integer irwork, lwkopt; @@ -969,16 +969,16 @@ rices */ if (*info != 0) { i__1 = -(*info); - xerbla_("CGEGV ", &i__1); - 
return 0; + xerbla_("CGEGV ", &i__1, 6); + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1005,7 +1005,7 @@ rices */ iinfo); if (iinfo != 0) { *info = *n + 10; - return 0; + return; } } @@ -1026,7 +1026,7 @@ rices */ iinfo); if (iinfo != 0) { *info = *n + 10; - return 0; + return; } } @@ -1353,7 +1353,7 @@ rices */ L80: work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CGEGV */ diff --git a/lapack-netlib/SRC/DEPRECATED/cgelsx.c b/lapack-netlib/SRC/DEPRECATED/cgelsx.c index e6c742db40..bdc395acf5 100644 --- a/lapack-netlib/SRC/DEPRECATED/cgelsx.c +++ b/lapack-netlib/SRC/DEPRECATED/cgelsx.c @@ -700,7 +700,7 @@ f"> */ /* > \ingroup complexGEsolve */ /* ===================================================================== */ -/* Subroutine */ int cgelsx_(integer *m, integer *n, integer *nrhs, complex * +/* Subroutine */ void cgelsx_(integer *m, integer *n, integer *nrhs, complex * a, integer *lda, complex *b, integer *ldb, integer *jpvt, real *rcond, integer *rank, complex *work, real *rwork, integer *info) { @@ -712,31 +712,31 @@ f"> */ real anrm, bnrm, smin, smax; integer i__, j, k, iascl, ibscl, ismin, ismax; complex c1, c2; - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *), claic1_(integer *, integer *, complex *, real *, complex *, complex *, real *, complex *, complex *); complex s1, s2, t1, t2; - extern /* Subroutine */ int cunm2r_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunm2r_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *), slabad_(real *, real *); extern real clange_(char *, integer *, integer *, complex *, integer *, real *); integer mn; - extern /* 
Subroutine */ int clascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *), cgeqpf_(integer *, integer *, complex *, integer *, integer *, complex *, complex *, real *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int claset_(char *, integer *, integer *, complex - *, complex *, complex *, integer *), xerbla_(char *, - integer *); + extern /* Subroutine */ void claset_(char *, integer *, integer *, complex + *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real bignum; - extern /* Subroutine */ int clatzm_(char *, integer *, integer *, complex + extern /* Subroutine */ void clatzm_(char *, integer *, integer *, complex *, integer *, complex *, complex *, complex *, integer *, complex *); real sminpr; - extern /* Subroutine */ int ctzrqf_(integer *, integer *, complex *, + extern /* Subroutine */ void ctzrqf_(integer *, integer *, complex *, integer *, complex *, integer *); real smaxpr, smlnum; @@ -787,8 +787,8 @@ f"> */ if (*info != 0) { i__1 = -(*info); - xerbla_("CGELSX", &i__1); - return 0; + xerbla_("CGELSX", &i__1, 6); + return; } /* Quick return if possible */ @@ -797,7 +797,7 @@ f"> */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*nrhs) == 0) { *rank = 0; - return 0; + return; } /* Get machine parameters */ @@ -1029,7 +1029,7 @@ f"> */ L100: - return 0; + return; /* End of CGELSX */ diff --git a/lapack-netlib/SRC/DEPRECATED/cgeqpf.c b/lapack-netlib/SRC/DEPRECATED/cgeqpf.c index d68356d4ec..6c810fc795 100644 --- a/lapack-netlib/SRC/DEPRECATED/cgeqpf.c +++ b/lapack-netlib/SRC/DEPRECATED/cgeqpf.c @@ -661,7 +661,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgeqpf_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void cgeqpf_(integer *m, integer *n, complex *a, integer 
*lda, integer *jpvt, complex *tau, complex *work, real *rwork, integer * info) { @@ -674,21 +674,21 @@ f"> */ real temp, temp2; integer i__, j; real tol3z; - extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex * + extern /* Subroutine */ void clarf_(char *, integer *, integer *, complex * , integer *, complex *, complex *, integer *, complex *), cswap_(integer *, complex *, integer *, complex *, integer *); integer itemp; - extern /* Subroutine */ int cgeqr2_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqr2_(integer *, integer *, complex *, integer *, complex *, complex *, integer *); extern real scnrm2_(integer *, complex *, integer *); - extern /* Subroutine */ int cunm2r_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunm2r_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *); integer ma, mn; - extern /* Subroutine */ int clarfg_(integer *, complex *, complex *, + extern /* Subroutine */ void clarfg_(integer *, complex *, complex *, integer *, complex *); extern real slamch_(char *); - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer isamax_(integer *, real *, integer *); complex aii; integer pvt; @@ -725,8 +725,8 @@ f"> */ } if (*info != 0) { i__1 = -(*info); - xerbla_("CGEQPF", &i__1); - return 0; + xerbla_("CGEQPF", &i__1, 6); + return; } mn = f2cmin(*m,*n); @@ -865,7 +865,7 @@ f"> */ /* L40: */ } } - return 0; + return; /* End of CGEQPF */ diff --git a/lapack-netlib/SRC/DEPRECATED/cggsvd.c b/lapack-netlib/SRC/DEPRECATED/cggsvd.c index 31d0f66266..d6f7b5903e 100644 --- a/lapack-netlib/SRC/DEPRECATED/cggsvd.c +++ b/lapack-netlib/SRC/DEPRECATED/cggsvd.c @@ -849,7 +849,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cggsvd_(char 
*jobu, char *jobv, char *jobq, integer *m, +/* Subroutine */ void cggsvd_(char *jobu, char *jobv, char *jobq, integer *m, integer *n, integer *p, integer *k, integer *l, complex *a, integer * lda, complex *b, integer *ldb, real *alpha, real *beta, complex *u, integer *ldu, complex *v, integer *ldv, complex *q, integer *ldq, @@ -868,17 +868,18 @@ f"> */ extern logical lsame_(char *, char *); real anorm, bnorm; logical wantq; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); logical wantu, wantv; extern real clange_(char *, integer *, integer *, complex *, integer *, real *), slamch_(char *); - extern /* Subroutine */ int ctgsja_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctgsja_(char *, char *, char *, integer *, integer *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, real *, real *, real *, real *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, - integer *, integer *), xerbla_(char *, - integer *), cggsvp_(char *, char *, char *, integer *, + integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void cggsvp_(char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, real *, real *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *, real *, @@ -950,8 +951,8 @@ f"> */ } if (*info != 0) { i__1 = -(*info); - xerbla_("CGGSVD", &i__1); - return 0; + xerbla_("CGGSVD", &i__1, 6); + return; } /* Compute the Frobenius norm of matrices A and B */ @@ -1011,7 +1012,7 @@ f"> */ /* L20: */ } - return 0; + return; /* End of CGGSVD */ diff --git a/lapack-netlib/SRC/DEPRECATED/cggsvp.c b/lapack-netlib/SRC/DEPRECATED/cggsvp.c index 75cf645c09..d29338f730 100644 --- a/lapack-netlib/SRC/DEPRECATED/cggsvp.c +++ b/lapack-netlib/SRC/DEPRECATED/cggsvp.c @@ -774,7 +774,7 @@ f"> */ 
/* > a matrix. It may be replaced by a better rank determination strategy. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cggsvp_(char *jobu, char *jobv, char *jobq, integer *m, +/* Subroutine */ void cggsvp_(char *jobu, char *jobv, char *jobq, integer *m, integer *p, integer *n, complex *a, integer *lda, complex *b, integer *ldb, real *tola, real *tolb, integer *k, integer *l, complex *u, integer *ldu, complex *v, integer *ldv, complex *q, integer *ldq, @@ -790,7 +790,7 @@ f"> */ integer i__, j; extern logical lsame_(char *, char *); logical wantq, wantu, wantv; - extern /* Subroutine */ int cgeqr2_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqr2_(integer *, integer *, complex *, integer *, complex *, complex *, integer *), cgerq2_(integer *, integer *, complex *, integer *, complex *, complex *, integer *), cung2r_(integer *, integer *, integer *, complex *, integer *, @@ -802,8 +802,9 @@ f"> */ integer *, complex *, complex *, real *, integer *), clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, integer *, integer *, complex *, - complex *, complex *, integer *), xerbla_(char *, integer - *), clapmt_(logical *, integer *, integer *, complex *, + complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void clapmt_(logical *, integer *, integer *, complex *, integer *, integer *); logical forwrd; @@ -872,8 +873,8 @@ f"> */ } if (*info != 0) { i__1 = -(*info); - xerbla_("CGGSVP", &i__1); - return 0; + xerbla_("CGGSVP", &i__1, 6); + return; } /* QR with column pivoting of B: B*P = V*( S11 S12 ) */ @@ -1131,7 +1132,7 @@ f"> */ } - return 0; + return; /* End of CGGSVP */ diff --git a/lapack-netlib/SRC/DEPRECATED/clahrd.c b/lapack-netlib/SRC/DEPRECATED/clahrd.c index 0f90d28de4..ea62964f8e 100644 --- a/lapack-netlib/SRC/DEPRECATED/clahrd.c +++ b/lapack-netlib/SRC/DEPRECATED/clahrd.c @@ -684,7 +684,7 @@ 
f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clahrd_(integer *n, integer *k, integer *nb, complex *a, +/* Subroutine */ void clahrd_(integer *n, integer *k, integer *nb, complex *a, integer *lda, complex *tau, complex *t, integer *ldt, complex *y, integer *ldy) { @@ -695,7 +695,7 @@ f"> */ /* Local variables */ integer i__; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), cgemv_(char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *), ccopy_(integer *, complex *, integer *, @@ -703,7 +703,7 @@ f"> */ integer *, complex *, integer *), ctrmv_(char *, char *, char *, integer *, complex *, integer *, complex *, integer *); complex ei; - extern /* Subroutine */ int clarfg_(integer *, complex *, complex *, + extern /* Subroutine */ void clarfg_(integer *, complex *, complex *, integer *, complex *), clacgv_(integer *, complex *, integer *); @@ -732,7 +732,7 @@ f"> */ /* Function Body */ if (*n <= 1) { - return 0; + return; } i__1 = *nb; @@ -856,7 +856,7 @@ f"> */ i__1 = *k + *nb + *nb * a_dim1; a[i__1].r = ei.r, a[i__1].i = ei.i; - return 0; + return; /* End of CLAHRD */ diff --git a/lapack-netlib/SRC/DEPRECATED/clatzm.c b/lapack-netlib/SRC/DEPRECATED/clatzm.c index 2b727ce6be..30393dc26f 100644 --- a/lapack-netlib/SRC/DEPRECATED/clatzm.c +++ b/lapack-netlib/SRC/DEPRECATED/clatzm.c @@ -666,7 +666,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int clatzm_(char *side, integer *m, integer *n, complex *v, +/* Subroutine */ void clatzm_(char *side, integer *m, integer *n, complex *v, integer *incv, complex *tau, complex *c1, complex *c2, integer *ldc, complex *work) { @@ -675,12 +675,12 @@ f"> */ complex q__1; /* Local 
variables */ - extern /* Subroutine */ int cgerc_(integer *, integer *, complex *, + extern /* Subroutine */ void cgerc_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *), cgemv_(char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgeru_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeru_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *), ccopy_(integer *, complex *, integer *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer *, complex *, @@ -708,7 +708,7 @@ f"> */ /* Function Body */ if (f2cmin(*m,*n) == 0 || tau->r == 0.f && tau->i == 0.f) { - return 0; + return; } if (lsame_(side, "L")) { @@ -751,7 +751,7 @@ f"> */ ldc); } - return 0; + return; /* End of CLATZM */ diff --git a/lapack-netlib/SRC/DEPRECATED/ctzrqf.c b/lapack-netlib/SRC/DEPRECATED/ctzrqf.c index 7ac23fcad2..537e77dc68 100644 --- a/lapack-netlib/SRC/DEPRECATED/ctzrqf.c +++ b/lapack-netlib/SRC/DEPRECATED/ctzrqf.c @@ -652,7 +652,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctzrqf_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void ctzrqf_(integer *m, integer *n, complex *a, integer *lda, complex *tau, integer *info) { /* System generated locals */ @@ -661,18 +661,18 @@ f"> */ /* Local variables */ integer i__, k; - extern /* Subroutine */ int cgerc_(integer *, integer *, complex *, + extern /* Subroutine */ void cgerc_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *); complex alpha; - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex 
*, integer *, complex *, complex * , integer *), ccopy_(integer *, complex *, integer *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); integer m1; - extern /* Subroutine */ int clarfg_(integer *, complex *, complex *, - integer *, complex *), clacgv_(integer *, complex *, integer *), - xerbla_(char *, integer *); + extern /* Subroutine */ void clarfg_(integer *, complex *, complex *, + integer *, complex *), clacgv_(integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -703,14 +703,14 @@ f"> */ } if (*info != 0) { i__1 = -(*info); - xerbla_("CTZRQF", &i__1); - return 0; + xerbla_("CTZRQF", &i__1, 6); + return; } /* Perform the factorization. */ if (*m == 0) { - return 0; + return; } if (*m == *n) { i__1 = *n; @@ -783,7 +783,7 @@ f"> */ } } - return 0; + return; /* End of CTZRQF */ diff --git a/lapack-netlib/SRC/DEPRECATED/dgegs.c b/lapack-netlib/SRC/DEPRECATED/dgegs.c index f67d623a88..70fb1aa3b4 100644 --- a/lapack-netlib/SRC/DEPRECATED/dgegs.c +++ b/lapack-netlib/SRC/DEPRECATED/dgegs.c @@ -742,7 +742,7 @@ rices */ /* > \ingroup doubleGEeigen */ /* ===================================================================== */ -/* Subroutine */ int dgegs_(char *jobvsl, char *jobvsr, integer *n, +/* Subroutine */ void dgegs_(char *jobvsl, char *jobvsr, integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal * alphar, doublereal *alphai, doublereal *beta, doublereal *vsl, integer *ldvsl, doublereal *vsr, integer *ldvsr, doublereal *work, @@ -761,45 +761,45 @@ rices */ integer iwork; logical ilvsr; integer irows; - extern /* Subroutine */ int dggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dggbak_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer nb; - extern /* Subroutine */ int dggbal_(char *, 
integer *, doublereal *, + extern /* Subroutine */ void dggbal_(char *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *); extern doublereal dlamch_(char *), dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgghrd_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *), dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); logical ilascl, ilbscl; - extern /* Subroutine */ int dgeqrf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeqrf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *); doublereal safmin; - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, - doublereal *, doublereal *, doublereal *, integer *), - xerbla_(char *, integer *); + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, + doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); doublereal bignum; - extern /* Subroutine */ int dhgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void dhgeqz_(char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer ijobvl, iright, ijobvr; - extern /* Subroutine */ int dorgqr_(integer *, integer *, integer *, + extern /* Subroutine 
*/ void dorgqr_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); doublereal anrmto; integer lwkmin, nb1, nb2, nb3; doublereal bnrmto; - extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); doublereal smlnum; @@ -904,16 +904,16 @@ rices */ if (*info != 0) { i__1 = -(*info); - xerbla_("DGEGS ", &i__1); - return 0; + xerbla_("DGEGS ", &i__1, 6); + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -940,7 +940,7 @@ rices */ iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } } @@ -961,7 +961,7 @@ rices */ iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } } @@ -1098,19 +1098,19 @@ rices */ iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } dlascl_("G", &c_n1, &c_n1, &anrmto, &anrm, n, &c__1, &alphar[1], n, & iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } dlascl_("G", &c_n1, &c_n1, &anrmto, &anrm, n, &c__1, &alphai[1], n, & iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } } @@ -1119,20 +1119,20 @@ rices */ iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } dlascl_("G", &c_n1, &c_n1, &bnrmto, &bnrm, n, &c__1, &beta[1], n, & iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } } L10: work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DGEGS */ diff --git a/lapack-netlib/SRC/DEPRECATED/dgegv.c b/lapack-netlib/SRC/DEPRECATED/dgegv.c index d5ba764630..c52b4b7d21 100644 --- a/lapack-netlib/SRC/DEPRECATED/dgegv.c +++ b/lapack-netlib/SRC/DEPRECATED/dgegv.c @@ -822,7 +822,7 @@ rices */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* 
Subroutine */ int dgegv_(char *jobvl, char *jobvr, integer *n, doublereal * +/* Subroutine */ void dgegv_(char *jobvl, char *jobvr, integer *n, doublereal * a, integer *lda, doublereal *b, integer *ldb, doublereal *alphar, doublereal *alphai, doublereal *beta, doublereal *vl, integer *ldvl, doublereal *vr, integer *ldvr, doublereal *work, integer *lwork, @@ -842,11 +842,11 @@ rices */ doublereal anrm1, anrm2, bnrm1, bnrm2, absai, scale, absar, sbeta; extern logical lsame_(char *, char *); integer ileft, iinfo, icols, iwork, irows, jc; - extern /* Subroutine */ int dggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dggbak_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer nb; - extern /* Subroutine */ int dggbal_(char *, integer *, doublereal *, + extern /* Subroutine */ void dggbal_(char *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *); integer in; @@ -854,42 +854,42 @@ rices */ integer *, doublereal *, integer *, doublereal *); integer jr; doublereal salfai; - extern /* Subroutine */ int dgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgghrd_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *), dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); doublereal salfar; - extern /* Subroutine */ int dgeqrf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeqrf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *); doublereal safmin; - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, + extern /* 
Subroutine */ void dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *); doublereal safmax; char chtemp[1]; logical ldumma[1]; - extern /* Subroutine */ int dhgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void dhgeqz_(char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *), dtgevc_(char *, char *, logical *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, - integer *, integer *, doublereal *, integer *), - xerbla_(char *, integer *); + integer *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer ijobvl, iright; logical ilimit; extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ijobvr; - extern /* Subroutine */ int dorgqr_(integer *, integer *, integer *, + extern /* Subroutine */ void dorgqr_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); doublereal onepls; integer lwkmin, nb1, nb2, nb3; - extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer lwkopt; @@ -997,16 +997,16 @@ rices */ if (*info != 0) { i__1 = -(*info); - xerbla_("DGEGV ", &i__1); - return 0; + xerbla_("DGEGV ", &i__1, 6); + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1034,7 +1034,7 @@ rices */ iinfo); if (iinfo != 0) { *info = *n + 10; - return 0; + return; } } @@ -1055,7 +1055,7 @@ rices */ iinfo); if (iinfo != 0) 
{ *info = *n + 10; - return 0; + return; } } @@ -1426,7 +1426,7 @@ rices */ L120: work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DGEGV */ diff --git a/lapack-netlib/SRC/DEPRECATED/dgelsx.c b/lapack-netlib/SRC/DEPRECATED/dgelsx.c index ecc4c53e42..86977c94ee 100644 --- a/lapack-netlib/SRC/DEPRECATED/dgelsx.c +++ b/lapack-netlib/SRC/DEPRECATED/dgelsx.c @@ -694,7 +694,7 @@ f"> */ /* > \ingroup doubleGEsolve */ /* ===================================================================== */ -/* Subroutine */ int dgelsx_(integer *m, integer *n, integer *nrhs, +/* Subroutine */ void dgelsx_(integer *m, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal *b, integer *ldb, integer * jpvt, doublereal *rcond, integer *rank, doublereal *work, integer * info) @@ -707,32 +707,32 @@ f"> */ doublereal anrm, bnrm, smin, smax; integer i__, j, k, iascl, ibscl, ismin, ismax; doublereal c1, c2; - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), dlaic1_( integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal s1, s2, t1, t2; - extern /* Subroutine */ int dorm2r_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dorm2r_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), dlabad_( doublereal *, doublereal *); extern doublereal dlamch_(char *), dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); integer mn; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dgeqpf_(integer *, integer *, doublereal *, integer *, 
integer *, doublereal *, doublereal *, integer *), dlaset_(char *, integer *, integer *, doublereal *, - doublereal *, doublereal *, integer *), xerbla_(char *, - integer *); + doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal bignum; - extern /* Subroutine */ int dlatzm_(char *, integer *, integer *, + extern /* Subroutine */ void dlatzm_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *); doublereal sminpr, smaxpr, smlnum; - extern /* Subroutine */ int dtzrqf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dtzrqf_(integer *, integer *, doublereal *, integer *, doublereal *, integer *); @@ -781,8 +781,8 @@ f"> */ if (*info != 0) { i__1 = -(*info); - xerbla_("DGELSX", &i__1); - return 0; + xerbla_("DGELSX", &i__1, 6); + return; } /* Quick return if possible */ @@ -791,7 +791,7 @@ f"> */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*nrhs) == 0) { *rank = 0; - return 0; + return; } /* Get machine parameters */ @@ -999,7 +999,7 @@ f"> */ L100: - return 0; + return; /* End of DGELSX */ diff --git a/lapack-netlib/SRC/DEPRECATED/dgeqpf.c b/lapack-netlib/SRC/DEPRECATED/dgeqpf.c index 550b61f560..94a4315235 100644 --- a/lapack-netlib/SRC/DEPRECATED/dgeqpf.c +++ b/lapack-netlib/SRC/DEPRECATED/dgeqpf.c @@ -655,7 +655,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgeqpf_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgeqpf_(integer *m, integer *n, doublereal *a, integer * lda, integer *jpvt, doublereal *tau, doublereal *work, integer *info) { /* System generated locals */ @@ -668,11 +668,11 @@ f"> */ doublereal temp2; integer i__, j; doublereal tol3z; - extern /* Subroutine */ int dlarf_(char *, integer *, integer *, + extern /* Subroutine */ void dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, 
doublereal *, integer *, doublereal *); integer itemp; - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *), dgeqr2_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dorm2r_(char *, char *, integer *, integer *, integer *, @@ -681,10 +681,10 @@ f"> */ integer ma; extern doublereal dlamch_(char *); integer mn; - extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlarfg_(integer *, doublereal *, doublereal *, integer *, doublereal *); extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal aii; integer pvt; @@ -719,8 +719,8 @@ f"> */ } if (*info != 0) { i__1 = -(*info); - xerbla_("DGEQPF", &i__1); - return 0; + xerbla_("DGEQPF", &i__1, 6); + return; } mn = f2cmin(*m,*n); @@ -854,7 +854,7 @@ f"> */ /* L40: */ } } - return 0; + return; /* End of DGEQPF */ diff --git a/lapack-netlib/SRC/DEPRECATED/dggsvd.c b/lapack-netlib/SRC/DEPRECATED/dggsvd.c index 8a31fd2468..71a4010d48 100644 --- a/lapack-netlib/SRC/DEPRECATED/dggsvd.c +++ b/lapack-netlib/SRC/DEPRECATED/dggsvd.c @@ -845,7 +845,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dggsvd_(char *jobu, char *jobv, char *jobq, integer *m, +/* Subroutine */ void dggsvd_(char *jobu, char *jobv, char *jobq, integer *m, integer *n, integer *p, integer *k, integer *l, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *alpha, doublereal *beta, doublereal *u, integer *ldu, doublereal *v, integer @@ -864,17 +864,19 @@ f"> */ integer ncallmycycle, i__, j; extern logical lsame_(char *, char *); doublereal anorm, bnorm; - extern /* Subroutine */ int dcopy_(integer *, 
doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); logical wantq, wantu, wantv; extern doublereal dlamch_(char *), dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dtgsja_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtgsja_(char *, char *, char *, integer *, integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, - integer *), xerbla_(char *, integer *), dggsvp_(char *, char *, char *, integer *, integer *, + integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dggsvp_(char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, @@ -945,8 +947,8 @@ f"> */ } if (*info != 0) { i__1 = -(*info); - xerbla_("DGGSVD", &i__1); - return 0; + xerbla_("DGGSVD", &i__1, 6); + return; } /* Compute the Frobenius norm of matrices A and B */ @@ -1007,7 +1009,7 @@ f"> */ /* L20: */ } - return 0; + return; /* End of DGGSVD */ diff --git a/lapack-netlib/SRC/DEPRECATED/dggsvp.c b/lapack-netlib/SRC/DEPRECATED/dggsvp.c index a151f88407..593d7e4921 100644 --- a/lapack-netlib/SRC/DEPRECATED/dggsvp.c +++ b/lapack-netlib/SRC/DEPRECATED/dggsvp.c @@ -768,7 +768,7 @@ f"> */ /* > a matrix. It may be replaced by a better rank determination strategy. 
*/ /* > */ /* ===================================================================== */ -/* Subroutine */ int dggsvp_(char *jobu, char *jobv, char *jobq, integer *m, +/* Subroutine */ void dggsvp_(char *jobu, char *jobv, char *jobq, integer *m, integer *p, integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *tola, doublereal *tolb, integer *k, integer *l, doublereal *u, integer *ldu, doublereal *v, integer *ldv, @@ -784,7 +784,7 @@ f"> */ integer i__, j; extern logical lsame_(char *, char *); logical wantq, wantu, wantv; - extern /* Subroutine */ int dgeqr2_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeqr2_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dgerq2_( integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dorg2r_(integer *, integer *, integer *, @@ -797,7 +797,9 @@ f"> */ integer *, integer *, doublereal *, doublereal *, integer *), dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *), dlaset_(char *, integer *, - integer *, doublereal *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *), dlapmt_(logical *, + integer *, doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dlapmt_(logical *, integer *, integer *, doublereal *, integer *, integer *); logical forwrd; @@ -865,8 +867,8 @@ f"> */ } if (*info != 0) { i__1 = -(*info); - xerbla_("DGGSVP", &i__1); - return 0; + xerbla_("DGGSVP", &i__1, 6); + return; } /* QR with column pivoting of B: B*P = V*( S11 S12 ) */ @@ -1115,7 +1117,7 @@ f"> */ } - return 0; + return; /* End of DGGSVP */ diff --git a/lapack-netlib/SRC/DEPRECATED/dlahrd.c b/lapack-netlib/SRC/DEPRECATED/dlahrd.c index 7d0800bc33..8b74436c4c 100644 --- a/lapack-netlib/SRC/DEPRECATED/dlahrd.c +++ b/lapack-netlib/SRC/DEPRECATED/dlahrd.c @@ -685,7 +685,7 @@ f"> */ /* > \endverbatim */ /* > */ /* 
===================================================================== */ -/* Subroutine */ int dlahrd_(integer *n, integer *k, integer *nb, doublereal * +/* Subroutine */ void dlahrd_(integer *n, integer *k, integer *nb, doublereal * a, integer *lda, doublereal *tau, doublereal *t, integer *ldt, doublereal *y, integer *ldy) { @@ -696,7 +696,7 @@ f"> */ /* Local variables */ integer i__; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dcopy_(integer *, doublereal *, @@ -705,7 +705,7 @@ f"> */ *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *); doublereal ei; - extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlarfg_(integer *, doublereal *, doublereal *, integer *, doublereal *); @@ -734,7 +734,7 @@ f"> */ /* Function Body */ if (*n <= 1) { - return 0; + return; } i__1 = *nb; @@ -843,7 +843,7 @@ f"> */ } a[*k + *nb + *nb * a_dim1] = ei; - return 0; + return; /* End of DLAHRD */ diff --git a/lapack-netlib/SRC/DEPRECATED/dlatzm.c b/lapack-netlib/SRC/DEPRECATED/dlatzm.c index 3f0ab9fb19..39a00abf57 100644 --- a/lapack-netlib/SRC/DEPRECATED/dlatzm.c +++ b/lapack-netlib/SRC/DEPRECATED/dlatzm.c @@ -665,7 +665,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dlatzm_(char *side, integer *m, integer *n, doublereal * +/* Subroutine */ void dlatzm_(char *side, integer *m, integer *n, doublereal * v, integer *incv, doublereal *tau, doublereal *c1, doublereal *c2, integer *ldc, doublereal *work) { @@ -674,11 +674,11 @@ f"> */ doublereal d__1; /* Local variables */ - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + 
extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), daxpy_(integer @@ -707,7 +707,7 @@ f"> */ /* Function Body */ if (f2cmin(*m,*n) == 0 || *tau == 0.) { - return 0; + return; } if (lsame_(side, "L")) { @@ -748,7 +748,7 @@ f"> */ ldc); } - return 0; + return; /* End of DLATZM */ diff --git a/lapack-netlib/SRC/DEPRECATED/dtzrqf.c b/lapack-netlib/SRC/DEPRECATED/dtzrqf.c index 69ee27578e..a5cf3dcfe9 100644 --- a/lapack-netlib/SRC/DEPRECATED/dtzrqf.c +++ b/lapack-netlib/SRC/DEPRECATED/dtzrqf.c @@ -652,7 +652,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtzrqf_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dtzrqf_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *tau, integer *info) { /* System generated locals */ @@ -660,19 +660,20 @@ f"> */ doublereal d__1; /* Local variables */ - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer i__, k; - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *) ; 
integer m1; - extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *, - integer *, doublereal *), xerbla_(char *, integer *); + extern /* Subroutine */ void dlarfg_(integer *, doublereal *, doublereal *, + integer *, doublereal *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -703,14 +704,14 @@ f"> */ } if (*info != 0) { i__1 = -(*info); - xerbla_("DTZRQF", &i__1); - return 0; + xerbla_("DTZRQF", &i__1, 6); + return; } /* Perform the factorization. */ if (*m == 0) { - return 0; + return; } if (*m == *n) { i__1 = *n; @@ -768,7 +769,7 @@ f"> */ } } - return 0; + return; /* End of DTZRQF */ diff --git a/lapack-netlib/SRC/DEPRECATED/sgegs.c b/lapack-netlib/SRC/DEPRECATED/sgegs.c index e238edab30..48cce1ede5 100644 --- a/lapack-netlib/SRC/DEPRECATED/sgegs.c +++ b/lapack-netlib/SRC/DEPRECATED/sgegs.c @@ -742,7 +742,7 @@ ices */ /* > \ingroup realGEeigen */ /* ===================================================================== */ -/* Subroutine */ int sgegs_(char *jobvsl, char *jobvsr, integer *n, real *a, +/* Subroutine */ void sgegs_(char *jobvsl, char *jobvsr, integer *n, real *a, integer *lda, real *b, integer *ldb, real *alphar, real *alphai, real *beta, real *vsl, integer *ldvsl, real *vsr, integer *ldvsr, real * work, integer *lwork, integer *info) @@ -760,7 +760,7 @@ ices */ integer iwork; logical ilvsr; integer irows, nb; - extern /* Subroutine */ int sggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sggbak_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, integer * ), sggbal_(char *, integer *, real *, integer *, real *, integer *, integer *, integer *, real *, real *, real *, @@ -769,35 +769,35 @@ ices */ extern real slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); real safmin; - extern /* Subroutine */ int sgghrd_(char *, char *, integer *, integer *, + extern /* 
Subroutine */ void sgghrd_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer * - , real *, integer *, integer *), xerbla_(char *, - integer *); + , real *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); integer ijobvl, iright; - extern /* Subroutine */ int sgeqrf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgeqrf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *); integer ijobvr; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *); real anrmto; integer lwkmin, nb1, nb2, nb3; real bnrmto; - extern /* Subroutine */ int shgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void shgeqz_(char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real * , real *, real *, real *, integer *, real *, integer *, real *, integer *, integer *); real smlnum; - extern /* Subroutine */ int sorgqr_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorgqr_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); integer ihi, ilo; @@ -899,16 +899,16 
@@ ices */ if (*info != 0) { i__1 = -(*info); - xerbla_("SGEGS ", &i__1); - return 0; + xerbla_("SGEGS ", &i__1, 6); + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -935,7 +935,7 @@ ices */ iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } } @@ -956,7 +956,7 @@ ices */ iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } } @@ -1093,19 +1093,19 @@ ices */ iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } slascl_("G", &c_n1, &c_n1, &anrmto, &anrm, n, &c__1, &alphar[1], n, & iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } slascl_("G", &c_n1, &c_n1, &anrmto, &anrm, n, &c__1, &alphai[1], n, & iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } } @@ -1114,20 +1114,20 @@ ices */ iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } slascl_("G", &c_n1, &c_n1, &bnrmto, &bnrm, n, &c__1, &beta[1], n, & iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } } L10: work[1] = (real) lwkopt; - return 0; + return; /* End of SGEGS */ diff --git a/lapack-netlib/SRC/DEPRECATED/sgegv.c b/lapack-netlib/SRC/DEPRECATED/sgegv.c index 8ed81b4152..eb129ab65c 100644 --- a/lapack-netlib/SRC/DEPRECATED/sgegv.c +++ b/lapack-netlib/SRC/DEPRECATED/sgegv.c @@ -822,7 +822,7 @@ rices */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgegv_(char *jobvl, char *jobvr, integer *n, real *a, +/* Subroutine */ void sgegv_(char *jobvl, char *jobvr, integer *n, real *a, integer *lda, real *b, integer *ldb, real *alphar, real *alphai, real *beta, real *vl, integer *ldvl, real *vr, integer *ldvr, real *work, integer *lwork, integer *info) @@ -842,7 +842,7 @@ rices */ extern logical lsame_(char *, char *); integer ileft, iinfo, icols, iwork, irows, jc, nb, in, jr; real salfai; - extern /* Subroutine */ int sggbak_(char *, char 
*, integer *, integer *, + extern /* Subroutine */ void sggbak_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, integer * ), sggbal_(char *, integer *, real *, integer *, real *, integer *, integer *, integer *, real *, real *, real *, @@ -851,22 +851,23 @@ rices */ extern real slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); real safmin; - extern /* Subroutine */ int sgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgghrd_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer * , real *, integer *, integer *); real safmax; char chtemp[1]; logical ldumma[1]; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, - real *, integer *, integer *, real *, integer *, integer *), xerbla_(char *, integer *); + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, real *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ijobvl, iright; logical ilimit; - extern /* Subroutine */ int sgeqrf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgeqrf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *); integer ijobvr; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *), stgevc_( char *, char *, logical *, integer *, real *, integer *, real *, @@ -874,7 +875,7 @@ rices */ integer *, real *, integer *); real onepls; integer lwkmin, nb1, nb2, nb3; - extern /* Subroutine */ int shgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void shgeqz_(char *, 
char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real * , real *, real *, real *, integer *, real *, integer *, real *, integer *, integer *), sorgqr_(integer *, @@ -882,7 +883,7 @@ rices */ , integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); integer ihi, ilo; @@ -988,16 +989,16 @@ rices */ if (*info != 0) { i__1 = -(*info); - xerbla_("SGEGV ", &i__1); - return 0; + xerbla_("SGEGV ", &i__1, 6); + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1025,7 +1026,7 @@ rices */ iinfo); if (iinfo != 0) { *info = *n + 10; - return 0; + return; } } @@ -1046,7 +1047,7 @@ rices */ iinfo); if (iinfo != 0) { *info = *n + 10; - return 0; + return; } } @@ -1417,7 +1418,7 @@ rices */ L120: work[1] = (real) lwkopt; - return 0; + return; /* End of SGEGV */ diff --git a/lapack-netlib/SRC/DEPRECATED/sgelsx.c b/lapack-netlib/SRC/DEPRECATED/sgelsx.c index 78846b4758..a887910c9a 100644 --- a/lapack-netlib/SRC/DEPRECATED/sgelsx.c +++ b/lapack-netlib/SRC/DEPRECATED/sgelsx.c @@ -694,7 +694,7 @@ f"> */ /* > \ingroup realGEsolve */ /* ===================================================================== */ -/* Subroutine */ int sgelsx_(integer *m, integer *n, integer *nrhs, real *a, +/* Subroutine */ void sgelsx_(integer *m, integer *n, integer *nrhs, real *a, integer *lda, real *b, integer *ldb, integer *jpvt, real *rcond, integer *rank, real *work, integer *info) { @@ -706,7 +706,7 @@ f"> */ real anrm, bnrm, smin, smax; integer i__, j, k, iascl, ibscl, ismin, ismax; real c1, c2, s1, s2, t1, t2; - extern /* Subroutine */ int strsm_(char *, char *, char *, char *, + extern /* Subroutine */ void strsm_(char *, char *, char 
*, char *, integer *, integer *, real *, real *, integer *, real *, integer * ), slaic1_(integer *, integer *, real *, real *, real *, real *, real *, real *, real *), sorm2r_( @@ -716,14 +716,14 @@ f"> */ integer mn; extern real slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *), sgeqpf_(integer *, integer *, real *, integer *, integer *, real *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *); real sminpr, smaxpr, smlnum; - extern /* Subroutine */ int slatzm_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slatzm_(char *, integer *, integer *, real *, integer *, real *, real *, real *, integer *, real *), stzrqf_(integer *, integer *, real *, integer *, real *, integer * ); @@ -774,8 +774,8 @@ f"> */ if (*info != 0) { i__1 = -(*info); - xerbla_("SGELSX", &i__1); - return 0; + xerbla_("SGELSX", &i__1, 6); + return; } /* Quick return if possible */ @@ -784,7 +784,7 @@ f"> */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*nrhs) == 0) { *rank = 0; - return 0; + return; } /* Get machine parameters */ @@ -992,7 +992,7 @@ f"> */ L100: - return 0; + return; /* End of SGELSX */ diff --git a/lapack-netlib/SRC/DEPRECATED/sgeqpf.c b/lapack-netlib/SRC/DEPRECATED/sgeqpf.c index 5831a03955..ac672f8047 100644 --- a/lapack-netlib/SRC/DEPRECATED/sgeqpf.c +++ b/lapack-netlib/SRC/DEPRECATED/sgeqpf.c @@ -655,7 +655,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgeqpf_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgeqpf_(integer *m, 
integer *n, real *a, integer *lda, integer *jpvt, real *tau, real *work, integer *info) { /* System generated locals */ @@ -667,19 +667,20 @@ f"> */ extern real snrm2_(integer *, real *, integer *); integer i__, j; real tol3z; - extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slarf_(char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *); integer itemp; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *), sgeqr2_(integer *, integer *, real *, integer *, real *, real *, integer *); integer ma; - extern /* Subroutine */ int sorm2r_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sorm2r_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *); integer mn; extern real slamch_(char *); - extern /* Subroutine */ int xerbla_(char *, integer *), slarfg_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slarfg_( integer *, real *, real *, integer *, real *); extern integer isamax_(integer *, real *, integer *); real aii; @@ -716,8 +717,8 @@ f"> */ } if (*info != 0) { i__1 = -(*info); - xerbla_("SGEQPF", &i__1); - return 0; + xerbla_("SGEQPF", &i__1, 6); + return; } mn = f2cmin(*m,*n); @@ -851,7 +852,7 @@ f"> */ /* L40: */ } } - return 0; + return; /* End of SGEQPF */ diff --git a/lapack-netlib/SRC/DEPRECATED/sggsvd.c b/lapack-netlib/SRC/DEPRECATED/sggsvd.c index 1281cca904..0af712876b 100644 --- a/lapack-netlib/SRC/DEPRECATED/sggsvd.c +++ b/lapack-netlib/SRC/DEPRECATED/sggsvd.c @@ -845,7 +845,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sggsvd_(char *jobu, char *jobv, char *jobq, integer *m, +/* Subroutine */ void sggsvd_(char *jobu, char *jobv, char *jobq, integer *m, 
integer *n, integer *p, integer *k, integer *l, real *a, integer *lda, real *b, integer *ldb, real *alpha, real *beta, real *u, integer * ldu, real *v, integer *ldv, real *q, integer *ldq, real *work, @@ -864,12 +864,13 @@ f"> */ extern logical lsame_(char *, char *); real anorm, bnorm; logical wantq; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); logical wantu, wantv; extern real slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); - extern /* Subroutine */ int xerbla_(char *, integer *), stgsja_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void stgsja_( char *, char *, char *, integer *, integer *, integer *, integer * , integer *, real *, integer *, real *, integer *, real *, real *, real *, real *, real *, integer *, real *, integer *, real *, @@ -944,8 +945,8 @@ f"> */ } if (*info != 0) { i__1 = -(*info); - xerbla_("SGGSVD", &i__1); - return 0; + xerbla_("SGGSVD", &i__1, 6); + return; } /* Compute the Frobenius norm of matrices A and B */ @@ -1006,7 +1007,7 @@ f"> */ /* L20: */ } - return 0; + return; /* End of SGGSVD */ diff --git a/lapack-netlib/SRC/DEPRECATED/sggsvp.c b/lapack-netlib/SRC/DEPRECATED/sggsvp.c index e6c51aa118..bf69f42391 100644 --- a/lapack-netlib/SRC/DEPRECATED/sggsvp.c +++ b/lapack-netlib/SRC/DEPRECATED/sggsvp.c @@ -768,7 +768,7 @@ f"> */ /* > a matrix. It may be replaced by a better rank determination strategy. 
*/ /* > */ /* ===================================================================== */ -/* Subroutine */ int sggsvp_(char *jobu, char *jobv, char *jobq, integer *m, +/* Subroutine */ void sggsvp_(char *jobu, char *jobv, char *jobq, integer *m, integer *p, integer *n, real *a, integer *lda, real *b, integer *ldb, real *tola, real *tolb, integer *k, integer *l, real *u, integer *ldu, real *v, integer *ldv, real *q, integer *ldq, integer *iwork, real * @@ -783,13 +783,15 @@ f"> */ integer i__, j; extern logical lsame_(char *, char *); logical wantq, wantu, wantv; - extern /* Subroutine */ int sgeqr2_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgeqr2_(integer *, integer *, real *, integer *, real *, real *, integer *), sgerq2_(integer *, integer *, real *, integer *, real *, real *, integer *), sorg2r_(integer *, integer *, integer *, real *, integer *, real *, real *, integer * ), sorm2r_(char *, char *, integer *, integer *, integer *, real * , integer *, real *, real *, integer *, real *, integer *), sormr2_(char *, char *, integer *, integer *, integer *, - real *, integer *, real *, real *, integer *, real *, integer *), xerbla_(char *, integer *), sgeqpf_( + real *, integer *, real *, real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void sgeqpf_( integer *, integer *, real *, integer *, integer *, real *, real * , integer *), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slaset_(char *, integer *, @@ -861,8 +863,8 @@ f"> */ } if (*info != 0) { i__1 = -(*info); - xerbla_("SGGSVP", &i__1); - return 0; + xerbla_("SGGSVP", &i__1, 6); + return; } /* QR with column pivoting of B: B*P = V*( S11 S12 ) */ @@ -1111,7 +1113,7 @@ f"> */ } - return 0; + return; /* End of SGGSVP */ diff --git a/lapack-netlib/SRC/DEPRECATED/slahrd.c b/lapack-netlib/SRC/DEPRECATED/slahrd.c index 2d60c82683..0cf290be2d 100644 --- a/lapack-netlib/SRC/DEPRECATED/slahrd.c +++ 
b/lapack-netlib/SRC/DEPRECATED/slahrd.c @@ -685,7 +685,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slahrd_(integer *n, integer *k, integer *nb, real *a, +/* Subroutine */ void slahrd_(integer *n, integer *k, integer *nb, real *a, integer *lda, real *tau, real *t, integer *ldt, real *y, integer *ldy) { /* System generated locals */ @@ -695,14 +695,14 @@ f"> */ /* Local variables */ integer i__; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), scopy_( integer *, real *, integer *, real *, integer *), saxpy_(integer * , real *, real *, integer *, real *, integer *), strmv_(char *, char *, char *, integer *, real *, integer *, real *, integer *); real ei; - extern /* Subroutine */ int slarfg_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slarfg_(integer *, real *, real *, integer *, real *); @@ -731,7 +731,7 @@ f"> */ /* Function Body */ if (*n <= 1) { - return 0; + return; } i__1 = *nb; @@ -840,7 +840,7 @@ f"> */ } a[*k + *nb + *nb * a_dim1] = ei; - return 0; + return; /* End of SLAHRD */ diff --git a/lapack-netlib/SRC/DEPRECATED/slatzm.c b/lapack-netlib/SRC/DEPRECATED/slatzm.c index 9ae546bfa1..1826db1a4f 100644 --- a/lapack-netlib/SRC/DEPRECATED/slatzm.c +++ b/lapack-netlib/SRC/DEPRECATED/slatzm.c @@ -665,7 +665,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int slatzm_(char *side, integer *m, integer *n, real *v, +/* Subroutine */ void slatzm_(char *side, integer *m, integer *n, real *v, integer *incv, real *tau, real *c1, real *c2, integer *ldc, real * work) { @@ -674,10 +674,10 @@ f"> */ real r__1; /* Local variables */ - extern /* 
Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *); @@ -703,7 +703,7 @@ f"> */ /* Function Body */ if (f2cmin(*m,*n) == 0 || *tau == 0.f) { - return 0; + return; } if (lsame_(side, "L")) { @@ -744,7 +744,7 @@ f"> */ ldc); } - return 0; + return; /* End of SLATZM */ diff --git a/lapack-netlib/SRC/DEPRECATED/stzrqf.c b/lapack-netlib/SRC/DEPRECATED/stzrqf.c index 047d42da6d..16cf7e33aa 100644 --- a/lapack-netlib/SRC/DEPRECATED/stzrqf.c +++ b/lapack-netlib/SRC/DEPRECATED/stzrqf.c @@ -652,7 +652,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int stzrqf_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void stzrqf_(integer *m, integer *n, real *a, integer *lda, real *tau, integer *info) { /* System generated locals */ @@ -660,14 +660,16 @@ f"> */ real r__1; /* Local variables */ - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); integer i__, k; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *); integer m1; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, - real *, integer *), 
xerbla_(char *, integer *), slarfg_( + extern /* Subroutine */ void saxpy_(integer *, real *, real *, integer *, + real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void slarfg_( integer *, real *, real *, integer *, real *); @@ -699,14 +701,14 @@ f"> */ } if (*info != 0) { i__1 = -(*info); - xerbla_("STZRQF", &i__1); - return 0; + xerbla_("STZRQF", &i__1, 6); + return; } /* Perform the factorization. */ if (*m == 0) { - return 0; + return; } if (*m == *n) { i__1 = *n; @@ -764,7 +766,7 @@ f"> */ } } - return 0; + return; /* End of STZRQF */ diff --git a/lapack-netlib/SRC/DEPRECATED/zgegs.c b/lapack-netlib/SRC/DEPRECATED/zgegs.c index 4c8a59d3c1..7dab0efe99 100644 --- a/lapack-netlib/SRC/DEPRECATED/zgegs.c +++ b/lapack-netlib/SRC/DEPRECATED/zgegs.c @@ -740,7 +740,7 @@ rices */ /* > \ingroup complex16GEeigen */ /* ===================================================================== */ -/* Subroutine */ int zgegs_(char *jobvsl, char *jobvsr, integer *n, +/* Subroutine */ void zgegs_(char *jobvsl, char *jobvsr, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *alpha, doublecomplex *beta, doublecomplex *vsl, integer *ldvsl, doublecomplex *vsr, integer *ldvsr, doublecomplex * @@ -760,34 +760,34 @@ rices */ logical ilvsr; integer irows, nb; extern doublereal dlamch_(char *); - extern /* Subroutine */ int zggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zggbak_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublecomplex *, integer *, integer *), zggbal_(char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer * , integer *, doublereal *, doublereal *, doublereal *, integer *); logical ilascl, ilbscl; doublereal safmin; - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer 
*, integer *, integer *, ftnlen, ftnlen); extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); doublereal bignum; integer ijobvl, iright; - extern /* Subroutine */ int zgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgghrd_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer * ), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *); integer ijobvr; - extern /* Subroutine */ int zgeqrf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeqrf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ); doublereal anrmto; integer lwkmin, nb1, nb2, nb3; doublereal bnrmto; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zhgeqz_(char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, @@ -799,7 +799,7 @@ rices */ doublereal smlnum; integer irwork, lwkopt; logical lquery; - extern /* Subroutine */ int zungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void zungqr_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, @@ -903,16 +903,16 @@ rices */ if (*info != 0) { i__1 = -(*info); - xerbla_("ZGEGS ", &i__1); - return 0; + xerbla_("ZGEGS ", &i__1, 6); + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -939,7 +939,7 @@ rices */ iinfo); if (iinfo != 0) { *info = *n 
+ 9; - return 0; + return; } } @@ -960,7 +960,7 @@ rices */ iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } } @@ -1097,13 +1097,13 @@ rices */ iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } zlascl_("G", &c_n1, &c_n1, &anrmto, &anrm, n, &c__1, &alpha[1], n, & iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } } @@ -1112,20 +1112,20 @@ rices */ iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } zlascl_("G", &c_n1, &c_n1, &bnrmto, &bnrm, n, &c__1, &beta[1], n, & iinfo); if (iinfo != 0) { *info = *n + 9; - return 0; + return; } } L10: work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZGEGS */ diff --git a/lapack-netlib/SRC/DEPRECATED/zgegv.c b/lapack-netlib/SRC/DEPRECATED/zgegv.c index e4238f51a6..991a6ff117 100644 --- a/lapack-netlib/SRC/DEPRECATED/zgegv.c +++ b/lapack-netlib/SRC/DEPRECATED/zgegv.c @@ -799,7 +799,7 @@ rices */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgegv_(char *jobvl, char *jobvr, integer *n, +/* Subroutine */ void zgegv_(char *jobvl, char *jobvr, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *alpha, doublecomplex *beta, doublecomplex *vl, integer *ldvl, doublecomplex *vr, integer *ldvr, doublecomplex *work, integer @@ -823,13 +823,13 @@ rices */ extern doublereal dlamch_(char *); integer jr; doublereal salfai; - extern /* Subroutine */ int zggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zggbak_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublecomplex *, integer *, integer *), zggbal_(char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer * , integer *, doublereal *, doublereal *, doublereal *, integer *); doublereal salfar, safmin; - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int 
xerbla_(char *, integer *, ftnlen); doublereal safmax; char chtemp[1]; logical ldumma[1]; @@ -839,18 +839,18 @@ rices */ integer *, doublereal *); integer ijobvl, iright; logical ilimit; - extern /* Subroutine */ int zgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgghrd_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer * ), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *); integer ijobvr; - extern /* Subroutine */ int zgeqrf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeqrf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ); integer lwkmin, nb1, nb2, nb3; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *), ztgevc_( @@ -864,7 +864,7 @@ rices */ integer *, doublecomplex *, integer *, doublereal *, integer *); integer irwork, lwkopt; logical lquery; - extern /* Subroutine */ int zungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void zungqr_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, @@ -972,16 +972,16 @@ rices */ if (*info != 0) { i__1 = -(*info); - xerbla_("ZGEGV ", &i__1); - return 0; + xerbla_("ZGEGV ", &i__1, 6); + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1008,7 +1008,7 @@ rices */ iinfo); if (iinfo != 0) { *info = 
*n + 10; - return 0; + return; } } @@ -1029,7 +1029,7 @@ rices */ iinfo); if (iinfo != 0) { *info = *n + 10; - return 0; + return; } } @@ -1356,7 +1356,7 @@ rices */ L80: work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZGEGV */ diff --git a/lapack-netlib/SRC/DEPRECATED/zgelsx.c b/lapack-netlib/SRC/DEPRECATED/zgelsx.c index 6a4cdbfe92..f1f39eb9c3 100644 --- a/lapack-netlib/SRC/DEPRECATED/zgelsx.c +++ b/lapack-netlib/SRC/DEPRECATED/zgelsx.c @@ -700,7 +700,7 @@ f"> */ /* > \ingroup complex16GEsolve */ /* ===================================================================== */ -/* Subroutine */ int zgelsx_(integer *m, integer *n, integer *nrhs, +/* Subroutine */ void zgelsx_(integer *m, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, integer *jpvt, doublereal *rcond, integer *rank, doublecomplex *work, doublereal *rwork, integer *info) @@ -713,7 +713,7 @@ f"> */ doublereal anrm, bnrm, smin, smax; integer i__, j, k, iascl, ibscl, ismin, ismax; doublecomplex c1, c2, s1, s2, t1, t2; - extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), zlaic1_(integer *, integer *, doublecomplex *, doublereal *, @@ -721,13 +721,14 @@ f"> */ doublecomplex *), dlabad_(doublereal *, doublereal *); extern doublereal dlamch_(char *); integer mn; - extern /* Subroutine */ int zunm2r_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunm2r_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *), xerbla_(char *, integer *); + doublecomplex *, integer *, doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); 
doublereal bignum; - extern /* Subroutine */ int zlascl_(char *, integer *, integer *, + extern /* Subroutine */ void zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *), zgeqpf_(integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, @@ -735,7 +736,7 @@ f"> */ integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); doublereal sminpr, smaxpr, smlnum; - extern /* Subroutine */ int zlatzm_(char *, integer *, integer *, + extern /* Subroutine */ void zlatzm_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, doublecomplex *), ztzrqf_( integer *, integer *, doublecomplex *, integer *, doublecomplex *, @@ -788,8 +789,8 @@ f"> */ if (*info != 0) { i__1 = -(*info); - xerbla_("ZGELSX", &i__1); - return 0; + xerbla_("ZGELSX", &i__1, 6); + return; } /* Quick return if possible */ @@ -798,7 +799,7 @@ f"> */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*nrhs) == 0) { *rank = 0; - return 0; + return; } /* Get machine parameters */ @@ -1030,7 +1031,7 @@ f"> */ L100: - return 0; + return; /* End of ZGELSX */ diff --git a/lapack-netlib/SRC/DEPRECATED/zgeqpf.c b/lapack-netlib/SRC/DEPRECATED/zgeqpf.c index 72cbafff7d..12ecc99400 100644 --- a/lapack-netlib/SRC/DEPRECATED/zgeqpf.c +++ b/lapack-netlib/SRC/DEPRECATED/zgeqpf.c @@ -661,7 +661,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgeqpf_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgeqpf_(integer *m, integer *n, doublecomplex *a, integer *lda, integer *jpvt, doublecomplex *tau, doublecomplex *work, doublereal *rwork, integer *info) { @@ -675,7 +675,7 @@ f"> */ integer i__, j; doublereal tol3z; integer itemp; - extern /* Subroutine */ int zlarf_(char *, integer *, integer *, + extern /* Subroutine */ void zlarf_(char *, integer *, 
integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zgeqr2_( @@ -685,11 +685,12 @@ f"> */ integer ma; extern doublereal dlamch_(char *); integer mn; - extern /* Subroutine */ int zunm2r_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunm2r_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int xerbla_(char *, integer *), zlarfg_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlarfg_( integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); doublecomplex aii; @@ -727,8 +728,8 @@ f"> */ } if (*info != 0) { i__1 = -(*info); - xerbla_("ZGEQPF", &i__1); - return 0; + xerbla_("ZGEQPF", &i__1, 6); + return; } mn = f2cmin(*m,*n); @@ -867,7 +868,7 @@ f"> */ /* L40: */ } } - return 0; + return; /* End of ZGEQPF */ diff --git a/lapack-netlib/SRC/DEPRECATED/zggsvd.c b/lapack-netlib/SRC/DEPRECATED/zggsvd.c index 3400a048bb..8f1c7e46c4 100644 --- a/lapack-netlib/SRC/DEPRECATED/zggsvd.c +++ b/lapack-netlib/SRC/DEPRECATED/zggsvd.c @@ -848,7 +848,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zggsvd_(char *jobu, char *jobv, char *jobq, integer *m, +/* Subroutine */ void zggsvd_(char *jobu, char *jobv, char *jobq, integer *m, integer *n, integer *p, integer *k, integer *l, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublereal *alpha, doublereal *beta, doublecomplex *u, integer *ldu, doublecomplex *v, @@ -867,14 +867,14 @@ f"> */ integer ncallmycycle, i__, j; extern logical lsame_(char *, char *); doublereal anorm, bnorm; - extern /* Subroutine */ int dcopy_(integer *, doublereal 
*, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); logical wantq, wantu, wantv; extern doublereal dlamch_(char *); - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); - extern /* Subroutine */ int ztgsja_(char *, char *, char *, integer *, + extern /* Subroutine */ void ztgsja_(char *, char *, char *, integer *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublecomplex *, integer *, @@ -953,8 +953,8 @@ f"> */ } if (*info != 0) { i__1 = -(*info); - xerbla_("ZGGSVD", &i__1); - return 0; + xerbla_("ZGGSVD", &i__1, 6); + return; } /* Compute the Frobenius norm of matrices A and B */ @@ -1014,7 +1014,7 @@ f"> */ /* L20: */ } - return 0; + return; /* End of ZGGSVD */ diff --git a/lapack-netlib/SRC/DEPRECATED/zggsvp.c b/lapack-netlib/SRC/DEPRECATED/zggsvp.c index 17e4601f3f..cbe9a9083e 100644 --- a/lapack-netlib/SRC/DEPRECATED/zggsvp.c +++ b/lapack-netlib/SRC/DEPRECATED/zggsvp.c @@ -777,7 +777,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zggsvp_(char *jobu, char *jobv, char *jobq, integer *m, +/* Subroutine */ void zggsvp_(char *jobu, char *jobv, char *jobq, integer *m, integer *p, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublereal *tola, doublereal *tolb, integer *k, integer *l, doublecomplex *u, integer *ldu, doublecomplex *v, integer @@ -793,7 +793,7 @@ f"> */ integer i__, j; extern logical lsame_(char *, char *); logical wantq, wantu, wantv; - extern /* Subroutine */ int zgeqr2_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeqr2_(integer *, integer *, doublecomplex *, integer 
*, doublecomplex *, doublecomplex *, integer *), zgerq2_( integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zung2r_(integer *, integer *, @@ -802,14 +802,15 @@ f"> */ integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), zunmr2_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *, integer *), xerbla_( - char *, integer *), zgeqpf_(integer *, integer *, + integer *, doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zgeqpf_(integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, doublecomplex *, doublereal *, integer *), zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); logical forwrd; - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *), zlapmt_(logical *, integer *, integer *, doublecomplex *, integer *, integer *); @@ -878,8 +879,8 @@ f"> */ } if (*info != 0) { i__1 = -(*info); - xerbla_("ZGGSVP", &i__1); - return 0; + xerbla_("ZGGSVP", &i__1, 6); + return; } /* QR with column pivoting of B: B*P = V*( S11 S12 ) */ @@ -1137,7 +1138,7 @@ f"> */ } - return 0; + return; /* End of ZGGSVP */ diff --git a/lapack-netlib/SRC/DEPRECATED/zlahrd.c b/lapack-netlib/SRC/DEPRECATED/zlahrd.c index 4a4e9860e7..3c86494cc6 100644 --- a/lapack-netlib/SRC/DEPRECATED/zlahrd.c +++ b/lapack-netlib/SRC/DEPRECATED/zlahrd.c @@ -684,7 +684,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlahrd_(integer *n, integer *k, integer *nb, +/* Subroutine */ void zlahrd_(integer *n, integer *k, integer *nb, doublecomplex *a, integer *lda, doublecomplex *tau, 
doublecomplex *t, integer *ldt, doublecomplex *y, integer *ldy) { @@ -695,7 +695,7 @@ f"> */ /* Local variables */ integer i__; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), @@ -705,7 +705,7 @@ f"> */ char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex ei; - extern /* Subroutine */ int zlarfg_(integer *, doublecomplex *, + extern /* Subroutine */ void zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *), zlacgv_(integer *, doublecomplex *, integer *); @@ -735,7 +735,7 @@ f"> */ /* Function Body */ if (*n <= 1) { - return 0; + return; } i__1 = *nb; @@ -859,7 +859,7 @@ f"> */ i__1 = *k + *nb + *nb * a_dim1; a[i__1].r = ei.r, a[i__1].i = ei.i; - return 0; + return; /* End of ZLAHRD */ diff --git a/lapack-netlib/SRC/DEPRECATED/zlatzm.c b/lapack-netlib/SRC/DEPRECATED/zlatzm.c index 2884323926..b3c31d9941 100644 --- a/lapack-netlib/SRC/DEPRECATED/zlatzm.c +++ b/lapack-netlib/SRC/DEPRECATED/zlatzm.c @@ -666,7 +666,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zlatzm_(char *side, integer *m, integer *n, +/* Subroutine */ void zlatzm_(char *side, integer *m, integer *n, doublecomplex *v, integer *incv, doublecomplex *tau, doublecomplex * c1, doublecomplex *c2, integer *ldc, doublecomplex *work) { @@ -676,7 +676,7 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgerc_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zgemv_(char *, 
integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, @@ -710,7 +710,7 @@ f"> */ /* Function Body */ if (f2cmin(*m,*n) == 0 || tau->r == 0. && tau->i == 0.) { - return 0; + return; } if (lsame_(side, "L")) { @@ -753,7 +753,7 @@ f"> */ ldc); } - return 0; + return; /* End of ZLATZM */ diff --git a/lapack-netlib/SRC/DEPRECATED/ztzrqf.c b/lapack-netlib/SRC/DEPRECATED/ztzrqf.c index 5d8e325bd0..222cd33c77 100644 --- a/lapack-netlib/SRC/DEPRECATED/ztzrqf.c +++ b/lapack-netlib/SRC/DEPRECATED/ztzrqf.c @@ -652,7 +652,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztzrqf_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void ztzrqf_(integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *tau, integer *info) { /* System generated locals */ @@ -662,16 +662,17 @@ f"> */ /* Local variables */ integer i__, k; doublecomplex alpha; - extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgerc_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer m1; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *), xerbla_( - char *, integer *), zlarfg_(integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *), zlacgv_(integer *, doublecomplex *, integer *); @@ -704,14 +705,14 @@ 
f"> */ } if (*info != 0) { i__1 = -(*info); - xerbla_("ZTZRQF", &i__1); - return 0; + xerbla_("ZTZRQF", &i__1, 6); + return; } /* Perform the factorization. */ if (*m == 0) { - return 0; + return; } if (*m == *n) { i__1 = *n; @@ -784,7 +785,7 @@ f"> */ } } - return 0; + return; /* End of ZTZRQF */ diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 03d15c23c3..8cac423305 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -94,7 +94,7 @@ SCLAUX = \ slagts.o slamrg.o slanst.o \ slapy2.o slapy3.o slarnv.o \ slarra.o slarrb.o slarrc.o slarrd.o slarre.o slarrf.o slarrj.o \ - slarrk.o slarrr.o slaneg.o \ + slarrk.o slarrr.o slaneg.o slarmm.o \ slartg.o slaruv.o slas2.o slascl.o \ slasd0.o slasd1.o slasd2.o slasd3.o slasd4.o slasd5.o slasd6.o \ slasd7.o slasd8.o slasda.o slasdq.o slasdt.o \ @@ -116,7 +116,7 @@ DZLAUX = \ dlagts.o dlamrg.o dlanst.o \ dlapy2.o dlapy3.o dlarnv.o \ dlarra.o dlarrb.o dlarrc.o dlarrd.o dlarre.o dlarrf.o dlarrj.o \ - dlarrk.o dlarrr.o dlaneg.o \ + dlarrk.o dlarrr.o dlaneg.o dlarmm.o \ dlartg.o dlaruv.o dlas2.o dlascl.o \ dlasd0.o dlasd1.o dlasd2.o dlasd3.o dlasd4.o dlasd5.o dlasd6.o \ dlasd7.o dlasd8.o dlasda.o dlasdq.o dlasdt.o \ @@ -207,7 +207,7 @@ SLASRC_O = \ ssytrd_2stage.o ssytrd_sy2sb.o ssytrd_sb2st.o ssb2st_kernels.o \ ssyevd_2stage.o ssyev_2stage.o ssyevx_2stage.o ssyevr_2stage.o \ ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o \ - sgesvdq.o + sgesvdq.o slatrs3.o strsyl3.o sgelst.o endif @@ -316,7 +316,7 @@ CLASRC_O = \ chetrd_2stage.o chetrd_he2hb.o chetrd_hb2st.o chb2st_kernels.o \ cheevd_2stage.o cheev_2stage.o cheevx_2stage.o cheevr_2stage.o \ chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o \ - cgesvdq.o + cgesvdq.o clatrs3.o ctrsyl3.o cgelst.o endif ifdef USEXBLAS @@ -417,7 +417,7 @@ DLASRC_O = \ dsytrd_2stage.o dsytrd_sy2sb.o dsytrd_sb2st.o dsb2st_kernels.o \ dsyevd_2stage.o dsyev_2stage.o dsyevx_2stage.o dsyevr_2stage.o \ dsbev_2stage.o dsbevx_2stage.o 
dsbevd_2stage.o dsygv_2stage.o \ - dgesvdq.o + dgesvdq.o dlatrs3.o dtrsyl3.o dgelst.o endif ifdef USEXBLAS @@ -526,7 +526,7 @@ ZLASRC_O = \ zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \ zheevd_2stage.o zheev_2stage.o zheevx_2stage.o zheevr_2stage.o \ zhbev_2stage.o zhbevx_2stage.o zhbevd_2stage.o zhegv_2stage.o \ - zgesvdq.o + zgesvdq.o zlatrs3.o ztrsyl3.o zgelst.o endif ifdef USEXBLAS @@ -572,22 +572,26 @@ ALL_AUX_OBJS = xerbla.o ../INSTALL/lsame.o SLAPACKOBJS = \ sgetrf.o sgetrs.o spotrf.o sgetf2.o \ spotf2.o slaswp.o sgesv.o slauu2.o \ - slauum.o strti2.o strtri.o strtrs.o + slauum.o strti2.o strtri.o strtrs.o \ + ssymv.o ssyr.o sspmv.o sspr.o DLAPACKOBJS = \ dgetrf.o dgetrs.o dpotrf.o dgetf2.o \ dpotf2.o dlaswp.o dgesv.o dlauu2.o \ - dlauum.o dtrti2.o dtrtri.o dtrtrs.o + dlauum.o dtrti2.o dtrtri.o dtrtrs.o \ + dsymv.o dsyr.o dspmv.o dspr.o CLAPACKOBJS = \ cgetrf.o cgetrs.o cpotrf.o cgetf2.o \ cpotf2.o claswp.o cgesv.o clauu2.o \ - clauum.o ctrti2.o ctrtri.o ctrtrs.o + clauum.o ctrti2.o ctrtri.o ctrtrs.o \ + csymv.o csyr.o cspmv.o cspr.o ZLAPACKOBJS = \ zgetrf.o zgetrs.o zpotrf.o zgetf2.o \ zpotf2.o zlaswp.o zgesv.o zlauu2.o \ - zlauum.o ztrti2.o ztrtri.o ztrtrs.o + zlauum.o ztrti2.o ztrtri.o ztrtrs.o \ + zsymv.o zsyr.o zspmv.o zspr.o ALLAUX = $(filter-out $(ALL_AUX_OBJS),$(ALLAUX_O)) SLASRC = $(filter-out $(SLAPACKOBJS),$(SLASRC_O)) diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f index 369ed19833..46eaf33b99 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f @@ -81,7 +81,8 @@ C> LWORK is INTEGER C> \endverbatim C> \verbatim -C> The dimension of the array WORK. The dimension can be divided into three parts. +C> The dimension of the array WORK. LWORK >= 1 if MIN(M,N) = 0, +C> otherwise the dimension can be divided into three parts. C> \endverbatim C> \verbatim C> 1) The part for the triangular factor T. 
If the very last T is not bigger @@ -212,7 +213,13 @@ SUBROUTINE CGEQRF ( M, N, A, LDA, TAU, WORK, LWORK, INFO ) LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) - IF ( NT.GT.NB ) THEN + IF( K.EQ.0 ) THEN + + LBWORK = 0 + LWKOPT = 1 + WORK( 1 ) = LWKOPT + + ELSE IF ( NT.GT.NB ) THEN LBWORK = K-NT * @@ -239,8 +246,9 @@ SUBROUTINE CGEQRF ( M, N, A, LDA, TAU, WORK, LWORK, INFO ) INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF ( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CGEQRF', -INFO ) @@ -252,7 +260,6 @@ SUBROUTINE CGEQRF ( M, N, A, LDA, TAU, WORK, LWORK, INFO ) * Quick return if possible * IF( K.EQ.0 ) THEN - WORK( 1 ) = 1 RETURN END IF * diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f index be5720f4f6..55cab8b238 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f @@ -81,7 +81,8 @@ C> LWORK is INTEGER C> \endverbatim C> \verbatim -C> The dimension of the array WORK. The dimension can be divided into three parts. +C> The dimension of the array WORK. LWORK >= 1 if MIN(M,N) = 0, +C> otherwise the dimension can be divided into three parts. C> \endverbatim C> \verbatim C> 1) The part for the triangular factor T. If the very last T is not bigger @@ -212,7 +213,13 @@ SUBROUTINE DGEQRF ( M, N, A, LDA, TAU, WORK, LWORK, INFO ) LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) - IF ( NT.GT.NB ) THEN + IF( K.EQ.0 ) THEN + + LBWORK = 0 + LWKOPT = 1 + WORK( 1 ) = LWKOPT + + ELSE IF ( NT.GT.NB ) THEN LBWORK = K-NT * @@ -239,8 +246,9 @@ SUBROUTINE DGEQRF ( M, N, A, LDA, TAU, WORK, LWORK, INFO ) INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. 
.NOT.LQUERY ) THEN - INFO = -7 + ELSE IF ( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGEQRF', -INFO ) @@ -252,7 +260,6 @@ SUBROUTINE DGEQRF ( M, N, A, LDA, TAU, WORK, LWORK, INFO ) * Quick return if possible * IF( K.EQ.0 ) THEN - WORK( 1 ) = 1 RETURN END IF * diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f index bff9732144..d2ad13ced1 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f @@ -81,7 +81,8 @@ C> LWORK is INTEGER C> \endverbatim C> \verbatim -C> The dimension of the array WORK. The dimension can be divided into three parts. +C> The dimension of the array WORK. LWORK >= 1 if MIN(M,N) = 0, +C> otherwise the dimension can be divided into three parts. C> \endverbatim C> \verbatim C> 1) The part for the triangular factor T. If the very last T is not bigger @@ -212,7 +213,13 @@ SUBROUTINE SGEQRF ( M, N, A, LDA, TAU, WORK, LWORK, INFO ) LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) - IF ( NT.GT.NB ) THEN + IF( K.EQ.0 ) THEN + + LBWORK = 0 + LWKOPT = 1 + WORK( 1 ) = LWKOPT + + ELSE IF ( NT.GT.NB ) THEN LBWORK = K-NT * @@ -239,8 +246,9 @@ SUBROUTINE SGEQRF ( M, N, A, LDA, TAU, WORK, LWORK, INFO ) INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF ( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. 
LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGEQRF', -INFO ) @@ -252,7 +260,6 @@ SUBROUTINE SGEQRF ( M, N, A, LDA, TAU, WORK, LWORK, INFO ) * Quick return if possible * IF( K.EQ.0 ) THEN - WORK( 1 ) = 1 RETURN END IF * diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f index 79e86b41bb..623b88a8a7 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f @@ -81,7 +81,8 @@ C> LWORK is INTEGER C> \endverbatim C> \verbatim -C> The dimension of the array WORK. The dimension can be divided into three parts. +C> The dimension of the array WORK. LWORK >= 1 if MIN(M,N) = 0, +C> otherwise the dimension can be divided into three parts. C> \endverbatim C> \verbatim C> 1) The part for the triangular factor T. If the very last T is not bigger @@ -212,7 +213,13 @@ SUBROUTINE ZGEQRF ( M, N, A, LDA, TAU, WORK, LWORK, INFO ) LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) - IF ( NT.GT.NB ) THEN + IF( K.EQ.0 ) THEN + + LBWORK = 0 + LWKOPT = 1 + WORK( 1 ) = LWKOPT + + ELSE IF ( NT.GT.NB ) THEN LBWORK = K-NT * @@ -239,8 +246,9 @@ SUBROUTINE ZGEQRF ( M, N, A, LDA, TAU, WORK, LWORK, INFO ) INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF ( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. 
LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZGEQRF', -INFO ) @@ -252,7 +260,6 @@ SUBROUTINE ZGEQRF ( M, N, A, LDA, TAU, WORK, LWORK, INFO ) * Quick return if possible * IF( K.EQ.0 ) THEN - WORK( 1 ) = 1 RETURN END IF * diff --git a/lapack-netlib/SRC/cbbcsd.c b/lapack-netlib/SRC/cbbcsd.c index d3cbea59ce..5754ab80b2 100644 --- a/lapack-netlib/SRC/cbbcsd.c +++ b/lapack-netlib/SRC/cbbcsd.c @@ -849,7 +849,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cbbcsd_(char *jobu1, char *jobu2, char *jobv1t, char * +/* Subroutine */ void cbbcsd_(char *jobu1, char *jobu2, char *jobv1t, char * jobv2t, char *trans, integer *m, integer *p, integer *q, real *theta, real *phi, complex *u1, integer *ldu1, complex *u2, integer *ldu2, complex *v1t, integer *ldv1t, complex *v2t, integer *ldv2t, real * @@ -869,14 +869,14 @@ f"> */ real thetamin, thetamax; logical restart11, restart12, restart21, restart22; integer iu1cs, iu2cs; - extern /* Subroutine */ int slas2_(real *, real *, real *, real *, real *) + extern /* Subroutine */ void slas2_(real *, real *, real *, real *, real *) ; integer iu1sn, iu2sn, i__, j; real r__; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int clasr_(char *, char *, char *, integer *, + extern /* Subroutine */ void clasr_(char *, char *, char *, integer *, integer *, real *, real *, complex *, integer *), cswap_(integer *, complex *, integer *, complex *, integer *); integer maxit; @@ -888,12 +888,12 @@ f"> */ extern real slamch_(char *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real thresh, tolmul; - extern /* Subroutine */ int mecago_(); + extern /* Subroutine */ void mecago_(); logical lquery; real b11bulge; logical wantv1t, 
wantv2t; real b12bulge, b21bulge, b22bulge, eps, tol; - extern /* Subroutine */ int slartgp_(real *, real *, real *, real *, real + extern /* Subroutine */ void slartgp_(real *, real *, real *, real *, real *), slartgs_(real *, real *, real *, real *, real *); @@ -966,7 +966,7 @@ f"> */ if (*info == 0 && *q == 0) { lrworkmin = 1; rwork[1] = (real) lrworkmin; - return 0; + return; } /* Compute workspace */ @@ -991,9 +991,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CBBCSD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Get machine constants */ @@ -1086,7 +1086,7 @@ f"> */ ++(*info); } } - return 0; + return; } iter = iter + imax - imin; @@ -1800,7 +1800,7 @@ f"> */ } - return 0; + return; /* End of CBBCSD */ diff --git a/lapack-netlib/SRC/cbdsqr.c b/lapack-netlib/SRC/cbdsqr.c index 288053863f..c9b4d00983 100644 --- a/lapack-netlib/SRC/cbdsqr.c +++ b/lapack-netlib/SRC/cbdsqr.c @@ -742,7 +742,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cbdsqr_(char *uplo, integer *n, integer *ncvt, integer * +/* Subroutine */ void cbdsqr_(char *uplo, integer *n, integer *ncvt, integer * nru, integer *ncc, real *d__, real *e, complex *vt, integer *ldvt, complex *u, integer *ldu, complex *c__, integer *ldc, real *rwork, integer *info) @@ -761,23 +761,23 @@ f"> */ real cosl; integer isub, iter; real unfl, sinl, cosr, smin, smax, sinr; - extern /* Subroutine */ int slas2_(real *, real *, real *, real *, real *) + extern /* Subroutine */ void slas2_(real *, real *, real *, real *, real *) ; real f, g, h__; integer i__, j, m; real r__; extern logical lsame_(char *, char *); real oldcs; - extern /* Subroutine */ int clasr_(char *, char *, char *, integer *, + extern /* Subroutine */ void clasr_(char *, char *, char *, integer *, integer *, real *, real *, complex *, integer *); integer oldll; real shift, sigmn, oldsn; - extern /* 
Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer maxit; real sminl, sigmx; logical lower; - extern /* Subroutine */ int csrot_(integer *, complex *, integer *, + extern /* Subroutine */ void csrot_(integer *, complex *, integer *, complex *, integer *, real *, real *), slasq1_(integer *, real *, real *, real *, integer *), slasv2_(real *, real *, real *, real * , real *, real *, real *, real *, real *); @@ -785,10 +785,11 @@ f"> */ integer ll; real sn, mu; extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer - *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + extern int xerbla_(char *, integer *, ftnlen); real sminoa; - extern /* Subroutine */ int slartg_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slartg_(real *, real *, real *, real *, real * ); real thresh; logical rotate; @@ -846,10 +847,10 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CBDSQR", &i__1, (ftnlen)6); - return 0; + return; } if (*n == 0) { - return 0; + return; } if (*n == 1) { goto L160; @@ -867,7 +868,7 @@ f"> */ /* If INFO equals 2, dqds didn't finish, try to finish */ if (*info != 2) { - return 0; + return; } *info = 0; } @@ -1492,7 +1493,7 @@ f"> */ /* L210: */ } L220: - return 0; + return; /* End of CBDSQR */ diff --git a/lapack-netlib/SRC/cgbbrd.c b/lapack-netlib/SRC/cgbbrd.c index 61b9e9b50c..929f34d13b 100644 --- a/lapack-netlib/SRC/cgbbrd.c +++ b/lapack-netlib/SRC/cgbbrd.c @@ -712,7 +712,7 @@ f"> */ /* > \ingroup complexGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgbbrd_(char *vect, integer *m, integer *n, integer *ncc, +/* Subroutine */ void cgbbrd_(char *vect, integer *m, integer *n, integer *ncc, integer *kl, integer *ku, complex *ab, integer 
*ldab, real *d__, real *e, complex *q, integer *ldq, complex *pt, integer *ldpt, complex *c__, integer *ldc, complex *work, real *rwork, integer *info) @@ -725,11 +725,11 @@ f"> */ /* Local variables */ integer inca; real abst; - extern /* Subroutine */ int crot_(integer *, complex *, integer *, + extern /* Subroutine */ void crot_(integer *, complex *, integer *, complex *, integer *, real *, complex *); integer i__, j, l; complex t; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); logical wantb, wantc; @@ -742,10 +742,11 @@ f"> */ complex rb; integer ml, nr, mu; complex rs; - extern /* Subroutine */ int claset_(char *, integer *, integer *, complex + extern /* Subroutine */ void claset_(char *, integer *, integer *, complex *, complex *, complex *, integer *), clartg_(complex *, - complex *, real *, complex *, complex *), xerbla_(char *, integer - *, ftnlen), clargv_(integer *, complex *, integer *, complex *, + complex *, real *, complex *, complex *); + extern int xerbla_(char *, integer *, ftnlen); + extern void clargv_(integer *, complex *, integer *, complex *, integer *, real *, integer *), clartv_(integer *, complex *, integer *, complex *, integer *, real *, complex *, integer *); integer kb1, ml0; @@ -813,7 +814,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGBBRD", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize Q and P**H to the unit matrix, if needed */ @@ -828,7 +829,7 @@ f"> */ /* Quick return if possible. 
*/ if (*m == 0 || *n == 0) { - return 0; + return; } minmn = f2cmin(*m,*n); @@ -1237,7 +1238,7 @@ f"> */ } /* L120: */ } - return 0; + return; /* End of CGBBRD */ diff --git a/lapack-netlib/SRC/cgbcon.c b/lapack-netlib/SRC/cgbcon.c index e5bb9a9e8b..87565510f0 100644 --- a/lapack-netlib/SRC/cgbcon.c +++ b/lapack-netlib/SRC/cgbcon.c @@ -664,7 +664,7 @@ f"> */ /* > \ingroup complexGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgbcon_(char *norm, integer *n, integer *kl, integer *ku, +/* Subroutine */ void cgbcon_(char *norm, integer *n, integer *kl, integer *ku, complex *ab, integer *ldab, integer *ipiv, real *anorm, real *rcond, complex *work, real *rwork, integer *info) { @@ -681,20 +681,20 @@ f"> */ *, complex *, integer *); extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, + extern /* Subroutine */ void caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); logical lnoti; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); integer kd, lm, jp, ix; extern integer icamax_(integer *, complex *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int clatbs_(char *, char *, char *, char *, + extern /* Subroutine */ void clatbs_(char *, char *, char *, char *, integer *, integer *, complex *, integer *, complex *, real *, - real *, integer *), xerbla_(char * - , integer *, ftnlen); + real *, integer *); + extern int xerbla_(char * , integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int csrscl_(integer *, real *, complex *, integer + extern /* Subroutine */ void csrscl_(integer *, real *, complex *, integer *); logical onenrm; char normin[1]; @@ -739,7 +739,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGBCON", &i__1, (ftnlen)6); - return 0; + 
return; } /* Quick return if possible */ @@ -747,9 +747,9 @@ f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm == 0.f) { - return 0; + return; } smlnum = slamch_("Safe minimum"); @@ -863,7 +863,7 @@ f"> */ } L40: - return 0; + return; /* End of CGBCON */ diff --git a/lapack-netlib/SRC/cgbequ.c b/lapack-netlib/SRC/cgbequ.c index 18f6412046..149497aaf5 100644 --- a/lapack-netlib/SRC/cgbequ.c +++ b/lapack-netlib/SRC/cgbequ.c @@ -674,7 +674,7 @@ f"> */ /* > \ingroup complexGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgbequ_(integer *m, integer *n, integer *kl, integer *ku, +/* Subroutine */ void cgbequ_(integer *m, integer *n, integer *kl, integer *ku, complex *ab, integer *ldab, real *r__, real *c__, real *rowcnd, real *colcnd, real *amax, integer *info) { @@ -725,7 +725,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGBEQU", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -734,7 +734,7 @@ f"> */ *rowcnd = 1.f; *colcnd = 1.f; *amax = 0.f; - return 0; + return; } /* Get machine constants. 
*/ @@ -795,7 +795,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { if (r__[i__] == 0.f) { *info = i__; - return 0; + return; } /* L50: */ } @@ -872,7 +872,7 @@ f"> */ for (j = 1; j <= i__1; ++j) { if (c__[j] == 0.f) { *info = *m + j; - return 0; + return; } /* L110: */ } @@ -895,7 +895,7 @@ f"> */ *colcnd = f2cmax(rcmin,smlnum) / f2cmin(rcmax,bignum); } - return 0; + return; /* End of CGBEQU */ diff --git a/lapack-netlib/SRC/cgbequb.c b/lapack-netlib/SRC/cgbequb.c index 5de604ee64..caa9354999 100644 --- a/lapack-netlib/SRC/cgbequb.c +++ b/lapack-netlib/SRC/cgbequb.c @@ -681,7 +681,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgbequb_(integer *m, integer *n, integer *kl, integer * +/* Subroutine */ void cgbequb_(integer *m, integer *n, integer *kl, integer * ku, complex *ab, integer *ldab, real *r__, real *c__, real *rowcnd, real *colcnd, real *amax, integer *info) { @@ -732,7 +732,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CGBEQUB", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible. */ @@ -741,7 +741,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rowcnd = 1.f; *colcnd = 1.f; *amax = 0.f; - return 0; + return; } /* Get machine constants. Assume SMLNUM is a power of the radix. 
*/ @@ -811,7 +811,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (i__ = 1; i__ <= i__1; ++i__) { if (r__[i__] == 0.f) { *info = i__; - return 0; + return; } /* L50: */ } @@ -891,7 +891,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (j = 1; j <= i__1; ++j) { if (c__[j] == 0.f) { *info = *m + j; - return 0; + return; } /* L110: */ } @@ -914,7 +914,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *colcnd = f2cmax(rcmin,smlnum) / f2cmin(rcmax,bignum); } - return 0; + return; /* End of CGBEQUB */ diff --git a/lapack-netlib/SRC/cgbrfs.c b/lapack-netlib/SRC/cgbrfs.c index 3733312274..8b9ab20d97 100644 --- a/lapack-netlib/SRC/cgbrfs.c +++ b/lapack-netlib/SRC/cgbrfs.c @@ -730,7 +730,7 @@ f"> */ /* > \ingroup complexGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgbrfs_(char *trans, integer *n, integer *kl, integer * +/* Subroutine */ void cgbrfs_(char *trans, integer *n, integer *kl, integer * ku, integer *nrhs, complex *ab, integer *ldab, complex *afb, integer * ldafb, integer *ipiv, complex *b, integer *ldb, complex *x, integer * ldx, real *ferr, real *berr, complex *work, real *rwork, integer * @@ -747,23 +747,24 @@ f"> */ real safe1, safe2; integer i__, j, k; real s; - extern /* Subroutine */ int cgbmv_(char *, integer *, integer *, integer * + extern /* Subroutine */ void cgbmv_(char *, integer *, integer *, integer * , integer *, complex *, complex *, integer *, complex *, integer * , complex *, complex *, integer *); extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); integer count; - extern /* Subroutine */ int clacn2_(integer *, complex 
*, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); integer kk; real xk; extern real slamch_(char *); integer nz; real safmin; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), cgbtrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void cgbtrs_( char *, integer *, integer *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); logical notran; @@ -827,7 +828,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGBRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -839,7 +840,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } if (notran) { @@ -1078,7 +1079,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of CGBRFS */ diff --git a/lapack-netlib/SRC/cgbsv.c b/lapack-netlib/SRC/cgbsv.c index c5fcd2f82e..2bc88cc762 100644 --- a/lapack-netlib/SRC/cgbsv.c +++ b/lapack-netlib/SRC/cgbsv.c @@ -684,7 +684,7 @@ e driver) */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgbsv_(integer *n, integer *kl, integer *ku, integer * +/* Subroutine */ void cgbsv_(integer *n, integer *kl, integer *ku, integer * nrhs, complex *ab, integer *ldab, integer *ipiv, complex *b, integer * ldb, integer *info) { @@ -692,9 +692,10 @@ e driver) */ integer ab_dim1, ab_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int cgbtrf_(integer *, integer *, integer *, - integer *, complex *, integer *, integer *, integer *), xerbla_( - char *, integer *, ftnlen), cgbtrs_(char *, integer *, integer *, + extern /* Subroutine */ void cgbtrf_(integer *, integer *, integer *, + integer *, complex *, integer *, integer *, integer *); + extern int xerbla_( char *, integer *, ftnlen); + extern void cgbtrs_(char *, integer *, integer *, integer *, integer *, complex *, integer *, integer *, complex *, 
integer *, integer *); @@ -737,7 +738,7 @@ e driver) */ if (*info != 0) { i__1 = -(*info); xerbla_("CGBSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the LU factorization of the band matrix A. */ @@ -750,7 +751,7 @@ e driver) */ cgbtrs_("No transpose", n, kl, ku, nrhs, &ab[ab_offset], ldab, &ipiv[ 1], &b[b_offset], ldb, info); } - return 0; + return; /* End of CGBSV */ diff --git a/lapack-netlib/SRC/cgbsvx.c b/lapack-netlib/SRC/cgbsvx.c index 0712d114cc..92118b3929 100644 --- a/lapack-netlib/SRC/cgbsvx.c +++ b/lapack-netlib/SRC/cgbsvx.c @@ -893,7 +893,7 @@ f"> */ /* > \ingroup complexGBsolve */ /* ===================================================================== */ -/* Subroutine */ int cgbsvx_(char *fact, char *trans, integer *n, integer *kl, +/* Subroutine */ void cgbsvx_(char *fact, char *trans, integer *n, integer *kl, integer *ku, integer *nrhs, complex *ab, integer *ldab, complex *afb, integer *ldafb, integer *ipiv, char *equed, real *r__, real *c__, complex *b, integer *ldb, complex *x, integer *ldx, real *rcond, real @@ -911,13 +911,13 @@ f"> */ integer i__, j; extern logical lsame_(char *, char *); real rcmin, rcmax, anorm; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); logical equil; integer j1, j2; extern real clangb_(char *, integer *, integer *, integer *, complex *, integer *, real *); - extern /* Subroutine */ int claqgb_(integer *, integer *, integer *, + extern /* Subroutine */ void claqgb_(integer *, integer *, integer *, integer *, complex *, integer *, real *, real *, real *, real *, real *, char *), cgbcon_(char *, integer *, integer *, integer *, complex *, integer *, integer *, real *, real *, @@ -925,22 +925,22 @@ f"> */ real colcnd; extern real clantb_(char *, char *, char *, integer *, integer *, complex *, integer *, real *); - extern /* Subroutine */ int cgbequ_(integer *, integer *, integer *, + extern /* 
Subroutine */ void cgbequ_(integer *, integer *, integer *, integer *, complex *, integer *, real *, real *, real *, real *, real *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int cgbrfs_(char *, integer *, integer *, integer + extern /* Subroutine */ void cgbrfs_(char *, integer *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, integer *, complex *, integer *, complex *, integer *, real *, real *, complex *, real *, integer *), cgbtrf_(integer *, integer *, integer *, integer *, complex *, integer *, integer *, integer *); logical nofact; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex - *, integer *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex + *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real bignum; - extern /* Subroutine */ int cgbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void cgbtrs_(char *, integer *, integer *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); integer infequ; @@ -1080,7 +1080,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGBSVX", &i__1, (ftnlen)6); - return 0; + return; } if (equil) { @@ -1195,7 +1195,7 @@ f"> */ } rwork[1] = rpvgrw; *rcond = 0.f; - return 0; + return; } } @@ -1289,7 +1289,7 @@ f"> */ } rwork[1] = rpvgrw; - return 0; + return; /* End of CGBSVX */ diff --git a/lapack-netlib/SRC/cgbsvxx.c b/lapack-netlib/SRC/cgbsvxx.c index 50804b1795..7e0142b462 100644 --- a/lapack-netlib/SRC/cgbsvxx.c +++ b/lapack-netlib/SRC/cgbsvxx.c @@ -1080,7 +1080,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexGBsolve */ /* ===================================================================== */ -/* Subroutine */ int cgbsvxx_(char *fact, char *trans, integer *n, integer * +/* Subroutine */ void 
cgbsvxx_(char *fact, char *trans, integer *n, integer * kl, integer *ku, integer *nrhs, complex *ab, integer *ldab, complex * afb, integer *ldafb, integer *ipiv, char *equed, real *r__, real *c__, complex *b, integer *ldb, complex *x, integer *ldx, real *rcond, @@ -1102,19 +1102,19 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern logical lsame_(char *, char *); real rcmin, rcmax; logical equil; - extern /* Subroutine */ int claqgb_(integer *, integer *, integer *, + extern /* Subroutine */ void claqgb_(integer *, integer *, integer *, integer *, complex *, integer *, real *, real *, real *, real *, real *, char *); real colcnd; extern real slamch_(char *); - extern /* Subroutine */ int cgbtrf_(integer *, integer *, integer *, + extern /* Subroutine */ void cgbtrf_(integer *, integer *, integer *, integer *, complex *, integer *, integer *, integer *); logical nofact; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex - *, integer *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex + *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real bignum; - extern /* Subroutine */ int cgbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void cgbtrs_(char *, integer *, integer *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); integer infequ; @@ -1123,7 +1123,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ logical notran; real smlnum; logical rowequ; - extern /* Subroutine */ int clascl2_(integer *, integer *, real *, + extern /* Subroutine */ void clascl2_(integer *, integer *, real *, complex *, integer *), cgbequb_(integer *, integer *, integer *, integer *, complex *, integer *, real *, real *, real *, real *, real *, integer *), cgbrfsx_(char *, char *, integer *, integer *, @@ -1270,7 
+1270,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CGBSVXX", &i__1, (ftnlen)7); - return 0; + return; } if (equil) { @@ -1346,7 +1346,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rpvgrw = cla_gbrpvgrw_(n, kl, ku, info, &ab[ab_offset], ldab, & afb[afb_offset], ldafb); - return 0; + return; } } @@ -1379,7 +1379,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ clascl2_(n, nrhs, &r__[1], &x[x_offset], ldx); } - return 0; + return; /* End of CGBSVXX */ diff --git a/lapack-netlib/SRC/cgbtf2.c b/lapack-netlib/SRC/cgbtf2.c index 1e2e885357..e09126c80d 100644 --- a/lapack-netlib/SRC/cgbtf2.c +++ b/lapack-netlib/SRC/cgbtf2.c @@ -672,7 +672,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgbtf2_(integer *m, integer *n, integer *kl, integer *ku, +/* Subroutine */ void cgbtf2_(integer *m, integer *n, integer *kl, integer *ku, complex *ab, integer *ldab, integer *ipiv, integer *info) { /* System generated locals */ @@ -681,7 +681,7 @@ f"> */ /* Local variables */ integer i__, j; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), cgeru_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *), cswap_( integer *, complex *, integer *, complex *, integer *); @@ -728,13 +728,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGBTF2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Gaussian elimination with partial pivoting */ @@ -827,7 +827,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of CGBTF2 */ diff --git a/lapack-netlib/SRC/cgbtrf.c b/lapack-netlib/SRC/cgbtrf.c index 8bfcdf5264..36a782d540 
100644 --- a/lapack-netlib/SRC/cgbtrf.c +++ b/lapack-netlib/SRC/cgbtrf.c @@ -671,7 +671,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgbtrf_(integer *m, integer *n, integer *kl, integer *ku, +/* Subroutine */ void cgbtrf_(integer *m, integer *n, integer *kl, integer *ku, complex *ab, integer *ldab, integer *ipiv, integer *info) { /* System generated locals */ @@ -681,7 +681,7 @@ f"> */ /* Local variables */ complex temp; integer i__, j; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), cgemm_(char *, char *, integer *, integer *, integer * , complex *, complex *, integer *, complex *, integer *, complex * , complex *, integer *), cgeru_(integer *, @@ -691,11 +691,11 @@ f"> */ complex *, integer *); complex work13[4160] /* was [65][64] */, work31[4160] /* was [65][64] */; - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); integer i2, i3, j2, j3, k2; - extern /* Subroutine */ int cgbtf2_(integer *, integer *, integer *, + extern /* Subroutine */ void cgbtf2_(integer *, integer *, integer *, integer *, complex *, integer *, integer *, integer *); integer jb, nb, ii, jj, jm, ip, jp, km, ju, kv; extern integer icamax_(integer *, complex *, integer *); @@ -745,13 +745,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGBTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Determine the block size for this environment */ @@ -1162,7 +1162,7 @@ f"> */ } } - return 0; + return; /* End of CGBTRF */ diff --git a/lapack-netlib/SRC/cgbtrs.c b/lapack-netlib/SRC/cgbtrs.c index df1ea67eea..86df26e23c 100644 --- a/lapack-netlib/SRC/cgbtrs.c 
+++ b/lapack-netlib/SRC/cgbtrs.c @@ -663,7 +663,7 @@ f"> */ /* > \ingroup complexGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgbtrs_(char *trans, integer *n, integer *kl, integer * +/* Subroutine */ void cgbtrs_(char *trans, integer *n, integer *kl, integer * ku, integer *nrhs, complex *ab, integer *ldab, integer *ipiv, complex *b, integer *ldb, integer *info) { @@ -674,7 +674,7 @@ f"> */ /* Local variables */ integer i__, j, l; extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *), cgeru_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *), @@ -683,8 +683,8 @@ f"> */ integer *, complex *, integer *); logical lnoti; integer kd, lm; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical notran; @@ -730,13 +730,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGBTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } kd = *ku + *kl + 1; @@ -851,7 +851,7 @@ f"> */ } } } - return 0; + return; /* End of CGBTRS */ diff --git a/lapack-netlib/SRC/cgebak.c b/lapack-netlib/SRC/cgebak.c index c406a3d391..a40b10ba2a 100644 --- a/lapack-netlib/SRC/cgebak.c +++ b/lapack-netlib/SRC/cgebak.c @@ -651,7 +651,7 @@ f"> */ /* > \ingroup complexGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgebak_(char *job, char *side, integer *n, integer *ilo, +/* Subroutine */ void cgebak_(char *job, char *side, integer *n, integer 
*ilo, integer *ihi, real *scale, integer *m, complex *v, integer *ldv, integer *info) { @@ -662,12 +662,13 @@ f"> */ integer i__, k; real s; extern logical lsame_(char *, char *); - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); logical leftv; integer ii; - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer - *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + extern int xerbla_(char *, integer *, ftnlen); logical rightv; @@ -712,19 +713,19 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGEBAK", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*m == 0) { - return 0; + return; } if (lsame_(job, "N")) { - return 0; + return; } if (*ilo == *ihi) { @@ -803,7 +804,7 @@ f"> */ } } - return 0; + return; /* End of CGEBAK */ diff --git a/lapack-netlib/SRC/cgebak.f b/lapack-netlib/SRC/cgebak.f index 201dbfcec4..4348d5ea47 100644 --- a/lapack-netlib/SRC/cgebak.f +++ b/lapack-netlib/SRC/cgebak.f @@ -238,7 +238,7 @@ SUBROUTINE CGEBAK( JOB, SIDE, N, ILO, IHI, SCALE, M, V, LDV, $ GO TO 40 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -252,7 +252,7 @@ SUBROUTINE CGEBAK( JOB, SIDE, N, ILO, IHI, SCALE, M, V, LDV, $ GO TO 50 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 50 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/cgebal.c b/lapack-netlib/SRC/cgebal.c index 45e8b0f4ec..26e8279448 100644 --- a/lapack-netlib/SRC/cgebal.c +++ b/lapack-netlib/SRC/cgebal.c @@ -686,7 +686,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgebal_(char *job, 
integer *n, complex *a, integer *lda, +/* Subroutine */ void cgebal_(char *job, integer *n, complex *a, integer *lda, integer *ilo, integer *ihi, real *scale, integer *info) { /* System generated locals */ @@ -699,15 +699,16 @@ f"> */ integer i__, j, k, l, m; real r__, s; extern logical lsame_(char *, char *); - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); real sfmin1, sfmin2, sfmax1, sfmax2, ca; extern real scnrm2_(integer *, complex *, integer *); real ra; extern integer icamax_(integer *, complex *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer - *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + extern int xerbla_(char *, integer *, ftnlen); extern logical sisnan_(real *); logical noconv; integer ica, ira; @@ -743,7 +744,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGEBAL", &i__1, (ftnlen)6); - return 0; + return; } k = 1; @@ -909,7 +910,7 @@ f"> */ *info = -3; i__2 = -(*info); xerbla_("CGEBAL", &i__2, (ftnlen)6); - return 0; + return; } f *= 2.f; c__ *= 2.f; @@ -971,7 +972,7 @@ f"> */ *ilo = k; *ihi = l; - return 0; + return; /* End of CGEBAL */ diff --git a/lapack-netlib/SRC/cgebd2.c b/lapack-netlib/SRC/cgebd2.c index 6f8229b2fb..6beb67a560 100644 --- a/lapack-netlib/SRC/cgebd2.c +++ b/lapack-netlib/SRC/cgebd2.c @@ -715,7 +715,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgebd2_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void cgebd2_(integer *m, integer *n, complex *a, integer *lda, real *d__, real *e, complex *tauq, complex *taup, complex *work, integer *info) { @@ -726,11 +726,11 @@ f"> */ /* Local variables */ integer i__; complex alpha; - extern /* Subroutine */ int 
clarf_(char *, integer *, integer *, complex * + extern /* Subroutine */ void clarf_(char *, integer *, integer *, complex * , integer *, complex *, complex *, integer *, complex *), clarfg_(integer *, complex *, complex *, integer *, complex *), - clacgv_(integer *, complex *, integer *), xerbla_(char *, integer - *, ftnlen); + clacgv_(integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.1) -- */ @@ -766,7 +766,7 @@ f"> */ if (*info < 0) { i__1 = -(*info); xerbla_("CGEBD2", &i__1, (ftnlen)6); - return 0; + return; } if (*m >= *n) { @@ -912,7 +912,7 @@ f"> */ /* L20: */ } } - return 0; + return; /* End of CGEBD2 */ diff --git a/lapack-netlib/SRC/cgebrd.c b/lapack-netlib/SRC/cgebrd.c index 6389348a72..e474cdac31 100644 --- a/lapack-netlib/SRC/cgebrd.c +++ b/lapack-netlib/SRC/cgebrd.c @@ -722,7 +722,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgebrd_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void cgebrd_(integer *m, integer *n, complex *a, integer *lda, real *d__, real *e, complex *tauq, complex *taup, complex *work, integer *lwork, integer *info) { @@ -733,15 +733,15 @@ f"> */ /* Local variables */ integer i__, j; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); integer nbmin, iinfo, minmn; - extern /* Subroutine */ int cgebd2_(integer *, integer *, complex *, + extern /* Subroutine */ void cgebd2_(integer *, integer *, complex *, integer *, real *, real *, complex *, complex *, complex *, integer *); integer nb; - extern /* Subroutine */ int clabrd_(integer *, integer *, integer *, + extern /* Subroutine */ void clabrd_(integer *, integer *, integer *, complex *, integer 
*, real *, real *, complex *, complex *, complex *, integer *, complex *, integer *); integer nx, ws; @@ -799,9 +799,9 @@ f"> */ if (*info < 0) { i__1 = -(*info); xerbla_("CGEBRD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -809,7 +809,7 @@ f"> */ minmn = f2cmin(*m,*n); if (minmn == 0) { work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } ws = f2cmax(*m,*n); @@ -914,7 +914,7 @@ f"> */ cgebd2_(&i__2, &i__1, &a[i__ + i__ * a_dim1], lda, &d__[i__], &e[i__], & tauq[i__], &taup[i__], &work[1], &iinfo); work[1].r = (real) ws, work[1].i = 0.f; - return 0; + return; /* End of CGEBRD */ diff --git a/lapack-netlib/SRC/cgecon.c b/lapack-netlib/SRC/cgecon.c index 4dd42754a7..dd67491bec 100644 --- a/lapack-netlib/SRC/cgecon.c +++ b/lapack-netlib/SRC/cgecon.c @@ -635,7 +635,7 @@ f"> */ /* > \ingroup complexGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgecon_(char *norm, integer *n, complex *a, integer *lda, +/* Subroutine */ void cgecon_(char *norm, integer *n, complex *a, integer *lda, real *anorm, real *rcond, complex *work, real *rwork, integer *info) { /* System generated locals */ @@ -647,7 +647,7 @@ f"> */ real scale; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); real sl; integer ix; @@ -656,7 +656,7 @@ f"> */ real su; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int clatrs_(char *, char *, char *, char *, + extern /* Subroutine */ void clatrs_(char *, char *, char *, char *, integer *, complex *, integer *, complex *, real *, real *, integer *), csrscl_(integer *, real *, complex *, integer *); @@ -698,7 +698,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGECON", 
&i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -706,9 +706,9 @@ f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm == 0.f) { - return 0; + return; } smlnum = slamch_("Safe minimum"); @@ -773,7 +773,7 @@ f"> */ } L20: - return 0; + return; /* End of CGECON */ diff --git a/lapack-netlib/SRC/cgecon.f b/lapack-netlib/SRC/cgecon.f index 48f409b680..6f426c2ab6 100644 --- a/lapack-netlib/SRC/cgecon.f +++ b/lapack-netlib/SRC/cgecon.f @@ -106,6 +106,7 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value +*> =-5: if ANORM is NAN or negative. *> \endverbatim * * Authors: @@ -153,10 +154,10 @@ SUBROUTINE CGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, RWORK, INTEGER ISAVE( 3 ) * .. * .. External Functions .. - LOGICAL LSAME + LOGICAL LSAME, SISNAN INTEGER ICAMAX REAL SLAMCH - EXTERNAL LSAME, ICAMAX, SLAMCH + EXTERNAL LSAME, ICAMAX, SLAMCH, SISNAN * .. * .. External Subroutines .. EXTERNAL CLACN2, CLATRS, CSRSCL, XERBLA @@ -182,7 +183,7 @@ SUBROUTINE CGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, RWORK, INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( ANORM.LT.ZERO ) THEN + ELSE IF( ANORM.LT.ZERO .OR. 
SISNAN( ANORM ) ) THEN INFO = -5 END IF IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/cgeequ.c b/lapack-netlib/SRC/cgeequ.c index dfa45c381e..466b9290d4 100644 --- a/lapack-netlib/SRC/cgeequ.c +++ b/lapack-netlib/SRC/cgeequ.c @@ -648,7 +648,7 @@ f"> */ /* > \ingroup complexGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgeequ_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void cgeequ_(integer *m, integer *n, complex *a, integer *lda, real *r__, real *c__, real *rowcnd, real *colcnd, real *amax, integer *info) { @@ -694,7 +694,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGEEQU", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -703,7 +703,7 @@ f"> */ *rowcnd = 1.f; *colcnd = 1.f; *amax = 0.f; - return 0; + return; } /* Get machine constants. */ @@ -759,7 +759,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { if (r__[i__] == 0.f) { *info = i__; - return 0; + return; } /* L50: */ } @@ -830,7 +830,7 @@ f"> */ for (j = 1; j <= i__1; ++j) { if (c__[j] == 0.f) { *info = *m + j; - return 0; + return; } /* L110: */ } @@ -853,7 +853,7 @@ f"> */ *colcnd = f2cmax(rcmin,smlnum) / f2cmin(rcmax,bignum); } - return 0; + return; /* End of CGEEQU */ diff --git a/lapack-netlib/SRC/cgeequb.c b/lapack-netlib/SRC/cgeequb.c index 788d4c8410..4834039305 100644 --- a/lapack-netlib/SRC/cgeequb.c +++ b/lapack-netlib/SRC/cgeequb.c @@ -656,7 +656,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgeequb_(integer *m, integer *n, complex *a, integer * +/* Subroutine */ void cgeequb_(integer *m, integer *n, complex *a, integer * lda, real *r__, real *c__, real *rowcnd, real *colcnd, real *amax, integer *info) { @@ -702,7 +702,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, 
doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CGEEQUB", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible. */ @@ -711,7 +711,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rowcnd = 1.f; *colcnd = 1.f; *amax = 0.f; - return 0; + return; } /* Get machine constants. Assume SMLNUM is a power of the radix. */ @@ -776,7 +776,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (i__ = 1; i__ <= i__1; ++i__) { if (r__[i__] == 0.f) { *info = i__; - return 0; + return; } /* L50: */ } @@ -851,7 +851,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (j = 1; j <= i__1; ++j) { if (c__[j] == 0.f) { *info = *m + j; - return 0; + return; } /* L110: */ } @@ -874,7 +874,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *colcnd = f2cmax(rcmin,smlnum) / f2cmin(rcmax,bignum); } - return 0; + return; /* End of CGEEQUB */ diff --git a/lapack-netlib/SRC/cgees.c b/lapack-netlib/SRC/cgees.c index 7b0f60880c..a0b44c4b3a 100644 --- a/lapack-netlib/SRC/cgees.c +++ b/lapack-netlib/SRC/cgees.c @@ -710,7 +710,7 @@ or GE matrices */ /* > \ingroup complexGEeigen */ /* ===================================================================== */ -/* Subroutine */ int cgees_(char *jobvs, char *sort, L_fp select, integer *n, +/* Subroutine */ void cgees_(char *jobvs, char *sort, L_fp select, integer *n, complex *a, integer *lda, integer *sdim, complex *w, complex *vs, integer *ldvs, complex *work, integer *lwork, real *rwork, logical * bwork, integer *info) @@ -725,7 +725,7 @@ or GE matrices */ real s; integer icond, ieval; extern logical lsame_(char *, char *); - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), cgebak_(char *, char *, integer *, integer *, integer *, real *, integer *, complex *, 
integer *, integer *), cgebal_(char *, integer *, complex *, integer *, integer *, integer *, real *, integer *), slabad_(real *, @@ -734,18 +734,18 @@ or GE matrices */ extern real clange_(char *, integer *, integer *, complex *, integer *, real *); real cscale; - extern /* Subroutine */ int cgehrd_(integer *, integer *, integer *, + extern /* Subroutine */ void cgehrd_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex - *, integer *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex + *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); real bignum; - extern /* Subroutine */ int chseqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void chseqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *), cunghr_(integer *, integer *, integer *, complex *, integer *, complex *, complex @@ -844,16 +844,16 @@ or GE matrices */ if (*info != 0) { i__1 = -(*info); xerbla_("CGEES ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *sdim = 0; - return 0; + return; } /* Get machine constants */ @@ -971,7 +971,7 @@ or GE matrices */ } work[1].r = (real) maxwrk, work[1].i = 0.f; - return 0; + return; /* End of CGEES */ diff --git a/lapack-netlib/SRC/cgees.f b/lapack-netlib/SRC/cgees.f index 359fa2afec..71acfdba3b 100644 --- a/lapack-netlib/SRC/cgees.f +++ b/lapack-netlib/SRC/cgees.f @@ -282,7 +282,7 @@ 
SUBROUTINE CGEES( JOBVS, SORT, SELECT, N, A, LDA, SDIM, W, VS, * CALL CHSEQR( 'S', JOBVS, N, 1, N, A, LDA, W, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = REAL( WORK( 1 ) ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, HSWORK ) diff --git a/lapack-netlib/SRC/cgeesx.c b/lapack-netlib/SRC/cgeesx.c index 46ab3631fa..848125f1e9 100644 --- a/lapack-netlib/SRC/cgeesx.c +++ b/lapack-netlib/SRC/cgeesx.c @@ -752,7 +752,7 @@ f"> */ /* > \ingroup complexGEeigen */ /* ===================================================================== */ -/* Subroutine */ int cgeesx_(char *jobvs, char *sort, L_fp select, char * +/* Subroutine */ void cgeesx_(char *jobvs, char *sort, L_fp select, char * sense, integer *n, complex *a, integer *lda, integer *sdim, complex * w, complex *vs, integer *ldvs, real *rconde, real *rcondv, complex * work, integer *lwork, real *rwork, logical *bwork, integer *info) @@ -765,7 +765,7 @@ f"> */ real anrm; integer ierr, itau, iwrk, lwrk, i__, icond, ieval; extern logical lsame_(char *, char *); - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), cgebak_(char *, char *, integer *, integer *, integer *, real *, integer *, complex *, integer *, integer *), cgebal_(char *, integer *, complex *, integer *, integer *, integer *, real *, integer *), slabad_(real *, @@ -774,25 +774,25 @@ f"> */ extern real clange_(char *, integer *, integer *, complex *, integer *, real *); real cscale; - extern /* Subroutine */ int cgehrd_(integer *, integer *, integer *, + extern /* Subroutine */ void cgehrd_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex - *, integer *, complex *, 
integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex + *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *), chseqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *), cunghr_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); logical wantsb; - extern /* Subroutine */ int ctrsen_(char *, char *, logical *, integer *, + extern /* Subroutine */ void ctrsen_(char *, char *, logical *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, real *, real *, complex *, integer *, integer *); logical wantse; @@ -906,16 +906,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGEESX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *sdim = 0; - return 0; + return; } /* Get machine constants */ @@ -1054,7 +1054,7 @@ f"> */ } work[1].r = (real) maxwrk, work[1].i = 0.f; - return 0; + return; /* End of CGEESX */ diff --git a/lapack-netlib/SRC/cgeesx.f b/lapack-netlib/SRC/cgeesx.f index 1113563ba2..782e367475 100644 --- a/lapack-netlib/SRC/cgeesx.f +++ b/lapack-netlib/SRC/cgeesx.f @@ -337,7 +337,7 @@ SUBROUTINE CGEESX( JOBVS, SORT, SELECT, SENSE, N, A, LDA, SDIM, W, * CALL CHSEQR( 'S', JOBVS, N, 1, N, A, LDA, W, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = REAL( WORK( 1 ) ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, HSWORK ) diff --git a/lapack-netlib/SRC/cgeev.c 
b/lapack-netlib/SRC/cgeev.c index ba4338b1dd..b5022234bf 100644 --- a/lapack-netlib/SRC/cgeev.c +++ b/lapack-netlib/SRC/cgeev.c @@ -695,7 +695,7 @@ ices */ /* > \ingroup complexGEeigen */ /* ===================================================================== */ -/* Subroutine */ int cgeev_(char *jobvl, char *jobvr, integer *n, complex *a, +/* Subroutine */ void cgeev_(char *jobvl, char *jobvr, integer *n, complex *a, integer *lda, complex *w, complex *vl, integer *ldvl, complex *vr, integer *ldvr, complex *work, integer *lwork, real *rwork, integer * info) @@ -711,11 +711,11 @@ ices */ char side[1]; real anrm; integer ierr, itau, iwrk, nout, i__, k; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); extern real scnrm2_(integer *, complex *, integer *); - extern /* Subroutine */ int cgebak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgebak_(char *, char *, integer *, integer *, integer *, real *, integer *, complex *, integer *, integer *), cgebal_(char *, integer *, complex *, integer *, integer *, integer *, real *, integer *), slabad_(real *, real *); @@ -723,20 +723,21 @@ ices */ extern real clange_(char *, integer *, integer *, complex *, integer *, real *); real cscale; - extern /* Subroutine */ int cgehrd_(integer *, integer *, integer *, + extern /* Subroutine */ void cgehrd_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *), clacpy_(char *, integer *, integer *, complex *, integer *, - complex *, integer *), xerbla_(char *, integer *, ftnlen); + complex 
*, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); logical select[1]; real bignum; extern integer isamax_(integer *, real *, integer *); - extern /* Subroutine */ int chseqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void chseqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *), cunghr_(integer *, integer *, integer *, complex *, integer *, complex *, complex @@ -746,7 +747,7 @@ ices */ real smlnum; integer hswork, irwork; logical lquery, wantvr; - extern /* Subroutine */ int ctrevc3_(char *, char *, logical *, integer *, + extern /* Subroutine */ void ctrevc3_(char *, char *, logical *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *, integer *, complex *, integer *, real *, integer *, integer *); @@ -868,15 +869,15 @@ ices */ if (*info != 0) { i__1 = -(*info); xerbla_("CGEEV ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1111,7 +1112,7 @@ ices */ } work[1].r = (real) maxwrk, work[1].i = 0.f; - return 0; + return; /* End of CGEEV */ diff --git a/lapack-netlib/SRC/cgeevx.c b/lapack-netlib/SRC/cgeevx.c index d4e6a2e5dd..1d59f2c703 100644 --- a/lapack-netlib/SRC/cgeevx.c +++ b/lapack-netlib/SRC/cgeevx.c @@ -802,7 +802,7 @@ f"> */ /* > \ingroup complexGEeigen */ /* ===================================================================== */ -/* Subroutine */ int cgeevx_(char *balanc, char *jobvl, char *jobvr, char * +/* Subroutine */ void cgeevx_(char *balanc, char *jobvl, char *jobvr, char * sense, integer *n, complex *a, integer *lda, complex *w, complex *vl, integer *ldvl, complex *vr, integer *ldvr, integer *ilo, integer *ihi, real *scale, real *abnrm, real 
*rconde, real *rcondv, complex *work, @@ -818,12 +818,12 @@ f"> */ char side[1]; real anrm; integer ierr, itau, iwrk, nout, i__, k; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); integer icond; extern logical lsame_(char *, char *); extern real scnrm2_(integer *, complex *, integer *); - extern /* Subroutine */ int cgebak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgebak_(char *, char *, integer *, integer *, integer *, real *, integer *, complex *, integer *, integer *), cgebal_(char *, integer *, complex *, integer *, integer *, integer *, real *, integer *), slabad_(real *, real *); @@ -831,22 +831,23 @@ f"> */ extern real clange_(char *, integer *, integer *, complex *, integer *, real *); real cscale; - extern /* Subroutine */ int cgehrd_(integer *, integer *, integer *, + extern /* Subroutine */ void cgehrd_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *), clacpy_(char *, integer *, integer *, complex *, integer *, - complex *, integer *), xerbla_(char *, integer *, ftnlen); + complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); logical select[1]; real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); extern integer isamax_(integer *, real *, integer *); - extern /* Subroutine */ int 
chseqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void chseqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *), cunghr_(integer *, integer *, integer *, complex *, integer *, complex *, complex @@ -860,7 +861,7 @@ f"> */ logical wntsne; real smlnum; logical lquery, wantvr, wntsnn, wntsnv; - extern /* Subroutine */ int ctrevc3_(char *, char *, logical *, integer *, + extern /* Subroutine */ void ctrevc3_(char *, char *, logical *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *, integer *, complex *, integer *, real *, integer *, integer *); @@ -1019,15 +1020,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGEEVX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1285,7 +1286,7 @@ f"> */ } work[1].r = (real) maxwrk, work[1].i = 0.f; - return 0; + return; /* End of CGEEVX */ diff --git a/lapack-netlib/SRC/cgehd2.c b/lapack-netlib/SRC/cgehd2.c index 329b3c1b2a..16d4fd76e9 100644 --- a/lapack-netlib/SRC/cgehd2.c +++ b/lapack-netlib/SRC/cgehd2.c @@ -664,7 +664,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgehd2_(integer *n, integer *ilo, integer *ihi, complex * +/* Subroutine */ void cgehd2_(integer *n, integer *ilo, integer *ihi, complex * a, integer *lda, complex *tau, complex *work, integer *info) { /* System generated locals */ @@ -674,10 +674,10 @@ f"> */ /* Local variables */ integer i__; complex alpha; - extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex * + extern /* Subroutine */ void clarf_(char *, integer *, integer *, complex * , integer *, complex *, complex *, integer *, complex *), - clarfg_(integer *, complex *, complex *, integer *, complex *), - 
xerbla_(char *, integer *, ftnlen); + clarfg_(integer *, complex *, complex *, integer *, complex *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -712,7 +712,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGEHD2", &i__1, (ftnlen)6); - return 0; + return; } i__1 = *ihi - 1; @@ -749,7 +749,7 @@ f"> */ /* L10: */ } - return 0; + return; /* End of CGEHD2 */ diff --git a/lapack-netlib/SRC/cgehrd.c b/lapack-netlib/SRC/cgehrd.c index 6706ff60d4..86f3705a13 100644 --- a/lapack-netlib/SRC/cgehrd.c +++ b/lapack-netlib/SRC/cgehrd.c @@ -686,7 +686,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgehrd_(integer *n, integer *ilo, integer *ihi, complex * +/* Subroutine */ void cgehrd_(integer *n, integer *ilo, integer *ihi, complex * a, integer *lda, complex *tau, complex *work, integer *lwork, integer *info) { @@ -696,11 +696,11 @@ f"> */ /* Local variables */ integer i__, j; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); integer nbmin, iinfo; - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrmm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer *, complex *, integer *), cgehd2_( @@ -711,7 +711,7 @@ f"> */ integer ib; complex ei; integer nb, nh; - extern /* Subroutine */ int clarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void clarfb_(char *, char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); integer nx; @@ -771,9 +771,9 @@ f"> */ if (*info != 0) { 
i__1 = -(*info); xerbla_("CGEHRD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Set elements 1:ILO-1 and IHI:N-1 of TAU to zero */ @@ -796,7 +796,7 @@ f"> */ nh = *ihi - *ilo + 1; if (nh <= 1) { work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } /* Determine the block size */ @@ -913,7 +913,7 @@ f"> */ cgehd2_(n, &i__, ihi, &a[a_offset], lda, &tau[1], &work[1], &iinfo); work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CGEHRD */ diff --git a/lapack-netlib/SRC/cgejsv.c b/lapack-netlib/SRC/cgejsv.c index ca5a5f23ae..e4bfd86aee 100644 --- a/lapack-netlib/SRC/cgejsv.c +++ b/lapack-netlib/SRC/cgejsv.c @@ -1086,7 +1086,7 @@ f"> */ /* > drmac@math.hr. Thank you. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgejsv_(char *joba, char *jobu, char *jobv, char *jobr, +/* Subroutine */ void cgejsv_(char *joba, char *jobu, char *jobv, char *jobr, char *jobt, char *jobp, integer *m, integer *n, complex *a, integer * lda, real *sva, complex *u, integer *ldu, complex *v, integer *ldv, complex *cwork, integer *lwork, real *rwork, integer *lrwork, integer @@ -1109,24 +1109,24 @@ f"> */ p, q; logical jracc; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); complex ctemp; real entra, small; integer iwoff; real sfmin; logical lsvec; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), cswap_(integer *, complex *, integer *, complex *, integer *); real epsln; logical rsvec; integer lwcon, lwlqf; - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, 
integer *); integer lwqrf, n1; logical l2aber; - extern /* Subroutine */ int cgeqp3_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqp3_(integer *, integer *, complex *, integer *, integer *, complex *, complex *, integer *, real *, integer *); real condr1, condr2, uscal1, uscal2; @@ -1134,53 +1134,54 @@ f"> */ extern real scnrm2_(integer *, complex *, integer *); logical l2pert; integer lrwqp3; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *); integer nr; - extern /* Subroutine */ int cgelqf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgelqf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); extern integer icamax_(integer *, complex *, integer *); - extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *); real scalem, sconda; logical goscal; real aatmin; extern real slamch_(char *); real aatmax; - extern /* Subroutine */ int cgeqrf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqrf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), clacpy_( char *, integer *, integer *, complex *, integer *, complex *, integer *), clapmr_(logical *, integer *, integer *, complex *, integer *, integer *); logical noscal; - extern /* Subroutine */ int claset_(char *, integer *, integer *, complex + extern /* Subroutine */ void claset_(char *, integer *, integer *, complex *, complex *, complex *, integer *); extern integer isamax_(integer *, real *, integer *); - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *), cpocon_(char *, integer *, 
complex *, integer *, real *, real *, complex *, real *, integer *), csscal_(integer *, real *, complex *, integer *), classq_(integer *, complex *, - integer *, real *, real *), xerbla_(char *, integer *, ftnlen), - cgesvj_(char *, char *, char *, integer *, integer *, complex *, + integer *, real *, real *); + extern int xerbla_(char *, integer *, ftnlen); + extern void cgesvj_(char *, char *, char *, integer *, integer *, complex *, integer *, real *, integer *, complex *, integer *, complex *, - integer *, real *, integer *, integer *), - claswp_(integer *, complex *, integer *, integer *, integer *, + integer *, real *, integer *, integer *); + extern int claswp_(integer *, complex *, integer *, integer *, integer *, integer *, integer *); real entrat; logical almort; complex cdummy[1]; - extern /* Subroutine */ int cungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void cungqr_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); real maxprj; - extern /* Subroutine */ int cunmlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmlq_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); logical errest; integer lrwcon; - extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slassq_(integer *, real *, integer *, real *, real *); logical transp; integer minwrk, lwsvdj; - extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); real rdummy[1]; @@ -1784,13 +1785,13 @@ f"> */ /* #:( */ i__1 = -(*info); xerbla_("CGEJSV", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { cwork[1].r = (real) optwrk, cwork[1].i = 0.f; cwork[2].r = (real) minwrk, 
cwork[2].i = 0.f; rwork[1] = (real) minrwrk; iwork[1] = f2cmax(4,miniwrk); - return 0; + return; } /* Quick return for void matrix (Y3K safe) */ @@ -1807,7 +1808,7 @@ f"> */ rwork[5] = 0.f; rwork[6] = 0.f; rwork[7] = 0.f; - return 0; + return; } /* Determine whether the matrix U should be M x N or M x M */ @@ -1847,7 +1848,7 @@ f"> */ *info = -9; i__2 = -(*info); xerbla_("CGEJSV", &i__2, (ftnlen)6); - return 0; + return; } aaqq = sqrt(aaqq); if (aapp < big / aaqq && noscal) { @@ -1909,7 +1910,7 @@ f"> */ iwork[2] = 0; iwork[3] = 0; iwork[4] = -1; - return 0; + return; } /* Issue warning if denormalized column norms detected. Override the */ @@ -1976,7 +1977,7 @@ f"> */ rwork[6] = 0.f; rwork[7] = 0.f; } - return 0; + return; } @@ -3547,6 +3548,6 @@ f"> */ iwork[4] = -1; } - return 0; + return; } /* cgejsv_ */ diff --git a/lapack-netlib/SRC/cgejsv.f b/lapack-netlib/SRC/cgejsv.f index 25ab813028..062ac182b1 100644 --- a/lapack-netlib/SRC/cgejsv.f +++ b/lapack-netlib/SRC/cgejsv.f @@ -304,7 +304,7 @@ *> -> the minimal requirement is LWORK >= 3*N. *> -> For optimal performance, *> LWORK >= max(N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, -*> where NB is the optimal block size for CGEQP3, CGEQRF, CGELQ, +*> where NB is the optimal block size for CGEQP3, CGEQRF, CGELQF, *> CUNMLQ. In general, the optimal length LWORK is computed as *> LWORK >= max(N+LWORK(CGEQP3), N+LWORK(CGESVJ), *> N+LWORK(CGELQF), 2*N+LWORK(CGEQRF), N+LWORK(CUNMLQ)). @@ -313,7 +313,7 @@ *> -> the minimal requirement is LWORK >= 3*N. *> -> For optimal performance, *> LWORK >= max(N+(N+1)*NB, 2*N,2*N+N*NB)=2*N+N*NB, -*> where NB is the optimal block size for CGEQP3, CGEQRF, CGELQ, +*> where NB is the optimal block size for CGEQP3, CGEQRF, CGELQF, *> CUNMLQ. In general, the optimal length LWORK is computed as *> LWORK >= max(N+LWORK(CGEQP3), LWORK(CPOCON), N+LWORK(CGESVJ), *> N+LWORK(CGELQF), 2*N+LWORK(CGEQRF), N+LWORK(CUNMLQ)). 
@@ -350,7 +350,7 @@ *> *> \param[out] RWORK *> \verbatim -*> RWORK is REAL array, dimension (MAX(7,LWORK)) +*> RWORK is REAL array, dimension (MAX(7,LRWORK)) *> On exit, *> RWORK(1) = Determines the scaling factor SCALE = RWORK(2) / RWORK(1) *> such that SCALE*SVA(1:N) are the computed singular values @@ -704,11 +704,11 @@ SUBROUTINE CGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, IF ( LQUERY ) THEN CALL CGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1, $ RDUMMY, IERR ) - LWRK_CGEQP3 = REAL( CDUMMY(1) ) + LWRK_CGEQP3 = INT( CDUMMY(1) ) CALL CGEQRF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) - LWRK_CGEQRF = REAL( CDUMMY(1) ) + LWRK_CGEQRF = INT( CDUMMY(1) ) CALL CGELQF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) - LWRK_CGELQF = REAL( CDUMMY(1) ) + LWRK_CGELQF = INT( CDUMMY(1) ) END IF MINWRK = 2 OPTWRK = 2 @@ -724,7 +724,7 @@ SUBROUTINE CGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, IF ( LQUERY ) THEN CALL CGESVJ( 'L', 'N', 'N', N, N, A, LDA, SVA, N, V, $ LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJ = REAL( CDUMMY(1) ) + LWRK_CGESVJ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_CGEQP3, N**2+LWCON, $ N+LWRK_CGEQRF, LWRK_CGESVJ ) @@ -760,10 +760,10 @@ SUBROUTINE CGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, IF ( LQUERY ) THEN CALL CGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJ = REAL( CDUMMY(1) ) + LWRK_CGESVJ = INT( CDUMMY(1) ) CALL CUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_CUNMLQ = REAL( CDUMMY(1) ) + LWRK_CUNMLQ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_CGEQP3, LWCON, LWRK_CGESVJ, $ N+LWRK_CGELQF, 2*N+LWRK_CGEQRF, @@ -799,10 +799,10 @@ SUBROUTINE CGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, IF ( LQUERY ) THEN CALL CGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJ = REAL( CDUMMY(1) ) + LWRK_CGESVJ = INT( CDUMMY(1) ) CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_CUNMQRM = 
REAL( CDUMMY(1) ) + LWRK_CUNMQRM = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = N + MAX( LWRK_CGEQP3, LWCON, N+LWRK_CGEQRF, $ LWRK_CGESVJ, LWRK_CUNMQRM ) @@ -861,26 +861,26 @@ SUBROUTINE CGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, IF ( LQUERY ) THEN CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_CUNMQRM = REAL( CDUMMY(1) ) + LWRK_CUNMQRM = INT( CDUMMY(1) ) CALL CUNMQR( 'L', 'N', N, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_CUNMQR = REAL( CDUMMY(1) ) + LWRK_CUNMQR = INT( CDUMMY(1) ) IF ( .NOT. JRACC ) THEN CALL CGEQP3( N,N, A, LDA, IWORK, CDUMMY,CDUMMY, -1, $ RDUMMY, IERR ) - LWRK_CGEQP3N = REAL( CDUMMY(1) ) + LWRK_CGEQP3N = INT( CDUMMY(1) ) CALL CGESVJ( 'L', 'U', 'N', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJ = REAL( CDUMMY(1) ) + LWRK_CGESVJ = INT( CDUMMY(1) ) CALL CGESVJ( 'U', 'U', 'N', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJU = REAL( CDUMMY(1) ) + LWRK_CGESVJU = INT( CDUMMY(1) ) CALL CGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJV = REAL( CDUMMY(1) ) + LWRK_CGESVJV = INT( CDUMMY(1) ) CALL CUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_CUNMLQ = REAL( CDUMMY(1) ) + LWRK_CUNMLQ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_CGEQP3, N+LWCON, $ 2*N+N**2+LWCON, 2*N+LWRK_CGEQRF, @@ -909,13 +909,13 @@ SUBROUTINE CGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, ELSE CALL CGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJV = REAL( CDUMMY(1) ) + LWRK_CGESVJV = INT( CDUMMY(1) ) CALL CUNMQR( 'L', 'N', N, N, N, CDUMMY, N, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_CUNMQR = REAL( CDUMMY(1) ) + LWRK_CUNMQR = INT( CDUMMY(1) ) CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_CUNMQRM = REAL( CDUMMY(1) ) + LWRK_CUNMQRM = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_CGEQP3, 
N+LWCON, $ 2*N+LWRK_CGEQRF, 2*N+N**2, diff --git a/lapack-netlib/SRC/cgelq.c b/lapack-netlib/SRC/cgelq.c index eefd0fa26d..0d9ca6399f 100644 --- a/lapack-netlib/SRC/cgelq.c +++ b/lapack-netlib/SRC/cgelq.c @@ -681,7 +681,7 @@ static integer c__2 = 2; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgelq_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void cgelq_(integer *m, integer *n, complex *a, integer *lda, complex *t, integer *tsize, complex *work, integer *lwork, integer * info) { @@ -694,11 +694,11 @@ static integer c__2 = 2; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int cgelqt_(integer *, integer *, integer *, + extern /* Subroutine */ void cgelqt_(integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); logical lminws, lquery; integer mintsz; - extern /* Subroutine */ int claswlq_(integer *, integer *, integer *, + extern /* Subroutine */ void claswlq_(integer *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); @@ -839,15 +839,15 @@ static integer c__2 = 2; if (*info != 0) { i__1 = -(*info); xerbla_("CGELQ", &i__1, (ftnlen)5); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* The LQ Decomposition */ @@ -861,7 +861,7 @@ static integer c__2 = 2; work[1].r = (real) lwreq, work[1].i = 0.f; - return 0; + return; /* End of CGELQ */ diff --git a/lapack-netlib/SRC/cgelq2.c b/lapack-netlib/SRC/cgelq2.c index 0b2c9b1446..5dae7577f7 100644 --- a/lapack-netlib/SRC/cgelq2.c +++ b/lapack-netlib/SRC/cgelq2.c @@ -639,7 +639,7 @@ f"> */ /* > \endverbatim */ /* > */ /* 
===================================================================== */ -/* Subroutine */ int cgelq2_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void cgelq2_(integer *m, integer *n, complex *a, integer *lda, complex *tau, complex *work, integer *info) { /* System generated locals */ @@ -648,11 +648,11 @@ f"> */ /* Local variables */ integer i__, k; complex alpha; - extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex * + extern /* Subroutine */ void clarf_(char *, integer *, integer *, complex * , integer *, complex *, complex *, integer *, complex *), clarfg_(integer *, complex *, complex *, integer *, complex *), - clacgv_(integer *, complex *, integer *), xerbla_(char *, integer - *, ftnlen); + clacgv_(integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.9.0) -- */ @@ -685,7 +685,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGELQ2", &i__1, (ftnlen)6); - return 0; + return; } k = f2cmin(*m,*n); @@ -721,7 +721,7 @@ f"> */ clacgv_(&i__2, &a[i__ + i__ * a_dim1], lda); /* L10: */ } - return 0; + return; /* End of CGELQ2 */ diff --git a/lapack-netlib/SRC/cgelqf.c b/lapack-netlib/SRC/cgelqf.c index 22fe3e5adb..3aa95c1779 100644 --- a/lapack-netlib/SRC/cgelqf.c +++ b/lapack-netlib/SRC/cgelqf.c @@ -659,7 +659,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgelqf_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void cgelqf_(integer *m, integer *n, complex *a, integer *lda, complex *tau, complex *work, integer *lwork, integer *info) { /* System generated locals */ @@ -667,15 +667,16 @@ f"> */ /* Local variables */ integer i__, k, nbmin, iinfo; - extern /* Subroutine */ int cgelq2_(integer *, integer *, complex *, + extern /* Subroutine */ void cgelq2_(integer *, integer *, complex *, integer *, complex *, complex *, integer *); 
integer ib, nb; - extern /* Subroutine */ int clarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void clarfb_(char *, char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); integer nx; - extern /* Subroutine */ int clarft_(char *, char *, integer *, integer *, - complex *, integer *, complex *, complex *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clarft_(char *, char *, integer *, integer *, + complex *, integer *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; @@ -720,9 +721,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGELQF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -730,7 +731,7 @@ f"> */ k = f2cmin(*m,*n); if (k == 0) { work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } nbmin = 2; @@ -815,7 +816,7 @@ f"> */ } work[1].r = (real) iws, work[1].i = 0.f; - return 0; + return; /* End of CGELQF */ diff --git a/lapack-netlib/SRC/cgelqt.c b/lapack-netlib/SRC/cgelqt.c index 969003d430..9f05cfa347 100644 --- a/lapack-netlib/SRC/cgelqt.c +++ b/lapack-netlib/SRC/cgelqt.c @@ -630,7 +630,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgelqt_(integer *m, integer *n, integer *mb, complex *a, +/* Subroutine */ void cgelqt_(integer *m, integer *n, integer *mb, complex *a, integer *lda, complex *t, integer *ldt, complex *work, integer *info) { /* System generated locals */ @@ -638,10 +638,11 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ integer i__, k, 
iinfo, ib; - extern /* Subroutine */ int clarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void clarfb_(char *, char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, - integer *, complex *, integer *, complex *, integer *), xerbla_(char *, integer *, ftnlen), - cgelqt3_(integer *, integer *, complex *, integer *, complex *, + integer *, complex *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void cgelqt3_(integer *, integer *, complex *, integer *, complex *, integer *, integer *); @@ -681,14 +682,14 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CGELQT", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ k = f2cmin(*m,*n); if (k == 0) { - return 0; + return; } /* Blocked loop of length K */ @@ -717,7 +718,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ i__ * a_dim1], lda, &work[1], &i__5); } } - return 0; + return; /* End of CGELQT */ diff --git a/lapack-netlib/SRC/cgelqt3.c b/lapack-netlib/SRC/cgelqt3.c index 5e667dcbec..0d7b3d0516 100644 --- a/lapack-netlib/SRC/cgelqt3.c +++ b/lapack-netlib/SRC/cgelqt3.c @@ -627,7 +627,7 @@ static complex c_b1 = {1.f,0.f}; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgelqt3_(integer *m, integer *n, complex *a, integer * +/* Subroutine */ void cgelqt3_(integer *m, integer *n, complex *a, integer * lda, complex *t, integer *ldt, integer *info) { /* System generated locals */ @@ -636,16 +636,17 @@ static complex c_b1 = {1.f,0.f}; /* Local variables */ integer i__, j; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); 
integer iinfo; - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrmm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); integer i1, j1, m1, m2; - extern /* Subroutine */ int clarfg_(integer *, complex *, complex *, - integer *, complex *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clarfg_(integer *, complex *, complex *, + integer *, complex *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.8.0) -- */ @@ -679,7 +680,7 @@ static complex c_b1 = {1.f,0.f}; if (*info != 0) { i__1 = -(*info); xerbla_("CGELQT3", &i__1, (ftnlen)7); - return 0; + return; } if (*m == 1) { @@ -793,7 +794,7 @@ static complex c_b1 = {1.f,0.f}; } - return 0; + return; /* End of CGELQT3 */ diff --git a/lapack-netlib/SRC/cgels.c b/lapack-netlib/SRC/cgels.c index 0b2228b0de..6ac7240a88 100644 --- a/lapack-netlib/SRC/cgels.c +++ b/lapack-netlib/SRC/cgels.c @@ -697,7 +697,7 @@ static integer c__0 = 0; /* > \ingroup complexGEsolve */ /* ===================================================================== */ -/* Subroutine */ int cgels_(char *trans, integer *m, integer *n, integer * +/* Subroutine */ void cgels_(char *trans, integer *m, integer *n, integer * nrhs, complex *a, integer *lda, complex *b, integer *ldb, complex * work, integer *lwork, integer *info) { @@ -714,24 +714,25 @@ static integer c__0 = 0; integer wsize; real rwork[1]; integer nb; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); extern real clange_(char *, integer *, integer *, complex *, integer *, real *); integer mn; - extern /* Subroutine */ int cgelqf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgelqf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), clascl_( char *, integer *, integer *, real *, real *, integer *, integer 
* , complex *, integer *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int cgeqrf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqrf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), claset_( char *, integer *, integer *, complex *, complex *, complex *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer scllen; real bignum; - extern /* Subroutine */ int cunmlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmlq_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *), cunmqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, @@ -840,9 +841,9 @@ static integer c__0 = 0; if (*info != 0) { i__1 = -(*info); xerbla_("CGELS ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -852,7 +853,7 @@ static integer c__0 = 0; if (f2cmin(i__1,*nrhs) == 0) { i__1 = f2cmax(*m,*n); claset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); - return 0; + return; } /* Get machine parameters */ @@ -939,7 +940,7 @@ static integer c__0 = 0; , lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } scllen = *n; @@ -954,7 +955,7 @@ static integer c__0 = 0; a_offset], lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } /* B(N+1:M,1:NRHS) = ZERO */ @@ -1002,7 +1003,7 @@ static integer c__0 = 0; , lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } /* B(M+1:N,1:NRHS) = 0 */ @@ -1047,7 +1048,7 @@ static integer c__0 = 0; a_offset], lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } scllen = *m; @@ -1077,7 +1078,7 @@ static integer c__0 = 0; r__1 = (real) wsize; 
work[1].r = r__1, work[1].i = 0.f; - return 0; + return; /* End of CGELS */ diff --git a/lapack-netlib/SRC/cgelsd.c b/lapack-netlib/SRC/cgelsd.c index a7368621b5..195e7b8d22 100644 --- a/lapack-netlib/SRC/cgelsd.c +++ b/lapack-netlib/SRC/cgelsd.c @@ -744,7 +744,7 @@ f"> */ /* > Osni Marques, LBNL/NERSC, USA \n */ /* ===================================================================== */ -/* Subroutine */ int cgelsd_(integer *m, integer *n, integer *nrhs, complex * +/* Subroutine */ void cgelsd_(integer *m, integer *n, integer *nrhs, complex * a, integer *lda, complex *b, integer *ldb, real *s, real *rcond, integer *rank, complex *work, integer *lwork, real *rwork, integer * iwork, integer *info) @@ -757,13 +757,13 @@ f"> */ integer itau, nlvl, iascl, ibscl; real sfmin; integer minmn, maxmn, itaup, itauq, mnthr, nwork, ie, il; - extern /* Subroutine */ int cgebrd_(integer *, integer *, complex *, + extern /* Subroutine */ void cgebrd_(integer *, integer *, complex *, integer *, real *, real *, complex *, complex *, complex *, integer *, integer *), slabad_(real *, real *); extern real clange_(char *, integer *, integer *, complex *, integer *, real *); integer mm; - extern /* Subroutine */ int cgelqf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgelqf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), clalsd_( char *, integer *, integer *, integer *, real *, real *, complex * , integer *, real *, integer *, complex *, real *, integer *, @@ -771,13 +771,14 @@ f"> */ real *, integer *, integer *, complex *, integer *, integer *), cgeqrf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, - integer *, integer *, complex *, complex *, complex 
*, integer *), xerbla_(char *, integer *, ftnlen); + integer *, integer *, complex *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *), cunmbr_(char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *), slaset_( @@ -785,7 +786,7 @@ f"> */ complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); integer ldwork; - extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); integer liwork, minwrk, maxwrk; @@ -994,16 +995,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGELSD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible. */ if (*m == 0 || *n == 0) { *rank = 0; - return 0; + return; } /* Get machine parameters. 
*/ @@ -1299,7 +1300,7 @@ f"> */ work[1].r = (real) maxwrk, work[1].i = 0.f; iwork[1] = liwork; rwork[1] = (real) lrwork; - return 0; + return; /* End of CGELSD */ diff --git a/lapack-netlib/SRC/cgelss.c b/lapack-netlib/SRC/cgelss.c index 683e3c8394..2fe469a91e 100644 --- a/lapack-netlib/SRC/cgelss.c +++ b/lapack-netlib/SRC/cgelss.c @@ -696,7 +696,7 @@ f"> */ /* > \ingroup complexGEsolve */ /* ===================================================================== */ -/* Subroutine */ int cgelss_(integer *m, integer *n, integer *nrhs, complex * +/* Subroutine */ void cgelss_(integer *m, integer *n, integer *nrhs, complex * a, integer *lda, complex *b, integer *ldb, real *s, real *rcond, integer *rank, complex *work, integer *lwork, real *rwork, integer * info) @@ -710,41 +710,43 @@ f"> */ integer itau, lwork_cgebrd__, lwork_cgelqf__, lwork_cgeqrf__, lwork_cungbr__, lwork_cunmbr__, i__, lwork_cunmlq__, lwork_cunmqr__; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); integer iascl, ibscl; - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *); integer chunk; real sfmin; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); integer minmn, maxmn, itaup, itauq, mnthr, iwork, bl, ie, il; - extern /* Subroutine */ int cgebrd_(integer *, integer *, complex *, + extern /* Subroutine */ void cgebrd_(integer *, integer *, complex *, integer *, real *, real *, complex *, complex *, complex *, integer *, integer *), slabad_(real *, real *); extern real clange_(char *, integer *, integer *, complex 
*, integer *, real *); integer mm; - extern /* Subroutine */ int cgelqf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgelqf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), clascl_( char *, integer *, integer *, real *, real *, integer *, integer * , complex *, integer *, integer *), cgeqrf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, - integer *, integer *, complex *, complex *, complex *, integer *), xerbla_(char *, integer *, ftnlen), cbdsqr_(char *, + integer *, integer *, complex *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void cbdsqr_(char *, integer *, integer *, integer *, integer *, real *, real *, complex *, integer *, complex *, integer *, complex *, integer *, real *, integer *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); real bignum; - extern /* Subroutine */ int cungbr_(char *, integer *, integer *, integer + extern /* Subroutine */ void cungbr_(char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *), @@ -756,7 +758,7 @@ f"> */ char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); integer ldwork; - extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); integer minwrk, maxwrk; @@ 
-963,16 +965,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGELSS", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { *rank = 0; - return 0; + return; } /* Get machine parameters */ @@ -1431,7 +1433,7 @@ f"> */ } L70: work[1].r = (real) maxwrk, work[1].i = 0.f; - return 0; + return; /* End of CGELSS */ diff --git a/lapack-netlib/SRC/cgelss.f b/lapack-netlib/SRC/cgelss.f index 04defbb2e4..da6b9092f0 100644 --- a/lapack-netlib/SRC/cgelss.f +++ b/lapack-netlib/SRC/cgelss.f @@ -266,11 +266,11 @@ SUBROUTINE CGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, * * Compute space needed for CGEQRF CALL CGEQRF( M, N, A, LDA, DUM(1), DUM(1), -1, INFO ) - LWORK_CGEQRF = REAL( DUM(1) ) + LWORK_CGEQRF = INT( DUM(1) ) * Compute space needed for CUNMQR CALL CUNMQR( 'L', 'C', M, NRHS, N, A, LDA, DUM(1), B, $ LDB, DUM(1), -1, INFO ) - LWORK_CUNMQR = REAL( DUM(1) ) + LWORK_CUNMQR = INT( DUM(1) ) MM = N MAXWRK = MAX( MAXWRK, N + N*ILAENV( 1, 'CGEQRF', ' ', M, $ N, -1, -1 ) ) @@ -284,15 +284,15 @@ SUBROUTINE CGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, * Compute space needed for CGEBRD CALL CGEBRD( MM, N, A, LDA, S, S, DUM(1), DUM(1), DUM(1), $ -1, INFO ) - LWORK_CGEBRD = REAL( DUM(1) ) + LWORK_CGEBRD = INT( DUM(1) ) * Compute space needed for CUNMBR CALL CUNMBR( 'Q', 'L', 'C', MM, NRHS, N, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_CUNMBR = REAL( DUM(1) ) + LWORK_CUNMBR = INT( DUM(1) ) * Compute space needed for CUNGBR CALL CUNGBR( 'P', N, N, N, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_CUNGBR = REAL( DUM(1) ) + LWORK_CUNGBR = INT( DUM(1) ) * Compute total workspace needed MAXWRK = MAX( MAXWRK, 2*N + LWORK_CGEBRD ) MAXWRK = MAX( MAXWRK, 2*N + LWORK_CUNMBR ) @@ -310,23 +310,23 @@ SUBROUTINE CGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, * Compute space needed for CGELQF CALL CGELQF( M, N, A, LDA, DUM(1), DUM(1), $ -1, INFO ) - LWORK_CGELQF = REAL( DUM(1) ) + 
LWORK_CGELQF = INT( DUM(1) ) * Compute space needed for CGEBRD CALL CGEBRD( M, M, A, LDA, S, S, DUM(1), DUM(1), $ DUM(1), -1, INFO ) - LWORK_CGEBRD = REAL( DUM(1) ) + LWORK_CGEBRD = INT( DUM(1) ) * Compute space needed for CUNMBR CALL CUNMBR( 'Q', 'L', 'C', M, NRHS, N, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_CUNMBR = REAL( DUM(1) ) + LWORK_CUNMBR = INT( DUM(1) ) * Compute space needed for CUNGBR CALL CUNGBR( 'P', M, M, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_CUNGBR = REAL( DUM(1) ) + LWORK_CUNGBR = INT( DUM(1) ) * Compute space needed for CUNMLQ CALL CUNMLQ( 'L', 'C', N, NRHS, M, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_CUNMLQ = REAL( DUM(1) ) + LWORK_CUNMLQ = INT( DUM(1) ) * Compute total workspace needed MAXWRK = M + LWORK_CGELQF MAXWRK = MAX( MAXWRK, 3*M + M*M + LWORK_CGEBRD ) @@ -345,15 +345,15 @@ SUBROUTINE CGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, * Compute space needed for CGEBRD CALL CGEBRD( M, N, A, LDA, S, S, DUM(1), DUM(1), $ DUM(1), -1, INFO ) - LWORK_CGEBRD = REAL( DUM(1) ) + LWORK_CGEBRD = INT( DUM(1) ) * Compute space needed for CUNMBR CALL CUNMBR( 'Q', 'L', 'C', M, NRHS, M, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_CUNMBR = REAL( DUM(1) ) + LWORK_CUNMBR = INT( DUM(1) ) * Compute space needed for CUNGBR CALL CUNGBR( 'P', M, N, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_CUNGBR = REAL( DUM(1) ) + LWORK_CUNGBR = INT( DUM(1) ) MAXWRK = 2*M + LWORK_CGEBRD MAXWRK = MAX( MAXWRK, 2*M + LWORK_CUNMBR ) MAXWRK = MAX( MAXWRK, 2*M + LWORK_CUNGBR ) diff --git a/lapack-netlib/SRC/cgelst.c b/lapack-netlib/SRC/cgelst.c new file mode 100644 index 0000000000..21187e28dc --- /dev/null +++ b/lapack-netlib/SRC/cgelst.c @@ -0,0 +1,1109 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif 
+ +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + 
ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + 
+static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 
1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief CGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factori +zation with compact WY representation of Q. */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download CGELST + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE CGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* CHARACTER TRANS */ +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* COMPLEX A( LDA, * ), B( LDB, * ), WORK( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > CGELST solves overdetermined or underdetermined real linear systems */ +/* > involving an M-by-N matrix A, or its conjugate-transpose, using a QR */ +/* > or LQ factorization of A with compact WY representation of Q. */ +/* > It is assumed that A has full rank. */ +/* > */ +/* > The following options are provided: */ +/* > */ +/* > 1. 
If TRANS = 'N' and m >= n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A*X ||. */ +/* > */ +/* > 2. If TRANS = 'N' and m < n: find the minimum norm solution of */ +/* > an underdetermined system A * X = B. */ +/* > */ +/* > 3. If TRANS = 'C' and m >= n: find the minimum norm solution of */ +/* > an underdetermined system A**T * X = B. */ +/* > */ +/* > 4. If TRANS = 'C' and m < n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A**T * X ||. */ +/* > */ +/* > Several right hand side vectors b and solution vectors x can be */ +/* > handled in a single call; they are stored as the columns of the */ +/* > M-by-NRHS right hand side matrix B and the N-by-NRHS solution */ +/* > matrix X. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > = 'N': the linear system involves A; */ +/* > = 'C': the linear system involves A**H. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of right hand sides, i.e., the number of */ +/* > columns of the matrices B and X. NRHS >=0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] A */ +/* > \verbatim */ +/* > A is COMPLEX array, dimension (LDA,N) */ +/* > On entry, the M-by-N matrix A. */ +/* > On exit, */ +/* > if M >= N, A is overwritten by details of its QR */ +/* > factorization as returned by CGEQRT; */ +/* > if M < N, A is overwritten by details of its LQ */ +/* > factorization as returned by CGELQT. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is COMPLEX array, dimension (LDB,NRHS) */ +/* > On entry, the matrix B of right hand side vectors, stored */ +/* > columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS */ +/* > if TRANS = 'C'. */ +/* > On exit, if INFO = 0, B is overwritten by the solution */ +/* > vectors, stored columnwise: */ +/* > if TRANS = 'N' and m >= n, rows 1 to n of B contain the least */ +/* > squares solution vectors; the residual sum of squares for the */ +/* > solution in each column is given by the sum of squares of */ +/* > modulus of elements N+1 to M in that column; */ +/* > if TRANS = 'N' and m < n, rows 1 to N of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'C' and m >= n, rows 1 to M of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'C' and m < n, rows 1 to M of B contain the */ +/* > least squares solution vectors; the residual sum of squares */ +/* > for the solution in each column is given by the sum of */ +/* > squares of the modulus of elements M+1 to N in that column. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= MAX(1,M,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is COMPLEX array, dimension (MAX(1,LWORK)) */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The dimension of the array WORK. */ +/* > LWORK >= f2cmax( 1, MN + f2cmax( MN, NRHS ) ). */ +/* > For optimal performance, */ +/* > LWORK >= f2cmax( 1, (MN + f2cmax( MN, NRHS ))*NB ). 
*/ +/* > where MN = f2cmin(M,N) and NB is the optimum block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal size of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > > 0: if INFO = i, the i-th diagonal element of the */ +/* > triangular factor of A is zero, so that A does not have */ +/* > full rank; the least squares solution could not be */ +/* > computed. */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup complexGEsolve */ + +/* > \par Contributors: */ +/* ================== */ +/* > */ +/* > \verbatim */ +/* > */ +/* > November 2022, Igor Kozachenko, */ +/* > Computer Science Division, */ +/* > University of California, Berkeley */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* Subroutine */ void cgelst_(char *trans, integer *m, integer *n, integer * + nrhs, complex *a, integer *lda, complex *b, integer *ldb, complex * + work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3; + real r__1; + + /* Local variables */ + real anrm, bnrm; + integer brow; + logical tpsd; + integer i__, j, iascl, ibscl; + extern logical lsame_(char *, char *); + integer nbmin; + real rwork[1]; + integer lwopt, nb; + extern /* Subroutine */ void slabad_(real *, real *); + extern real clange_(char *, integer *, integer *, complex *, integer *, + real *); + integer mn; + extern /* Subroutine */ void 
clascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, complex *, integer *, integer *); + extern real slamch_(char *); + extern /* Subroutine */ void claset_(char *, integer *, integer *, complex + *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + extern /* Subroutine */ void cgelqt_(integer *, integer *, integer *, + complex *, integer *, complex *, integer *, complex *, integer *); + integer scllen; + real bignum; + extern /* Subroutine */ void cgeqrt_(integer *, integer *, integer *, + complex *, integer *, complex *, integer *, complex *, integer *); + integer mnnrhs; + real smlnum; + logical lquery; + extern /* Subroutine */ int ctrtrs_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, integer *); + extern void cgemlqt_(char *, char *, integer *, + integer *, integer *, integer *, complex *, integer *, complex *, + integer *, complex *, integer *, complex *, integer *), cgemqrt_(char *, char *, integer *, integer *, integer *, + integer *, complex *, integer *, complex *, integer *, complex *, + integer *, complex *, integer *); + + +/* -- LAPACK driver routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + mn = f2cmin(*m,*n); + lquery = *lwork == -1; + if (! 
(lsame_(trans, "N") || lsame_(trans, "C"))) { + *info = -1; + } else if (*m < 0) { + *info = -2; + } else if (*n < 0) { + *info = -3; + } else if (*nrhs < 0) { + *info = -4; + } else if (*lda < f2cmax(1,*m)) { + *info = -6; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = f2cmax(1,*m); + if (*ldb < f2cmax(i__1,*n)) { + *info = -8; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = 1, i__2 = mn + f2cmax(mn,*nrhs); + if (*lwork < f2cmax(i__1,i__2) && ! lquery) { + *info = -10; + } + } + } + +/* Figure out optimal block size and optimal workspace size */ + + if (*info == 0 || *info == -10) { + + tpsd = TRUE_; + if (lsame_(trans, "N")) { + tpsd = FALSE_; + } + + nb = ilaenv_(&c__1, "CGELST", " ", m, n, &c_n1, &c_n1, (ftnlen)6, ( + ftnlen)1); + + mnnrhs = f2cmax(mn,*nrhs); +/* Computing MAX */ + i__1 = 1, i__2 = (mn + mnnrhs) * nb; + lwopt = f2cmax(i__1,i__2); + r__1 = (real) lwopt; + work[1].r = r__1, work[1].i = 0.f; + + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("CGELST ", &i__1, 6); + return; + } else if (lquery) { + return; + } + +/* Quick return if possible */ + +/* Computing MIN */ + i__1 = f2cmin(*m,*n); + if (f2cmin(i__1,*nrhs) == 0) { + i__1 = f2cmax(*m,*n); + claset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); + r__1 = (real) lwopt; + work[1].r = r__1, work[1].i = 0.f; + return; + } + +/* *GEQRT and *GELQT routines cannot accept NB larger than f2cmin(M,N) */ + + if (nb > mn) { + nb = mn; + } + +/* Determine the block size from the supplied LWORK */ +/* ( at this stage we know that LWORK >= (minimum required workspace, */ +/* but it may be less than optimal) */ + +/* Computing MIN */ + i__1 = nb, i__2 = *lwork / (mn + mnnrhs); + nb = f2cmin(i__1,i__2); + +/* The minimum value of NB, when blocked code is used */ + +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "CGELST", " ", m, n, &c_n1, &c_n1, ( + ftnlen)6, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + if (nb < nbmin) { + nb = 1; + } + 
+/* Get machine parameters */ + + smlnum = slamch_("S") / slamch_("P"); + bignum = 1.f / smlnum; + slabad_(&smlnum, &bignum); + +/* Scale A, B if f2cmax element outside range [SMLNUM,BIGNUM] */ + + anrm = clange_("M", m, n, &a[a_offset], lda, rwork); + iascl = 0; + if (anrm > 0.f && anrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + clascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, + info); + iascl = 1; + } else if (anrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + clascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, + info); + iascl = 2; + } else if (anrm == 0.f) { + +/* Matrix all zero. Return zero solution. */ + + i__1 = f2cmax(*m,*n); + claset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); + r__1 = (real) lwopt; + work[1].r = r__1, work[1].i = 0.f; + return; + } + + brow = *m; + if (tpsd) { + brow = *n; + } + bnrm = clange_("M", &brow, nrhs, &b[b_offset], ldb, rwork); + ibscl = 0; + if (bnrm > 0.f && bnrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + clascl_("G", &c__0, &c__0, &bnrm, &smlnum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 1; + } else if (bnrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + clascl_("G", &c__0, &c__0, &bnrm, &bignum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 2; + } + + if (*m >= *n) { + +/* M > N: */ +/* Compute the blocked QR factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least N, optimally N*NB. */ + + cgeqrt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M > N, A is not transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A * X - B ||. */ + +/* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. 
*/ + + cgemqrt_("Left", "Conjugate transpose", m, nrhs, n, &nb, &a[ + a_offset], lda, &work[1], &nb, &b[b_offset], ldb, &work[ + mn * nb + 1], info); + +/* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) */ + + ctrtrs_("Upper", "No transpose", "Non-unit", n, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return; + } + + scllen = *n; + + } else { + +/* M > N, A is transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A**T * X = B. */ + +/* Compute B := inv(R**T) * B in two row blocks of B. */ + +/* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) */ + + ctrtrs_("Upper", "Conjugate transpose", "Non-unit", n, nrhs, &a[ + a_offset], lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return; + } + +/* Block 2: Zero out all rows below the N-th row in B: */ +/* B(N+1:M,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = *n + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * b_dim1; + b[i__3].r = 0.f, b[i__3].i = 0.f; + } + } + +/* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + cgemqrt_("Left", "No transpose", m, nrhs, n, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *m; + + } + + } else { + +/* M < N: */ +/* Compute the blocked LQ factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least M, optimally M*NB. */ + + cgelqt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M < N, A is not transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A * X = B. */ + +/* Compute B := inv(L) * B in two row blocks of B. 
*/ + +/* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) */ + + ctrtrs_("Lower", "No transpose", "Non-unit", m, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return; + } + +/* Block 2: Zero out all rows below the M-th row in B: */ +/* B(M+1:N,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *n; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * b_dim1; + b[i__3].r = 0.f, b[i__3].i = 0.f; + } + } + +/* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + cgemlqt_("Left", "Conjugate transpose", n, nrhs, m, &nb, &a[ + a_offset], lda, &work[1], &nb, &b[b_offset], ldb, &work[ + mn * nb + 1], info); + + scllen = *n; + + } else { + +/* M < N, A is transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A**T * X - B ||. */ + +/* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. 
*/ + + cgemlqt_("Left", "No transpose", n, nrhs, m, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) */ + + ctrtrs_("Lower", "Conjugate transpose", "Non-unit", m, nrhs, &a[ + a_offset], lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return; + } + + scllen = *m; + + } + + } + +/* Undo scaling */ + + if (iascl == 1) { + clascl_("G", &c__0, &c__0, &anrm, &smlnum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (iascl == 2) { + clascl_("G", &c__0, &c__0, &anrm, &bignum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + if (ibscl == 1) { + clascl_("G", &c__0, &c__0, &smlnum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (ibscl == 2) { + clascl_("G", &c__0, &c__0, &bignum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + + r__1 = (real) lwopt; + work[1].r = r__1, work[1].i = 0.f; + + return; + +/* End of CGELST */ + +} /* cgelst_ */ + diff --git a/lapack-netlib/SRC/cgelst.f b/lapack-netlib/SRC/cgelst.f new file mode 100644 index 0000000000..7d8e44ddf2 --- /dev/null +++ b/lapack-netlib/SRC/cgelst.f @@ -0,0 +1,533 @@ +*> \brief CGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factorization with compact WY representation of Q. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CGELST + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, +* INFO ) +* +* .. Scalar Arguments .. +* CHARACTER TRANS +* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), B( LDB, * ), WORK( * ) +* .. 
+* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CGELST solves overdetermined or underdetermined real linear systems +*> involving an M-by-N matrix A, or its conjugate-transpose, using a QR +*> or LQ factorization of A with compact WY representation of Q. +*> It is assumed that A has full rank. +*> +*> The following options are provided: +*> +*> 1. If TRANS = 'N' and m >= n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A*X ||. +*> +*> 2. If TRANS = 'N' and m < n: find the minimum norm solution of +*> an underdetermined system A * X = B. +*> +*> 3. If TRANS = 'C' and m >= n: find the minimum norm solution of +*> an underdetermined system A**T * X = B. +*> +*> 4. If TRANS = 'C' and m < n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A**T * X ||. +*> +*> Several right hand side vectors b and solution vectors x can be +*> handled in a single call; they are stored as the columns of the +*> M-by-NRHS right hand side matrix B and the N-by-NRHS solution +*> matrix X. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': the linear system involves A; +*> = 'C': the linear system involves A**H. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrices B and X. NRHS >=0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. 
+*> On exit, +*> if M >= N, A is overwritten by details of its QR +*> factorization as returned by CGEQRT; +*> if M < N, A is overwritten by details of its LQ +*> factorization as returned by CGELQT. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX array, dimension (LDB,NRHS) +*> On entry, the matrix B of right hand side vectors, stored +*> columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS +*> if TRANS = 'C'. +*> On exit, if INFO = 0, B is overwritten by the solution +*> vectors, stored columnwise: +*> if TRANS = 'N' and m >= n, rows 1 to n of B contain the least +*> squares solution vectors; the residual sum of squares for the +*> solution in each column is given by the sum of squares of +*> modulus of elements N+1 to M in that column; +*> if TRANS = 'N' and m < n, rows 1 to N of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'C' and m >= n, rows 1 to M of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'C' and m < n, rows 1 to M of B contain the +*> least squares solution vectors; the residual sum of squares +*> for the solution in each column is given by the sum of +*> squares of the modulus of elements M+1 to N in that column. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= MAX(1,M,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= max( 1, MN + max( MN, NRHS ) ). +*> For optimal performance, +*> LWORK >= max( 1, (MN + max( MN, NRHS ))*NB ). +*> where MN = min(M,N) and NB is the optimum block size. 
+*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the i-th diagonal element of the +*> triangular factor of A is zero, so that A does not have +*> full rank; the least squares solution could not be +*> computed. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexGEsolve +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2022, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, + $ INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + COMPLEX CZERO + PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, TPSD + INTEGER BROW, I, IASCL, IBSCL, J, LWOPT, MN, MNNRHS, + $ NB, NBMIN, SCLLEN + REAL ANRM, BIGNUM, BNRM, SMLNUM +* .. +* .. Local Arrays .. + REAL RWORK( 1 ) +* .. +* .. External Functions .. 
+ LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, CLANGE + EXTERNAL LSAME, ILAENV, SLAMCH, CLANGE +* .. +* .. External Subroutines .. + EXTERNAL CGELQT, CGEQRT, CGEMLQT, CGEMQRT, SLABAD, + $ CLASCL, CLASET, CTRTRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments. +* + INFO = 0 + MN = MIN( M, N ) + LQUERY = ( LWORK.EQ.-1 ) + IF( .NOT.( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'C' ) ) ) THEN + INFO = -1 + ELSE IF( M.LT.0 ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDB.LT.MAX( 1, M, N ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.MAX( 1, MN+MAX( MN, NRHS ) ) .AND. .NOT.LQUERY ) + $ THEN + INFO = -10 + END IF +* +* Figure out optimal block size and optimal workspace size +* + IF( INFO.EQ.0 .OR. INFO.EQ.-10 ) THEN +* + TPSD = .TRUE. + IF( LSAME( TRANS, 'N' ) ) + $ TPSD = .FALSE. +* + NB = ILAENV( 1, 'CGELST', ' ', M, N, -1, -1 ) +* + MNNRHS = MAX( MN, NRHS ) + LWOPT = MAX( 1, (MN+MNNRHS)*NB ) + WORK( 1 ) = REAL( LWOPT ) +* + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGELST ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + CALL CLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB ) + WORK( 1 ) = REAL( LWOPT ) + RETURN + END IF +* +* *GEQRT and *GELQT routines cannot accept NB larger than min(M,N) +* + IF( NB.GT.MN ) NB = MN +* +* Determine the block size from the supplied LWORK +* ( at this stage we know that LWORK >= (minimum required workspace, +* but it may be less than optimal) +* + NB = MIN( NB, LWORK/( MN + MNNRHS ) ) +* +* The minimum value of NB, when blocked code is used +* + NBMIN = MAX( 2, ILAENV( 2, 'CGELST', ' ', M, N, -1, -1 ) ) +* + IF( NB.LT.NBMIN ) THEN + NB = 1 + END IF +* +* Get machine parameters +* + SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) + BIGNUM = ONE / SMLNUM + CALL 
SLABAD( SMLNUM, BIGNUM ) +* +* Scale A, B if max element outside range [SMLNUM,BIGNUM] +* + ANRM = CLANGE( 'M', M, N, A, LDA, RWORK ) + IASCL = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL CLASCL( 'G', 0, 0, ANRM, SMLNUM, M, N, A, LDA, INFO ) + IASCL = 1 + ELSE IF( ANRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL CLASCL( 'G', 0, 0, ANRM, BIGNUM, M, N, A, LDA, INFO ) + IASCL = 2 + ELSE IF( ANRM.EQ.ZERO ) THEN +* +* Matrix all zero. Return zero solution. +* + CALL CLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB ) + WORK( 1 ) = REAL( LWOPT ) + RETURN + END IF +* + BROW = M + IF( TPSD ) + $ BROW = N + BNRM = CLANGE( 'M', BROW, NRHS, B, LDB, RWORK ) + IBSCL = 0 + IF( BNRM.GT.ZERO .AND. BNRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL CLASCL( 'G', 0, 0, BNRM, SMLNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 1 + ELSE IF( BNRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL CLASCL( 'G', 0, 0, BNRM, BIGNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 2 + END IF +* + IF( M.GE.N ) THEN +* +* M > N: +* Compute the blocked QR factorization of A, +* using the compact WY representation of Q, +* workspace at least N, optimally N*NB. +* + CALL CGEQRT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M > N, A is not transposed: +* Overdetermined system of equations, +* least-squares problem, min || A * X - B ||. +* +* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. 
+* + CALL CGEMQRT( 'Left', 'Conjugate transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* +* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) +* + CALL CTRTRS( 'Upper', 'No transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = N +* + ELSE +* +* M > N, A is transposed: +* Underdetermined system of equations, +* minimum norm solution of A**T * X = B. +* +* Compute B := inv(R**T) * B in two row blocks of B. +* +* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) +* + CALL CTRTRS( 'Upper', 'Conjugate transpose', 'Non-unit', + $ N, NRHS, A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the N-th row in B: +* B(N+1:M,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = N + 1, M + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL CGEMQRT( 'Left', 'No transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = M +* + END IF +* + ELSE +* +* M < N: +* Compute the blocked LQ factorization of A, +* using the compact WY representation of Q, +* workspace at least M, optimally M*NB. +* + CALL CGELQT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M < N, A is not transposed: +* Underdetermined system of equations, +* minimum norm solution of A * X = B. +* +* Compute B := inv(L) * B in two row blocks of B. 
+* +* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) +* + CALL CTRTRS( 'Lower', 'No transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the M-th row in B: +* B(M+1:N,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = M + 1, N + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL CGEMLQT( 'Left', 'Conjugate transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = N +* + ELSE +* +* M < N, A is transposed: +* Overdetermined system of equations, +* least-squares problem, min || A**T * X - B ||. +* +* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL CGEMLQT( 'Left', 'No transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1), INFO ) +* +* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) +* + CALL CTRTRS( 'Lower', 'Conjugate transpose', 'Non-unit', + $ M, NRHS, A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = M +* + END IF +* + END IF +* +* Undo scaling +* + IF( IASCL.EQ.1 ) THEN + CALL CLASCL( 'G', 0, 0, ANRM, SMLNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IASCL.EQ.2 ) THEN + CALL CLASCL( 'G', 0, 0, ANRM, BIGNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF + IF( IBSCL.EQ.1 ) THEN + CALL CLASCL( 'G', 0, 0, SMLNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IBSCL.EQ.2 ) THEN + CALL CLASCL( 'G', 0, 0, BIGNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF +* + WORK( 1 ) = REAL( LWOPT ) +* + RETURN +* +* End of CGELST +* + END diff --git a/lapack-netlib/SRC/cgelsy.c b/lapack-netlib/SRC/cgelsy.c index 3f974deaf2..a9db55ea3e 100644 --- a/lapack-netlib/SRC/cgelsy.c +++ b/lapack-netlib/SRC/cgelsy.c @@ -727,7 +727,7 @@ f"> */ /* > G. 
Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain \n */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgelsy_(integer *m, integer *n, integer *nrhs, complex * +/* Subroutine */ void cgelsy_(integer *m, integer *n, integer *nrhs, complex * a, integer *lda, complex *b, integer *ldb, integer *jpvt, real *rcond, integer *rank, complex *work, integer *lwork, real *rwork, integer * info) @@ -740,45 +740,45 @@ f"> */ /* Local variables */ real anrm, bnrm, smin, smax; integer i__, j, iascl, ibscl; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); integer ismin, ismax; complex c1, c2; - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *), claic1_(integer *, integer *, complex *, real *, complex *, complex *, real *, complex *, complex *); real wsize; complex s1, s2; - extern /* Subroutine */ int cgeqp3_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqp3_(integer *, integer *, complex *, integer *, integer *, complex *, complex *, integer *, real *, integer *); integer nb; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); extern real clange_(char *, integer *, integer *, complex *, integer *, real *); integer mn; - extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int claset_(char *, integer *, integer *, complex - *, complex *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void claset_(char *, 
integer *, integer *, complex + *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); real bignum; integer nb1, nb2, nb3, nb4; - extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); real sminpr, smaxpr, smlnum; - extern /* Subroutine */ int cunmrz_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmrz_(char *, char *, integer *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int ctzrzf_(integer *, integer *, complex *, + extern /* Subroutine */ void ctzrzf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); @@ -854,9 +854,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGELSY", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -865,7 +865,7 @@ f"> */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*nrhs) == 0) { *rank = 0; - return 0; + return; } /* Get machine parameters */ @@ -1082,7 +1082,7 @@ f"> */ q__1.r = (real) lwkopt, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; /* End of CGELSY */ diff --git a/lapack-netlib/SRC/cgemlq.c b/lapack-netlib/SRC/cgemlq.c index a11fc8ce85..527ab2fedd 100644 --- a/lapack-netlib/SRC/cgemlq.c +++ b/lapack-netlib/SRC/cgemlq.c @@ -673,7 +673,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgemlq_(char *side, char *trans, integer *m, integer *n, +/* 
Subroutine */ void cgemlq_(char *side, char *trans, integer *m, integer *n, integer *k, complex *a, integer *lda, complex *t, integer *tsize, complex *c__, integer *ldc, complex *work, integer *lwork, integer * info) @@ -684,7 +684,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ logical left, tran; - extern /* Subroutine */ int clamswlq_(char *, char *, integer *, integer * + extern /* Subroutine */ void clamswlq_(char *, char *, integer *, integer * , integer *, integer *, integer *, complex *, integer *, complex * , integer *, complex *, integer *, complex *, integer *, integer * ); @@ -693,7 +693,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ integer mb, nb, mn, lw, nblcks; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical notran, lquery; - extern /* Subroutine */ int cgemlqt_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemlqt_(char *, char *, integer *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); @@ -775,9 +775,9 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CGEMLQ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -785,7 +785,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Computing MIN */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*k) == 0) { - return 0; + return; } /* Computing MAX */ @@ -802,7 +802,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ r__1 = (real) lw; work[1].r = r__1, work[1].i = 0.f; - return 0; + return; /* End of CGEMLQ */ diff --git a/lapack-netlib/SRC/cgemlqt.c b/lapack-netlib/SRC/cgemlqt.c index 07caec98b7..ef6e1cfabe 100644 --- a/lapack-netlib/SRC/cgemlqt.c +++ b/lapack-netlib/SRC/cgemlqt.c @@ 
-658,7 +658,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doubleGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgemlqt_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void cgemlqt_(char *side, char *trans, integer *m, integer *n, integer *k, integer *mb, complex *v, integer *ldv, complex *t, integer *ldt, complex *c__, integer *ldc, complex *work, integer * info) @@ -673,9 +673,10 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern logical lsame_(char *, char *); logical right; integer ib, kf; - extern /* Subroutine */ int clarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void clarfb_(char *, char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, - integer *, complex *, integer *, complex *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, complex *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical notran; integer ldwork; @@ -737,12 +738,12 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CGEMLQT", &i__1, (ftnlen)7); - return 0; + return; } if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && notran) { @@ -803,7 +804,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of CGEMLQT */ diff --git a/lapack-netlib/SRC/cgemqr.c b/lapack-netlib/SRC/cgemqr.c index e9a0efaa84..0ec64b7965 100644 --- a/lapack-netlib/SRC/cgemqr.c +++ b/lapack-netlib/SRC/cgemqr.c @@ -675,7 +675,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgemqr_(char *side, char *trans, integer *m, 
integer *n, +/* Subroutine */ void cgemqr_(char *side, char *trans, integer *m, integer *n, integer *k, complex *a, integer *lda, complex *t, integer *tsize, complex *c__, integer *ldc, complex *work, integer *lwork, integer * info) @@ -685,7 +685,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ logical left, tran; - extern /* Subroutine */ int clamtsqr_(char *, char *, integer *, integer * + extern /* Subroutine */ void clamtsqr_(char *, char *, integer *, integer * , integer *, integer *, integer *, complex *, integer *, complex * , integer *, complex *, integer *, complex *, integer *, integer * ); @@ -694,7 +694,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ integer mb, nb, mn, lw, nblcks; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical notran, lquery; - extern /* Subroutine */ int cgemqrt_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemqrt_(char *, char *, integer *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); @@ -775,9 +775,9 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CGEMQR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -785,7 +785,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Computing MIN */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*k) == 0) { - return 0; + return; } /* Computing MAX */ @@ -801,7 +801,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ work[1].r = (real) lw, work[1].i = 0.f; - return 0; + return; /* End of CGEMQR */ diff --git a/lapack-netlib/SRC/cgemqrt.c b/lapack-netlib/SRC/cgemqrt.c index 0771c85087..f374c4a642 100644 --- a/lapack-netlib/SRC/cgemqrt.c +++ b/lapack-netlib/SRC/cgemqrt.c 
@@ -676,7 +676,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgemqrt_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void cgemqrt_(char *side, char *trans, integer *m, integer *n, integer *k, integer *nb, complex *v, integer *ldv, complex *t, integer *ldt, complex *c__, integer *ldc, complex *work, integer * info) @@ -691,9 +691,10 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern logical lsame_(char *, char *); logical right; integer ib, kf; - extern /* Subroutine */ int clarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void clarfb_(char *, char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, - integer *, complex *, integer *, complex *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, complex *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical notran; integer ldwork; @@ -757,12 +758,12 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CGEMQRT", &i__1, (ftnlen)7); - return 0; + return; } if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && tran) { @@ -823,7 +824,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of CGEMQRT */ diff --git a/lapack-netlib/SRC/cgeql2.c b/lapack-netlib/SRC/cgeql2.c index 6e8095ffe7..240c0dc956 100644 --- a/lapack-netlib/SRC/cgeql2.c +++ b/lapack-netlib/SRC/cgeql2.c @@ -637,7 +637,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgeql2_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void cgeql2_(integer *m, integer *n, complex 
*a, integer *lda, complex *tau, complex *work, integer *info) { /* System generated locals */ @@ -647,10 +647,10 @@ f"> */ /* Local variables */ integer i__, k; complex alpha; - extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex * + extern /* Subroutine */ void clarf_(char *, integer *, integer *, complex * , integer *, complex *, complex *, integer *, complex *), - clarfg_(integer *, complex *, complex *, integer *, complex *), - xerbla_(char *, integer *, ftnlen); + clarfg_(integer *, complex *, complex *, integer *, complex *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -683,7 +683,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGEQL2", &i__1, (ftnlen)6); - return 0; + return; } k = f2cmin(*m,*n); @@ -712,7 +712,7 @@ f"> */ a[i__1].r = alpha.r, a[i__1].i = alpha.i; /* L10: */ } - return 0; + return; /* End of CGEQL2 */ diff --git a/lapack-netlib/SRC/cgeqlf.c b/lapack-netlib/SRC/cgeqlf.c index 7ca4164792..08a18a3353 100644 --- a/lapack-netlib/SRC/cgeqlf.c +++ b/lapack-netlib/SRC/cgeqlf.c @@ -654,7 +654,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgeqlf_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void cgeqlf_(integer *m, integer *n, complex *a, integer *lda, complex *tau, complex *work, integer *lwork, integer *info) { /* System generated locals */ @@ -662,15 +662,16 @@ f"> */ /* Local variables */ integer i__, k, nbmin, iinfo; - extern /* Subroutine */ int cgeql2_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeql2_(integer *, integer *, complex *, integer *, complex *, complex *, integer *); integer ib, nb, ki, kk; - extern /* Subroutine */ int clarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void clarfb_(char *, char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, 
complex *, integer *, complex *, integer *); integer mu, nu, nx; - extern /* Subroutine */ int clarft_(char *, char *, integer *, integer *, - complex *, integer *, complex *, complex *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clarft_(char *, char *, integer *, integer *, + complex *, integer *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; @@ -726,15 +727,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGEQLF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (k == 0) { - return 0; + return; } nbmin = 2; @@ -826,7 +827,7 @@ f"> */ } work[1].r = (real) iws, work[1].i = 0.f; - return 0; + return; /* End of CGEQLF */ diff --git a/lapack-netlib/SRC/cgeqp3.c b/lapack-netlib/SRC/cgeqp3.c index 0e93eae682..8376d54dce 100644 --- a/lapack-netlib/SRC/cgeqp3.c +++ b/lapack-netlib/SRC/cgeqp3.c @@ -674,7 +674,7 @@ f"> */ /* > X. 
Sun, Computer Science Dept., Duke University, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgeqp3_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void cgeqp3_(integer *m, integer *n, complex *a, integer *lda, integer *jpvt, complex *tau, complex *work, integer *lwork, real * rwork, integer *info) { @@ -684,24 +684,24 @@ f"> */ /* Local variables */ integer nfxd, j, nbmin; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer minmn, minws; - extern /* Subroutine */ int claqp2_(integer *, integer *, integer *, + extern /* Subroutine */ void claqp2_(integer *, integer *, integer *, complex *, integer *, integer *, complex *, real *, real *, complex *); extern real scnrm2_(integer *, complex *, integer *); integer jb, na, nb, sm, sn, nx; - extern /* Subroutine */ int cgeqrf_(integer *, integer *, complex *, - integer *, complex *, complex *, integer *, integer *), xerbla_( - char *, integer *, ftnlen); + extern /* Subroutine */ void cgeqrf_(integer *, integer *, complex *, + integer *, complex *, complex *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int claqps_(integer *, integer *, integer *, + extern /* Subroutine */ void claqps_(integer *, integer *, integer *, integer *, integer *, complex *, integer *, integer *, complex *, real *, real *, complex *, complex *, integer *); integer topbmn, sminmn; - extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); integer lwkopt; @@ -763,9 +763,9 @@ f"> */ if (*info 
!= 0) { i__1 = -(*info); xerbla_("CGEQP3", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Move initial columns up front. */ @@ -921,7 +921,7 @@ f"> */ q__1.r = (real) lwkopt, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; /* End of CGEQP3 */ diff --git a/lapack-netlib/SRC/cgeqr.c b/lapack-netlib/SRC/cgeqr.c index 6398096ecb..2389e358aa 100644 --- a/lapack-netlib/SRC/cgeqr.c +++ b/lapack-netlib/SRC/cgeqr.c @@ -683,7 +683,7 @@ static integer c__2 = 2; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgeqr_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void cgeqr_(integer *m, integer *n, complex *a, integer *lda, complex *t, integer *tsize, complex *work, integer *lwork, integer * info) { @@ -696,11 +696,11 @@ static integer c__2 = 2; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int cgeqrt_(integer *, integer *, integer *, + extern /* Subroutine */ void cgeqrt_(integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); logical lminws, lquery; integer mintsz; - extern /* Subroutine */ int clatsqr_(integer *, integer *, integer *, + extern /* Subroutine */ void clatsqr_(integer *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); @@ -829,15 +829,15 @@ static integer c__2 = 2; if (*info != 0) { i__1 = -(*info); xerbla_("CGEQR", &i__1, (ftnlen)5); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* The QR Decomposition */ @@ -854,7 +854,7 @@ static integer c__2 = 2; i__1 = f2cmax(i__2,i__3); work[1].r = (real) i__1, work[1].i = 0.f; - return 0; + return; 
/* End of CGEQR */ diff --git a/lapack-netlib/SRC/cgeqr2.c b/lapack-netlib/SRC/cgeqr2.c index ffffeeeb0a..a3adcb7a86 100644 --- a/lapack-netlib/SRC/cgeqr2.c +++ b/lapack-netlib/SRC/cgeqr2.c @@ -644,7 +644,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgeqr2_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void cgeqr2_(integer *m, integer *n, complex *a, integer *lda, complex *tau, complex *work, integer *info) { /* System generated locals */ @@ -654,10 +654,10 @@ f"> */ /* Local variables */ integer i__, k; complex alpha; - extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex * + extern /* Subroutine */ void clarf_(char *, integer *, integer *, complex * , integer *, complex *, complex *, integer *, complex *), - clarfg_(integer *, complex *, complex *, integer *, complex *), - xerbla_(char *, integer *, ftnlen); + clarfg_(integer *, complex *, complex *, integer *, complex *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.9.0) -- */ @@ -690,7 +690,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGEQR2", &i__1, (ftnlen)6); - return 0; + return; } k = f2cmin(*m,*n); @@ -723,7 +723,7 @@ f"> */ } /* L10: */ } - return 0; + return; /* End of CGEQR2 */ diff --git a/lapack-netlib/SRC/cgeqr2p.c b/lapack-netlib/SRC/cgeqr2p.c index 960ffc5900..cdefd0c9ba 100644 --- a/lapack-netlib/SRC/cgeqr2p.c +++ b/lapack-netlib/SRC/cgeqr2p.c @@ -648,7 +648,7 @@ l elements using an unblocked algorithm. */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgeqr2p_(integer *m, integer *n, complex *a, integer * +/* Subroutine */ void cgeqr2p_(integer *m, integer *n, complex *a, integer * lda, complex *tau, complex *work, integer *info) { /* System generated locals */ @@ -658,9 +658,10 @@ l elements using an unblocked algorithm. 
*/ /* Local variables */ integer i__, k; complex alpha; - extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex * - , integer *, complex *, complex *, integer *, complex *), - xerbla_(char *, integer *, ftnlen), clarfgp_(integer *, complex *, + extern /* Subroutine */ void clarf_(char *, integer *, integer *, complex * + , integer *, complex *, complex *, integer *, complex *); + extern int xerbla_(char *, integer *, ftnlen); + extern void clarfgp_(integer *, complex *, complex *, integer *, complex *); @@ -694,7 +695,7 @@ l elements using an unblocked algorithm. */ if (*info != 0) { i__1 = -(*info); xerbla_("CGEQR2P", &i__1, (ftnlen)7); - return 0; + return; } k = f2cmin(*m,*n); @@ -727,7 +728,7 @@ l elements using an unblocked algorithm. */ } /* L10: */ } - return 0; + return; /* End of CGEQR2P */ diff --git a/lapack-netlib/SRC/cgeqrf.c b/lapack-netlib/SRC/cgeqrf.c index 0fd9debac4..42aa20b317 100644 --- a/lapack-netlib/SRC/cgeqrf.c +++ b/lapack-netlib/SRC/cgeqrf.c @@ -661,7 +661,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgeqrf_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void cgeqrf_(integer *m, integer *n, complex *a, integer *lda, complex *tau, complex *work, integer *lwork, integer *info) { /* System generated locals */ @@ -669,15 +669,16 @@ f"> */ /* Local variables */ integer i__, k, nbmin, iinfo; - extern /* Subroutine */ int cgeqr2_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqr2_(integer *, integer *, complex *, integer *, complex *, complex *, integer *); integer ib, nb; - extern /* Subroutine */ int clarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void clarfb_(char *, char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); integer nx; - extern /* Subroutine */ int clarft_(char *, char *, 
integer *, integer *, - complex *, integer *, complex *, complex *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clarft_(char *, char *, integer *, integer *, + complex *, integer *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; @@ -722,9 +723,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGEQRF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -732,7 +733,7 @@ f"> */ k = f2cmin(*m,*n); if (k == 0) { work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } nbmin = 2; @@ -817,7 +818,7 @@ f"> */ } work[1].r = (real) iws, work[1].i = 0.f; - return 0; + return; /* End of CGEQRF */ diff --git a/lapack-netlib/SRC/cgeqrfp.c b/lapack-netlib/SRC/cgeqrfp.c index 5079824c84..3f577aa745 100644 --- a/lapack-netlib/SRC/cgeqrfp.c +++ b/lapack-netlib/SRC/cgeqrfp.c @@ -665,7 +665,7 @@ static integer c__2 = 2; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgeqrfp_(integer *m, integer *n, complex *a, integer * +/* Subroutine */ void cgeqrfp_(integer *m, integer *n, complex *a, integer * lda, complex *tau, complex *work, integer *lwork, integer *info) { /* System generated locals */ @@ -673,17 +673,18 @@ static integer c__2 = 2; /* Local variables */ integer i__, k, nbmin, iinfo, ib, nb; - extern /* Subroutine */ int clarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void clarfb_(char *, char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); integer nx; - extern /* Subroutine */ int clarft_(char *, char *, integer *, integer *, - complex *, integer *, complex *, complex *, integer *), xerbla_(char *, integer *, 
ftnlen); + extern /* Subroutine */ void clarft_(char *, char *, integer *, integer *, + complex *, integer *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; logical lquery; - extern /* Subroutine */ int cgeqr2p_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqr2p_(integer *, integer *, complex *, integer *, complex *, complex *, integer *); integer iws; @@ -725,9 +726,9 @@ static integer c__2 = 2; if (*info != 0) { i__1 = -(*info); xerbla_("CGEQRFP", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -735,7 +736,7 @@ static integer c__2 = 2; k = f2cmin(*m,*n); if (k == 0) { work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } nbmin = 2; @@ -820,7 +821,7 @@ static integer c__2 = 2; } work[1].r = (real) iws, work[1].i = 0.f; - return 0; + return; /* End of CGEQRFP */ diff --git a/lapack-netlib/SRC/cgeqrt.c b/lapack-netlib/SRC/cgeqrt.c index 3fdeabfd0f..362ae322d2 100644 --- a/lapack-netlib/SRC/cgeqrt.c +++ b/lapack-netlib/SRC/cgeqrt.c @@ -650,7 +650,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgeqrt_(integer *m, integer *n, integer *nb, complex *a, +/* Subroutine */ void cgeqrt_(integer *m, integer *n, integer *nb, complex *a, integer *lda, complex *t, integer *ldt, complex *work, integer *info) { /* System generated locals */ @@ -658,10 +658,11 @@ f"> */ /* Local variables */ integer i__, k, iinfo, ib; - extern /* Subroutine */ int clarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void clarfb_(char *, char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, - integer *, complex *, integer *, complex *, integer *), xerbla_(char *, integer *, ftnlen), - 
cgeqrt2_(integer *, integer *, complex *, integer *, complex *, + integer *, complex *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void cgeqrt2_(integer *, integer *, complex *, integer *, complex *, integer *, integer *), cgeqrt3_(integer *, integer *, complex *, integer *, complex *, integer *, integer *); @@ -702,14 +703,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGEQRT", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ k = f2cmin(*m,*n); if (k == 0) { - return 0; + return; } /* Blocked loop of length K */ @@ -744,7 +745,7 @@ f"> */ ib) * a_dim1], lda, &work[1], &i__5); } } - return 0; + return; /* End of CGEQRT */ diff --git a/lapack-netlib/SRC/cgeqrt2.c b/lapack-netlib/SRC/cgeqrt2.c index 3a34ab3853..fb7ceb441e 100644 --- a/lapack-netlib/SRC/cgeqrt2.c +++ b/lapack-netlib/SRC/cgeqrt2.c @@ -643,7 +643,7 @@ presentation of Q. */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgeqrt2_(integer *m, integer *n, complex *a, integer * +/* Subroutine */ void cgeqrt2_(integer *m, integer *n, complex *a, integer * lda, complex *t, integer *ldt, integer *info) { /* System generated locals */ @@ -652,14 +652,15 @@ presentation of Q. 
*/ /* Local variables */ integer i__, k; - extern /* Subroutine */ int cgerc_(integer *, integer *, complex *, + extern /* Subroutine */ void cgerc_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *); complex alpha; - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *), ctrmv_(char *, char *, char *, integer *, complex *, integer *, complex *, integer *), clarfg_(integer *, complex *, complex *, integer *, - complex *), xerbla_(char *, integer *, ftnlen); + complex *); + extern int xerbla_(char *, integer *, ftnlen); complex aii; @@ -696,7 +697,7 @@ presentation of Q. */ if (*info != 0) { i__1 = -(*info); xerbla_("CGEQRT2", &i__1, (ftnlen)7); - return 0; + return; } k = f2cmin(*m,*n); @@ -778,6 +779,6 @@ presentation of Q. */ /* End of CGEQRT2 */ - return 0; + return; } /* cgeqrt2_ */ diff --git a/lapack-netlib/SRC/cgeqrt3.c b/lapack-netlib/SRC/cgeqrt3.c index c2aea5dc53..8471a6f059 100644 --- a/lapack-netlib/SRC/cgeqrt3.c +++ b/lapack-netlib/SRC/cgeqrt3.c @@ -647,7 +647,7 @@ compact WY representation of Q. */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgeqrt3_(integer *m, integer *n, complex *a, integer * +/* Subroutine */ void cgeqrt3_(integer *m, integer *n, complex *a, integer * lda, complex *t, integer *ldt, integer *info) { /* System generated locals */ @@ -656,16 +656,17 @@ compact WY representation of Q. 
*/ /* Local variables */ integer i__, j; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); integer iinfo; - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrmm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); integer i1, j1, n1, n2; - extern /* Subroutine */ int clarfg_(integer *, complex *, complex *, - integer *, complex *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clarfg_(integer *, complex *, complex *, + integer *, complex *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -699,7 +700,7 @@ compact WY representation of Q. */ if (*info != 0) { i__1 = -(*info); xerbla_("CGEQRT3", &i__1, (ftnlen)7); - return 0; + return; } if (*n == 1) { @@ -809,7 +810,7 @@ compact WY representation of Q. 
*/ } - return 0; + return; /* End of CGEQRT3 */ diff --git a/lapack-netlib/SRC/cgerfs.c b/lapack-netlib/SRC/cgerfs.c index a4ce88fade..91b60c677e 100644 --- a/lapack-netlib/SRC/cgerfs.c +++ b/lapack-netlib/SRC/cgerfs.c @@ -699,7 +699,7 @@ f"> */ /* > \ingroup complexGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgerfs_(char *trans, integer *n, integer *nrhs, complex * +/* Subroutine */ void cgerfs_(char *trans, integer *n, integer *nrhs, complex * a, integer *lda, complex *af, integer *ldaf, integer *ipiv, complex * b, integer *ldb, complex *x, integer *ldx, real *ferr, real *berr, complex *work, real *rwork, integer *info) @@ -716,21 +716,22 @@ f"> */ integer i__, j, k; real s; extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *); integer isave[3]; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); integer count; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); real xk; extern real slamch_(char *); integer nz; real safmin; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), cgetrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int cgetrs_( char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); logical notran; @@ -790,7 +791,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGERFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -802,7 +803,7 @@ 
f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } if (notran) { @@ -1028,7 +1029,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of CGERFS */ diff --git a/lapack-netlib/SRC/cgerq2.c b/lapack-netlib/SRC/cgerq2.c index e3144ff139..05e6d6c9cd 100644 --- a/lapack-netlib/SRC/cgerq2.c +++ b/lapack-netlib/SRC/cgerq2.c @@ -633,7 +633,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgerq2_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void cgerq2_(integer *m, integer *n, complex *a, integer *lda, complex *tau, complex *work, integer *info) { /* System generated locals */ @@ -642,11 +642,11 @@ f"> */ /* Local variables */ integer i__, k; complex alpha; - extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex * + extern /* Subroutine */ void clarf_(char *, integer *, integer *, complex * , integer *, complex *, complex *, integer *, complex *), clarfg_(integer *, complex *, complex *, integer *, complex *), - clacgv_(integer *, complex *, integer *), xerbla_(char *, integer - *, ftnlen); + clacgv_(integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -679,7 +679,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGERQ2", &i__1, (ftnlen)6); - return 0; + return; } k = f2cmin(*m,*n); @@ -710,7 +710,7 @@ f"> */ clacgv_(&i__1, &a[*m - k + i__ + a_dim1], lda); /* L10: */ } - return 0; + return; /* End of CGERQ2 */ diff --git a/lapack-netlib/SRC/cgerqf.c b/lapack-netlib/SRC/cgerqf.c index 86a08e8db8..f13a5b245b 100644 --- a/lapack-netlib/SRC/cgerqf.c +++ b/lapack-netlib/SRC/cgerqf.c @@ -654,7 +654,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgerqf_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void cgerqf_(integer *m, integer *n, 
complex *a, integer *lda, complex *tau, complex *work, integer *lwork, integer *info) { /* System generated locals */ @@ -662,15 +662,16 @@ f"> */ /* Local variables */ integer i__, k, nbmin, iinfo; - extern /* Subroutine */ int cgerq2_(integer *, integer *, complex *, + extern /* Subroutine */ void cgerq2_(integer *, integer *, complex *, integer *, complex *, complex *, integer *); integer ib, nb, ki, kk; - extern /* Subroutine */ int clarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void clarfb_(char *, char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); integer mu, nu, nx; - extern /* Subroutine */ int clarft_(char *, char *, integer *, integer *, - complex *, integer *, complex *, complex *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clarft_(char *, char *, integer *, integer *, + complex *, integer *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; @@ -726,15 +727,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGERQF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (k == 0) { - return 0; + return; } nbmin = 2; @@ -825,7 +826,7 @@ f"> */ } work[1].r = (real) iws, work[1].i = 0.f; - return 0; + return; /* End of CGERQF */ diff --git a/lapack-netlib/SRC/cgesc2.c b/lapack-netlib/SRC/cgesc2.c index dacfda0d81..6bde5e427f 100644 --- a/lapack-netlib/SRC/cgesc2.c +++ b/lapack-netlib/SRC/cgesc2.c @@ -631,7 +631,7 @@ f"> */ /* > Umea University, S-901 87 Umea, Sweden. 
*/ /* ===================================================================== */ -/* Subroutine */ int cgesc2_(integer *n, complex *a, integer *lda, complex * +/* Subroutine */ void cgesc2_(integer *n, complex *a, integer *lda, complex * rhs, integer *ipiv, integer *jpiv, real *scale) { /* System generated locals */ @@ -642,7 +642,7 @@ f"> */ /* Local variables */ complex temp; integer i__, j; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), slabad_(real *, real *); extern integer icamax_(integer *, complex *, integer *); extern real slamch_(char *); @@ -745,7 +745,7 @@ f"> */ i__1 = *n - 1; claswp_(&c__1, &rhs[1], lda, &c__1, &i__1, &jpiv[1], &c_n1); - return 0; + return; /* End of CGESC2 */ diff --git a/lapack-netlib/SRC/cgesdd.c b/lapack-netlib/SRC/cgesdd.c index ad9c84c17d..3ff545231e 100644 --- a/lapack-netlib/SRC/cgesdd.c +++ b/lapack-netlib/SRC/cgesdd.c @@ -742,7 +742,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgesdd_(char *jobz, integer *m, integer *n, complex *a, +/* Subroutine */ void cgesdd_(char *jobz, integer *m, integer *n, complex *a, integer *lda, real *s, complex *u, integer *ldu, complex *vt, integer *ldvt, complex *work, integer *lwork, real *rwork, integer *iwork, integer *info) @@ -760,26 +760,26 @@ f"> */ real anrm; integer ierr, itau, lwork_cunmbr_qln_mm__, lwork_cunmbr_qln_mn__, lwork_cunmbr_qln_nn__, idum[1], irvt, i__; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); integer chunk, minmn, wrkbl, itaup, itauq; logical wntqa; integer nwork; - extern /* Subroutine */ int clacp2_(char *, integer *, 
integer *, real *, + extern /* Subroutine */ void clacp2_(char *, integer *, integer *, real *, integer *, complex *, integer *); logical wntqn, wntqo, wntqs; integer mnthr1, mnthr2, ie, lwork_cungbr_p_mn__, il, lwork_cungbr_p_nn__, lwork_cungbr_q_mn__, lwork_cungbr_q_mm__; - extern /* Subroutine */ int cgebrd_(integer *, integer *, complex *, + extern /* Subroutine */ void cgebrd_(integer *, integer *, complex *, integer *, real *, real *, complex *, complex *, complex *, integer *, integer *); integer ir; extern real clange_(char *, integer *, integer *, complex *, integer *, real *); integer iu; - extern /* Subroutine */ int cgelqf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgelqf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), clacrm_( integer *, integer *, complex *, integer *, real *, integer *, complex *, integer *, real *), clarcm_(integer *, integer *, real @@ -790,13 +790,15 @@ f"> */ integer *, real *, integer *, real *, integer *, integer *), cgeqrf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, - integer *, integer *, complex *, complex *, complex *, integer *), xerbla_(char *, integer *, ftnlen), cungbr_(char *, + integer *, integer *, complex *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void cungbr_(char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *), cunmbr_(char *, char *, char *, 
integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *), cunglq_( @@ -804,7 +806,7 @@ f"> */ complex *, integer *, integer *); extern logical sisnan_(real *); integer ldwrkl; - extern /* Subroutine */ int cungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void cungqr_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); integer ldwrkr, minwrk, ldwrku, maxwrk, ldwkvt; real smlnum; @@ -1267,15 +1269,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGESDD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1289,7 +1291,7 @@ f"> */ anrm = clange_("M", m, n, &a[a_offset], lda, dum); if (sisnan_(&anrm)) { *info = -4; - return 0; + return; } iscl = 0; if (anrm > 0.f && anrm < smlnum) { @@ -2951,7 +2953,7 @@ f"> */ work[1].r = (real) maxwrk, work[1].i = 0.f; - return 0; + return; /* End of CGESDD */ diff --git a/lapack-netlib/SRC/cgesv.c b/lapack-netlib/SRC/cgesv.c index d3a277b0c0..f74542d125 100644 --- a/lapack-netlib/SRC/cgesv.c +++ b/lapack-netlib/SRC/cgesv.c @@ -639,8 +639,10 @@ iver) */ integer a_dim1, a_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int cgetrf_(integer *, integer *, complex *, - integer *, integer *, integer *), xerbla_(char *, integer *, ftnlen), cgetrs_(char *, integer *, integer *, complex *, integer + extern /* Subroutine */ void cgetrf_(integer *, integer *, complex *, + integer *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void cgetrs_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); diff --git a/lapack-netlib/SRC/cgesvd.c b/lapack-netlib/SRC/cgesvd.c index 752cc375b4..67a2c00305 100644 --- a/lapack-netlib/SRC/cgesvd.c +++ 
b/lapack-netlib/SRC/cgesvd.c @@ -732,7 +732,7 @@ f"> */ /* > \ingroup complexGEsing */ /* ===================================================================== */ -/* Subroutine */ int cgesvd_(char *jobu, char *jobvt, integer *m, integer *n, +/* Subroutine */ void cgesvd_(char *jobu, char *jobvt, integer *m, integer *n, complex *a, integer *lda, real *s, complex *u, integer *ldu, complex * vt, integer *ldvt, complex *work, integer *lwork, real *rwork, integer *info) @@ -749,40 +749,41 @@ f"> */ real anrm; integer ierr, itau, ncvt, nrvt, lwork_cgebrd__, lwork_cgelqf__, lwork_cgeqrf__, i__; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); integer chunk, minmn, wrkbl, itaup, itauq, mnthr, iwork; logical wntua, wntva, wntun, wntuo, wntvn, wntvo, wntus, wntvs; integer ie; - extern /* Subroutine */ int cgebrd_(integer *, integer *, complex *, + extern /* Subroutine */ void cgebrd_(integer *, integer *, complex *, integer *, real *, real *, complex *, complex *, complex *, integer *, integer *); extern real clange_(char *, integer *, integer *, complex *, integer *, real *); integer ir, iu; - extern /* Subroutine */ int cgelqf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgelqf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), clascl_( char *, integer *, integer *, real *, real *, integer *, integer * , complex *, integer *, integer *), cgeqrf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, integer *, 
integer *, complex *, complex *, complex *, integer *), cbdsqr_(char *, integer *, integer *, integer *, integer *, real *, real *, complex *, integer *, complex *, integer *, - complex *, integer *, real *, integer *), xerbla_(char *, - integer *, ftnlen), cungbr_(char *, integer *, integer *, integer + complex *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void cungbr_(char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int cunmbr_(char *, char *, char *, integer *, + extern /* Subroutine */ void cunmbr_(char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *), cunglq_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), cungqr_( @@ -1334,15 +1335,15 @@ f"> */ if (*info != 0) { i__2 = -(*info); xerbla_("CGESVD", &i__2, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Get machine constants */ @@ -4718,7 +4719,7 @@ f"> */ work[1].r = (real) maxwrk, work[1].i = 0.f; - return 0; + return; /* End of CGESVD */ diff --git a/lapack-netlib/SRC/cgesvdq.c b/lapack-netlib/SRC/cgesvdq.c index d5cf58f823..77f1b45f30 100644 --- a/lapack-netlib/SRC/cgesvdq.c +++ b/lapack-netlib/SRC/cgesvdq.c @@ -932,7 +932,7 @@ static logical c_false = FALSE_; /* > \ingroup complexGEsing */ /* ===================================================================== */ -/* Subroutine */ int 
cgesvdq_(char *joba, char *jobp, char *jobr, char *jobu, +/* Subroutine */ void cgesvdq_(char *joba, char *jobp, char *jobr, char *jobu, char *jobv, integer *m, integer *n, complex *a, integer *lda, real *s, complex *u, integer *ldu, complex *v, integer *ldv, integer *numrank, integer *iwork, integer *liwork, complex *cwork, integer *lcwork, @@ -965,7 +965,7 @@ static logical c_false = FALSE_; logical dntwu, dntwv, wntuf, wntva; integer lwunq; logical wntur, wntus, wntvr; - extern /* Subroutine */ int cgeqp3_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqp3_(integer *, integer *, complex *, integer *, integer *, complex *, complex *, integer *, real *, integer *); extern real scnrm2_(integer *, complex *, integer *); @@ -973,21 +973,22 @@ static logical c_false = FALSE_; extern real clange_(char *, integer *, integer *, complex *, integer *, real *); integer nr; - extern /* Subroutine */ int cgelqf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgelqf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), clascl_( char *, integer *, integer *, real *, real *, integer *, integer * , complex *, integer *, integer *); real sconda; - extern /* Subroutine */ int cgeqrf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqrf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), csscal_( integer *, real *, complex *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int cgesvd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgesvd_(char *, char *, integer *, integer *, complex *, integer *, real *, complex *, integer *, complex *, integer *, complex *, integer *, real *, integer *), clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, integer *, - integer *, complex *, complex *, complex *, integer *), - xerbla_(char *, integer *, ftnlen), clapmt_(logical *, 
integer *, + integer *, complex *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void clapmt_(logical *, integer *, integer *, complex *, integer *, integer *), slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real * , integer *, integer *), cpocon_(char *, integer *, @@ -995,10 +996,11 @@ static logical c_false = FALSE_; ); extern integer isamax_(integer *, real *, integer *); extern /* Subroutine */ int claswp_(integer *, complex *, integer *, - integer *, integer *, integer *, integer *), slaset_(char *, + integer *, integer *, integer *, integer *); + extern void slaset_(char *, integer *, integer *, real *, real *, real *, integer *); complex cdummy[1]; - extern /* Subroutine */ int cunmlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmlq_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *), cunmqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, @@ -1380,7 +1382,7 @@ static logical c_false = FALSE_; if (*info != 0) { i__1 = -(*info); xerbla_("CGESVDQ", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { /* Return optimal workspace */ @@ -1389,13 +1391,13 @@ static logical c_false = FALSE_; cwork[1].r = (real) optwrk, cwork[1].i = 0.f; cwork[2].r = (real) minwrk, cwork[2].i = 0.f; rwork[1] = (real) rminwrk; - return 0; + return; } /* Quick return if the matrix is void. 
*/ if (*m == 0 || *n == 0) { - return 0; + return; } big = slamch_("O"); @@ -1412,7 +1414,7 @@ static logical c_false = FALSE_; *info = -8; i__2 = -(*info); xerbla_("CGESVDQ", &i__2, (ftnlen)7); - return 0; + return; } /* L1904: */ } @@ -1466,7 +1468,7 @@ static logical c_false = FALSE_; rwork[1] = -1.f; } rwork[2] = -1.f; - return 0; + return; } if (rwork[1] > big / sqrt((real) (*m))) { @@ -1490,7 +1492,7 @@ static logical c_false = FALSE_; *info = -8; i__1 = -(*info); xerbla_("CGESVDQ", &i__1, (ftnlen)7); - return 0; + return; } if (rtmp > big / sqrt((real) (*m))) { /* matrix by 1/sqrt(M) if too large entry detected */ @@ -2317,7 +2319,7 @@ static logical c_false = FALSE_; /* full row rank triangular (trapezoidal) factor of A. */ *numrank = nr; - return 0; + return; /* End of CGESVDQ */ diff --git a/lapack-netlib/SRC/cgesvdx.c b/lapack-netlib/SRC/cgesvdx.c index 7df4cf2468..0274709a3f 100644 --- a/lapack-netlib/SRC/cgesvdx.c +++ b/lapack-netlib/SRC/cgesvdx.c @@ -786,7 +786,7 @@ static integer c_n1 = -1; /* > \ingroup complexGEsing */ /* ===================================================================== */ -/* Subroutine */ int cgesvdx_(char *jobu, char *jobvt, char *range, integer * +/* Subroutine */ void cgesvdx_(char *jobu, char *jobvt, char *range, integer * m, integer *n, complex *a, integer *lda, real *vl, real *vu, integer * il, integer *iu, integer *ns, real *s, complex *u, integer *ldu, complex *vt, integer *ldvt, complex *work, integer *lwork, real * @@ -813,43 +813,44 @@ static integer c_n1 = -1; integer iltgk, itemp, minmn, itaup, itauq, iutgk, itgkz, mnthr; logical wantu; integer id, ie; - extern /* Subroutine */ int cgebrd_(integer *, integer *, complex *, + extern /* Subroutine */ void cgebrd_(integer *, integer *, complex *, integer *, real *, real *, complex *, complex *, complex *, integer *, integer *); extern real clange_(char *, integer *, integer *, complex *, integer *, real *); - extern /* Subroutine */ int cgelqf_(integer *, integer *, 
complex *, + extern /* Subroutine */ void cgelqf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), clascl_( char *, integer *, integer *, real *, real *, integer *, integer * , complex *, integer *, integer *), cgeqrf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int claset_(char *, integer *, integer *, complex + extern /* Subroutine */ void claset_(char *, integer *, integer *, complex *, complex *, complex *, integer *), clacpy_(char *, - integer *, integer *, complex *, integer *, complex *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, integer *, complex *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); real abstol; - extern /* Subroutine */ int cunmbr_(char *, char *, char *, integer *, + extern /* Subroutine */ void cunmbr_(char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); char rngtgk[1]; - extern /* Subroutine */ int cunmlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmlq_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); integer itempr; - extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); integer minwrk, maxwrk; real smlnum; logical 
lquery, wantvt; real dum[1], eps; - extern /* Subroutine */ int sbdsvdx_(char *, char *, char *, integer *, + extern /* Subroutine */ void sbdsvdx_(char *, char *, char *, integer *, real *, real *, real *, real *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, integer *); @@ -1049,15 +1050,15 @@ static integer c_n1 = -1; if (*info != 0) { i__2 = -(*info); xerbla_("CGESVDX", &i__2, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Set singular values indices accord to RANGE='A'. */ @@ -1501,7 +1502,7 @@ static integer c_n1 = -1; q__1.r = r__1, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; /* End of CGESVDX */ diff --git a/lapack-netlib/SRC/cgesvj.c b/lapack-netlib/SRC/cgesvj.c index 53b1132814..82837435b1 100644 --- a/lapack-netlib/SRC/cgesvj.c +++ b/lapack-netlib/SRC/cgesvj.c @@ -868,7 +868,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgesvj_(char *joba, char *jobu, char *jobv, integer *m, +/* Subroutine */ void cgesvj_(char *joba, char *jobu, char *jobv, integer *m, integer *n, complex *a, integer *lda, real *sva, integer *mv, complex *v, integer *ldv, complex *cwork, integer *lwork, real *rwork, integer *lrwork, integer *info) @@ -885,7 +885,7 @@ f"> */ real aaqq, ctol; integer ierr; real bigtheta; - extern /* Subroutine */ int crot_(integer *, complex *, integer *, + extern /* Subroutine */ void crot_(integer *, complex *, integer *, complex *, integer *, real *, complex *); complex ompq; integer pskipped; @@ -898,16 +898,16 @@ f"> */ extern logical lsame_(char *, char *); real theta, small, sfmin; logical lsvec; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), cswap_(integer *, 
complex *, integer *, complex *, integer *); real epsln; logical applv, rsvec, uctol; - extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, + extern /* Subroutine */ void caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); logical lower, upper, rotok; integer n2, n4; - extern /* Subroutine */ int cgsvj0_(char *, integer *, integer *, complex + extern /* Subroutine */ void cgsvj0_(char *, integer *, integer *, complex *, integer *, complex *, real *, integer *, complex *, integer *, real *, real *, real *, integer *, complex *, integer *, integer * ), cgsvj1_(char *, integer *, integer *, integer *, @@ -918,18 +918,19 @@ f"> */ extern real scnrm2_(integer *, complex *, integer *); integer n34; real cs, sn; - extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *), claset_(char *, integer *, integer *, complex *, complex *, - complex *, integer *), xerbla_(char *, integer *, ftnlen); + complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer ijblsk, swband; extern integer isamax_(integer *, real *, integer *); - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); integer blskip; - extern /* Subroutine */ int classq_(integer *, complex *, integer *, real + extern /* Subroutine */ void classq_(integer *, complex *, integer *, real *, real *); real mxaapq, thsign, mxsinj; integer ir1, emptsw; @@ -1014,18 +1015,18 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGESVJ", &i__1, (ftnlen)6); - return 0; + return; 
} else if (lquery) { i__1 = *m + *n; cwork[1].r = (real) i__1, cwork[1].i = 0.f; rwork[1] = (real) f2cmax(*n,6); - return 0; + return; } /* #:) Quick return for void matrix */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Set numerical parameters */ @@ -1067,7 +1068,7 @@ f"> */ *info = -4; i__1 = -(*info); xerbla_("CGESVJ", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize the right singular vector matrix. */ @@ -1105,7 +1106,7 @@ f"> */ *info = -6; i__2 = -(*info); xerbla_("CGESVJ", &i__2, (ftnlen)6); - return 0; + return; } aaqq = sqrt(aaqq); if (aapp < big / aaqq && noscale) { @@ -1135,7 +1136,7 @@ f"> */ *info = -6; i__2 = -(*info); xerbla_("CGESVJ", &i__2, (ftnlen)6); - return 0; + return; } aaqq = sqrt(aaqq); if (aapp < big / aaqq && noscale) { @@ -1165,7 +1166,7 @@ f"> */ *info = -6; i__2 = -(*info); xerbla_("CGESVJ", &i__2, (ftnlen)6); - return 0; + return; } aaqq = sqrt(aaqq); if (aapp < big / aaqq && noscale) { @@ -1221,7 +1222,7 @@ f"> */ rwork[4] = 0.f; rwork[5] = 0.f; rwork[6] = 0.f; - return 0; + return; } /* #:) Quick return for one-column matrix */ @@ -1241,7 +1242,7 @@ f"> */ rwork[4] = 0.f; rwork[5] = 0.f; rwork[6] = 0.f; - return 0; + return; } /* Protect small singular values from underflow, and try to */ @@ -2267,6 +2268,6 @@ f"> */ /* MXSINJ is the largest absolute value of the sines of Jacobi angles */ /* in the last sweep */ - return 0; + return; } /* cgesvj_ */ diff --git a/lapack-netlib/SRC/cgesvx.c b/lapack-netlib/SRC/cgesvx.c index b39803abb5..cc7485672c 100644 --- a/lapack-netlib/SRC/cgesvx.c +++ b/lapack-netlib/SRC/cgesvx.c @@ -857,7 +857,7 @@ f"> */ /* > \ingroup complexGEsolve */ /* ===================================================================== */ -/* Subroutine */ int cgesvx_(char *fact, char *trans, integer *n, integer * +/* Subroutine */ void cgesvx_(char *fact, char *trans, integer *n, integer * nrhs, complex *a, integer *lda, complex *af, integer *ldaf, integer * ipiv, char *equed, real *r__, real *c__, 
complex *b, integer *ldb, complex *x, integer *ldx, real *rcond, real *ferr, real *berr, @@ -878,22 +878,24 @@ f"> */ logical equil; extern real clange_(char *, integer *, integer *, complex *, integer *, real *); - extern /* Subroutine */ int claqge_(integer *, integer *, complex *, + extern /* Subroutine */ void claqge_(integer *, integer *, complex *, integer *, real *, real *, real *, real *, real *, char *) , cgecon_(char *, integer *, complex *, integer *, real *, real *, complex *, real *, integer *); real colcnd; extern real slamch_(char *); - extern /* Subroutine */ int cgeequ_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeequ_(integer *, integer *, complex *, integer *, real *, real *, real *, real *, real *, integer *); logical nofact; - extern /* Subroutine */ int cgerfs_(char *, integer *, integer *, complex + extern /* Subroutine */ void cgerfs_(char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *, complex *, integer *, complex *, integer *, real *, real *, complex *, real *, - integer *), cgetrf_(integer *, integer *, complex *, - integer *, integer *, integer *), clacpy_(char *, integer *, - integer *, complex *, integer *, complex *, integer *), - xerbla_(char *, integer *, ftnlen); + integer *); + extern int cgetrf_(integer *, integer *, complex *, + integer *, integer *, integer *); + extern void clacpy_(char *, integer *, + integer *, complex *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real bignum; extern real clantr_(char *, char *, char *, integer *, integer *, complex *, integer *, real *); @@ -1029,7 +1031,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGESVX", &i__1, (ftnlen)6); - return 0; + return; } if (equil) { @@ -1110,7 +1112,7 @@ f"> */ } rwork[1] = rpvgrw; *rcond = 0.f; - return 0; + return; } } @@ -1203,7 +1205,7 @@ f"> */ } rwork[1] = rpvgrw; - return 0; + return; /* End of CGESVX */ diff --git a/lapack-netlib/SRC/cgesvxx.c 
b/lapack-netlib/SRC/cgesvxx.c index ced9e0b2cf..c17d8f5991 100644 --- a/lapack-netlib/SRC/cgesvxx.c +++ b/lapack-netlib/SRC/cgesvxx.c @@ -1048,7 +1048,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexGEsolve */ /* ===================================================================== */ -/* Subroutine */ int cgesvxx_(char *fact, char *trans, integer *n, integer * +/* Subroutine */ void cgesvxx_(char *fact, char *trans, integer *n, integer * nrhs, complex *a, integer *lda, complex *af, integer *ldaf, integer * ipiv, char *equed, real *r__, real *c__, complex *b, integer *ldb, complex *x, integer *ldx, real *rcond, real *rpvgrw, real *berr, @@ -1070,26 +1070,26 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern logical lsame_(char *, char *); real rcmin, rcmax; logical equil; - extern /* Subroutine */ int claqge_(integer *, integer *, complex *, + extern /* Subroutine */ void claqge_(integer *, integer *, complex *, integer *, real *, real *, real *, real *, real *, char *) ; real colcnd; extern real slamch_(char *); logical nofact; - extern /* Subroutine */ int cgetrf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgetrf_(integer *, integer *, complex *, integer *, integer *, integer *), clacpy_(char *, integer *, - integer *, complex *, integer *, complex *, integer *), - xerbla_(char *, integer *, ftnlen); + integer *, complex *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real bignum; integer infequ; logical colequ; - extern /* Subroutine */ int cgetrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void cgetrs_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); real rowcnd; logical notran; real smlnum; logical rowequ; - extern /* Subroutine */ int clascl2_(integer *, integer *, real *, + extern /* Subroutine */ void clascl2_(integer *, integer *, 
real *, complex *, integer *), cgeequb_(integer *, integer *, complex *, integer *, real *, real *, real *, real *, real *, integer *), cgerfsx_(char *, char *, integer *, integer *, complex *, integer @@ -1231,7 +1231,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CGESVXX", &i__1, (ftnlen)7); - return 0; + return; } if (equil) { @@ -1297,7 +1297,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rpvgrw = cla_gerpvgrw_(n, info, &a[a_offset], lda, &af[ af_offset], ldaf); - return 0; + return; } } @@ -1328,7 +1328,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ clascl2_(n, nrhs, &r__[1], &x[x_offset], ldx); } - return 0; + return; /* End of CGESVXX */ diff --git a/lapack-netlib/SRC/cgetc2.c b/lapack-netlib/SRC/cgetc2.c index 8c1d12d835..46362ce927 100644 --- a/lapack-netlib/SRC/cgetc2.c +++ b/lapack-netlib/SRC/cgetc2.c @@ -625,7 +625,7 @@ f"> */ /* > Umea University, S-901 87 Umea, Sweden. 
*/ /* ===================================================================== */ -/* Subroutine */ int cgetc2_(integer *n, complex *a, integer *lda, integer * +/* Subroutine */ void cgetc2_(integer *n, complex *a, integer *lda, integer * ipiv, integer *jpiv, integer *info) { /* System generated locals */ @@ -636,7 +636,7 @@ f"> */ /* Local variables */ real smin, xmax; integer i__, j; - extern /* Subroutine */ int cgeru_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeru_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *), cswap_(integer *, complex *, integer *, complex *, integer *), slabad_(real *, real *); @@ -668,7 +668,7 @@ f"> */ /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Set constants to control overflow */ @@ -689,7 +689,7 @@ f"> */ q__1.r = smlnum, q__1.i = 0.f; a[i__1].r = q__1.r, a[i__1].i = q__1.i; } - return 0; + return; } /* Factorize A using complete pivoting. */ @@ -770,7 +770,7 @@ f"> */ ipiv[*n] = *n; jpiv[*n] = *n; - return 0; + return; /* End of CGETC2 */ diff --git a/lapack-netlib/SRC/cgetf2.c b/lapack-netlib/SRC/cgetf2.c index 61b27239cb..e7b8f558c7 100644 --- a/lapack-netlib/SRC/cgetf2.c +++ b/lapack-netlib/SRC/cgetf2.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup complexGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgetf2_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void cgetf2_(integer *m, integer *n, complex *a, integer *lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -632,11 +632,11 @@ f"> */ /* Local variables */ integer i__, j; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), cgeru_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *); real sfmin; - extern /* Subroutine */ int 
cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer jp; extern integer icamax_(integer *, complex *, integer *); @@ -673,13 +673,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGETF2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Compute machine safe minimum */ @@ -740,7 +740,7 @@ f"> */ } /* L10: */ } - return 0; + return; /* End of CGETF2 */ diff --git a/lapack-netlib/SRC/cgetrf.c b/lapack-netlib/SRC/cgetrf.c index 485f9d848a..a5fc71f1d0 100644 --- a/lapack-netlib/SRC/cgetrf.c +++ b/lapack-netlib/SRC/cgetrf.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup complexGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgetrf_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void cgetrf_(integer *m, integer *n, complex *a, integer *lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -632,18 +632,18 @@ f"> */ /* Local variables */ integer i__, j; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); integer iinfo; - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); integer jb, nb; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int claswp_(integer *, complex *, integer *, + extern /* Subroutine */ void claswp_(integer *, complex *, integer *, integer *, integer *, integer *, integer *), 
cgetrf2_(integer *, integer *, complex *, integer *, integer *, integer *); @@ -677,13 +677,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGETRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Determine the block size for this environment. */ @@ -762,7 +762,7 @@ f"> */ /* L20: */ } } - return 0; + return; /* End of CGETRF */ diff --git a/lapack-netlib/SRC/cgetrf2.c b/lapack-netlib/SRC/cgetrf2.c index 88eff2b449..4d0a8ecc1f 100644 --- a/lapack-netlib/SRC/cgetrf2.c +++ b/lapack-netlib/SRC/cgetrf2.c @@ -624,7 +624,7 @@ static integer c__1 = 1; /* > \ingroup complexGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgetrf2_(integer *m, integer *n, complex *a, integer * +/* Subroutine */ void cgetrf2_(integer *m, integer *n, complex *a, integer * lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -634,19 +634,20 @@ static integer c__1 = 1; /* Local variables */ complex temp; integer i__; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), cgemm_(char *, char *, integer *, integer *, integer * , complex *, complex *, integer *, complex *, integer *, complex * , complex *, integer *); integer iinfo; real sfmin; - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); integer n1, n2; extern integer icamax_(integer *, complex *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), claswp_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int claswp_( integer *, complex *, integer *, integer *, integer *, integer *, integer *); @@ -680,13 +681,13 @@ static integer 
c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("CGETRF2", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } if (*m == 1) { @@ -807,7 +808,7 @@ static integer c__1 = 1; claswp_(&n1, &a[a_dim1 + 1], lda, &i__1, &i__2, &ipiv[1], &c__1); } - return 0; + return; /* End of CGETRF2 */ diff --git a/lapack-netlib/SRC/cgetri.c b/lapack-netlib/SRC/cgetri.c index 803aa666ef..aed0fbd54c 100644 --- a/lapack-netlib/SRC/cgetri.c +++ b/lapack-netlib/SRC/cgetri.c @@ -630,7 +630,7 @@ f"> */ /* > \ingroup complexGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgetri_(integer *n, complex *a, integer *lda, integer * +/* Subroutine */ void cgetri_(integer *n, complex *a, integer *lda, integer * ipiv, complex *work, integer *lwork, integer *info) { /* System generated locals */ @@ -639,13 +639,13 @@ f"> */ /* Local variables */ integer i__, j; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *), cgemv_(char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); integer nbmin; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *), ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); @@ -696,15 +696,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGETRI", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form inv(U). 
If INFO > 0 from CTRTRI, then U is singular, */ @@ -712,7 +712,7 @@ f"> */ ctrtri_("Upper", "Non-unit", n, &a[a_offset], lda, info); if (*info > 0) { - return 0; + return; } nbmin = 2; @@ -817,7 +817,7 @@ f"> */ } work[1].r = (real) iws, work[1].i = 0.f; - return 0; + return; /* End of CGETRI */ diff --git a/lapack-netlib/SRC/cgetrs.c b/lapack-netlib/SRC/cgetrs.c index dfdd204a82..3d895187d3 100644 --- a/lapack-netlib/SRC/cgetrs.c +++ b/lapack-netlib/SRC/cgetrs.c @@ -636,7 +636,7 @@ f"> */ /* > \ingroup complexGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgetrs_(char *trans, integer *n, integer *nrhs, complex * +/* Subroutine */ void cgetrs_(char *trans, integer *n, integer *nrhs, complex * a, integer *lda, integer *ipiv, complex *b, integer *ldb, integer * info) { @@ -645,10 +645,11 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, - integer *), xerbla_(char *, - integer *, ftnlen), claswp_(integer *, complex *, integer *, + integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void claswp_(integer *, complex *, integer *, integer *, integer *, integer *, integer *); logical notran; @@ -691,13 +692,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGETRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (notran) { @@ -736,7 +737,7 @@ f"> */ claswp_(nrhs, &b[b_offset], ldb, &c__1, n, &ipiv[1], &c_n1); } - return 0; + return; /* End of CGETRS */ diff --git a/lapack-netlib/SRC/cgetsls.c b/lapack-netlib/SRC/cgetsls.c index 6f550a9f18..b2a87f4f0b 100644 --- a/lapack-netlib/SRC/cgetsls.c +++ b/lapack-netlib/SRC/cgetsls.c @@ -674,7 +674,7 @@ static integer c__0 = 0; /* > 
\ingroup complexGEsolve */ /* ===================================================================== */ -/* Subroutine */ int cgetsls_(char *trans, integer *m, integer *n, integer * +/* Subroutine */ void cgetsls_(char *trans, integer *m, integer *n, integer * nrhs, complex *a, integer *lda, complex *b, integer *ldb, complex * work, integer *lwork, integer *info) { @@ -686,26 +686,27 @@ static integer c__0 = 0; real anrm, bnrm; logical tran; integer brow, tszm, tszo, info2, i__, j, iascl, ibscl; - extern /* Subroutine */ int cgelq_(integer *, integer *, complex *, + extern /* Subroutine */ void cgelq_(integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgeqr_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqr_(integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); integer minmn, maxmn; complex workq[1]; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); extern real clange_(char *, integer *, integer *, complex *, integer *, real *); - extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *); complex tq[5]; extern real slamch_(char *); - extern /* Subroutine */ int cgemlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemlq_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *), claset_(char *, integer *, integer *, complex *, complex *, - complex *, integer *), xerbla_(char *, integer *, ftnlen), - cgemqr_(char *, char *, integer *, integer *, integer *, complex + complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void 
cgemqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); integer scllen; @@ -821,7 +822,7 @@ static integer c__0 = 0; xerbla_("CGETSLS", &i__1, (ftnlen)7); r__1 = (real) wsizeo; work[1].r = r__1, work[1].i = 0.f; - return 0; + return; } if (lquery) { if (*lwork == -1) { @@ -832,7 +833,7 @@ static integer c__0 = 0; r__1 = (real) wsizem; work[1].r = r__1, work[1].i = 0.f; } - return 0; + return; } if (*lwork < wsizeo) { lw1 = tszm; @@ -849,7 +850,7 @@ static integer c__0 = 0; if (f2cmin(i__1,*nrhs) == 0) { i__1 = f2cmax(*m,*n); claset_("FULL", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); - return 0; + return; } /* Get machine parameters */ @@ -927,7 +928,7 @@ static integer c__0 = 0; ctrtrs_("U", "N", "N", n, nrhs, &a[a_offset], lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } scllen = *n; } else { @@ -940,7 +941,7 @@ static integer c__0 = 0; ldb, info); if (*info > 0) { - return 0; + return; } /* B(N+1:M,1:NRHS) = CZERO */ @@ -984,7 +985,7 @@ static integer c__0 = 0; ldb, info); if (*info > 0) { - return 0; + return; } /* B(M+1:N,1:NRHS) = 0 */ @@ -1026,7 +1027,7 @@ static integer c__0 = 0; ldb, info); if (*info > 0) { - return 0; + return; } scllen = *m; @@ -1055,7 +1056,7 @@ static integer c__0 = 0; L50: r__1 = (real) (tszo + lwo); work[1].r = r__1, work[1].i = 0.f; - return 0; + return; /* End of ZGETSLS */ diff --git a/lapack-netlib/SRC/cgetsqrhrt.c b/lapack-netlib/SRC/cgetsqrhrt.c index 23b525ae63..922a911c39 100644 --- a/lapack-netlib/SRC/cgetsqrhrt.c +++ b/lapack-netlib/SRC/cgetsqrhrt.c @@ -689,7 +689,7 @@ hrt.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgetsqrhrt_(integer *m, integer *n, integer *mb1, +/* Subroutine */ void cgetsqrhrt_(integer *m, integer *n, integer *mb1, integer *nb1, integer *nb2, complex *a, integer *lda, complex *t, integer 
*ldt, complex *work, integer *lwork, integer *info) { @@ -700,17 +700,18 @@ hrt.f"> */ /* Local variables */ integer ldwt, lworkopt, i__, j; - extern /* Subroutine */ int cungtsqr_row_(integer *, integer *, integer * + extern /* Subroutine */ void cungtsqr_row_(integer *, integer *, integer * , integer *, complex *, integer *, complex *, integer *, complex * , integer *, integer *); integer iinfo; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), cunhr_col_(integer *, integer *, integer * , complex *, integer *, complex *, integer *, complex *, integer * - ), xerbla_(char *, integer *, ftnlen); + ); + extern int xerbla_(char *, integer *, ftnlen); logical lquery; integer lw1, lw2, num_all_row_blocks__, lwt; - extern /* Subroutine */ int clatsqr_(integer *, integer *, integer *, + extern /* Subroutine */ void clatsqr_(integer *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); integer nb1local, nb2local; @@ -812,11 +813,11 @@ hrt.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGETSQRHRT", &i__1, (ftnlen)10); - return 0; + return; } else if (lquery) { q__1.r = (real) lworkopt, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; } /* Quick return if possible */ @@ -824,7 +825,7 @@ hrt.f"> */ if (f2cmin(*m,*n) == 0) { q__1.r = (real) lworkopt, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; } nb2local = f2cmin(*nb2,*n); @@ -895,7 +896,7 @@ hrt.f"> */ q__1.r = (real) lworkopt, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; /* End of CGETSQRHRT */ diff --git a/lapack-netlib/SRC/cggbak.c b/lapack-netlib/SRC/cggbak.c index aca1d31a8b..d55caff2e4 100644 --- a/lapack-netlib/SRC/cggbak.c +++ b/lapack-netlib/SRC/cggbak.c @@ -656,7 +656,7 @@ f"> */ /* > \endverbatim */ /* > */ /* 
===================================================================== */ -/* Subroutine */ int cggbak_(char *job, char *side, integer *n, integer *ilo, +/* Subroutine */ void cggbak_(char *job, char *side, integer *n, integer *ilo, integer *ihi, real *lscale, real *rscale, integer *m, complex *v, integer *ldv, integer *info) { @@ -666,11 +666,12 @@ f"> */ /* Local variables */ integer i__, k; extern logical lsame_(char *, char *); - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); logical leftv; - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer - *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + extern int xerbla_(char *, integer *, ftnlen); logical rightv; @@ -720,19 +721,19 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGGBAK", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*m == 0) { - return 0; + return; } if (lsame_(job, "N")) { - return 0; + return; } if (*ilo == *ihi) { @@ -837,7 +838,7 @@ f"> */ L110: - return 0; + return; /* End of CGGBAK */ diff --git a/lapack-netlib/SRC/cggbak.f b/lapack-netlib/SRC/cggbak.f index e8ac348050..1594496017 100644 --- a/lapack-netlib/SRC/cggbak.f +++ b/lapack-netlib/SRC/cggbak.f @@ -253,7 +253,7 @@ SUBROUTINE CGGBAK( JOB, SIDE, N, ILO, IHI, LSCALE, RSCALE, M, V, IF( ILO.EQ.1 ) $ GO TO 50 DO 40 I = ILO - 1, 1, -1 - K = RSCALE( I ) + K = INT( RSCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -263,7 +263,7 @@ SUBROUTINE CGGBAK( JOB, SIDE, N, ILO, IHI, LSCALE, RSCALE, M, V, IF( IHI.EQ.N ) $ GO TO 70 DO 60 I = IHI + 1, N - K = RSCALE( I ) + K = INT( RSCALE( I ) ) IF( K.EQ.I ) $ GO TO 60 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -277,7 +277,7 @@ SUBROUTINE CGGBAK( JOB, SIDE, N, ILO, IHI, LSCALE, 
RSCALE, M, V, IF( ILO.EQ.1 ) $ GO TO 90 DO 80 I = ILO - 1, 1, -1 - K = LSCALE( I ) + K = INT( LSCALE( I ) ) IF( K.EQ.I ) $ GO TO 80 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -287,7 +287,7 @@ SUBROUTINE CGGBAK( JOB, SIDE, N, ILO, IHI, LSCALE, RSCALE, M, V, IF( IHI.EQ.N ) $ GO TO 110 DO 100 I = IHI + 1, N - K = LSCALE( I ) + K = INT( LSCALE( I ) ) IF( K.EQ.I ) $ GO TO 100 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/cggbal.c b/lapack-netlib/SRC/cggbal.c index 559244c228..b68f6c3625 100644 --- a/lapack-netlib/SRC/cggbal.c +++ b/lapack-netlib/SRC/cggbal.c @@ -691,7 +691,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cggbal_(char *job, integer *n, complex *a, integer *lda, +/* Subroutine */ void cggbal_(char *job, integer *n, complex *a, integer *lda, complex *b, integer *ldb, integer *ilo, integer *ihi, real *lscale, real *rscale, real *work, integer *info) { @@ -709,13 +709,13 @@ f"> */ integer i__, j, k, l, m; real gamma, t, alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); real sfmin; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); real sfmax; integer iflow, kount; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + extern /* Subroutine */ void saxpy_(integer *, real *, real *, integer *, real *, integer *); integer jc; real ta, tb, tc; @@ -725,8 +725,9 @@ f"> */ real pgamma; extern integer icamax_(integer *, complex *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer - *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void csscal_(integer *, real *, complex *, 
integer + *); + extern int xerbla_(char *, integer *, ftnlen); integer lsfmin, lsfmax, ip1, jp1, lm1; real cab, rab, ewc, cor, sum; integer nrp2, icab; @@ -769,7 +770,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGGBAL", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -777,7 +778,7 @@ f"> */ if (*n == 0) { *ilo = 1; *ihi = *n; - return 0; + return; } if (*n == 1) { @@ -785,7 +786,7 @@ f"> */ *ihi = *n; lscale[1] = 1.f; rscale[1] = 1.f; - return 0; + return; } if (lsame_(job, "N")) { @@ -797,7 +798,7 @@ f"> */ rscale[i__] = 1.f; /* L10: */ } - return 0; + return; } k = 1; @@ -942,11 +943,11 @@ f"> */ rscale[i__] = 1.f; /* L195: */ } - return 0; + return; } if (*ilo == *ihi) { - return 0; + return; } /* Balance the submatrix in rows ILO to IHI. */ @@ -1209,7 +1210,7 @@ f"> */ /* L380: */ } - return 0; + return; /* End of CGGBAL */ diff --git a/lapack-netlib/SRC/cggbal.f b/lapack-netlib/SRC/cggbal.f index c7a2324157..66ba7a8818 100644 --- a/lapack-netlib/SRC/cggbal.f +++ b/lapack-netlib/SRC/cggbal.f @@ -535,7 +535,7 @@ SUBROUTINE CGGBAL( JOB, N, A, LDA, B, LDB, ILO, IHI, LSCALE, IRAB = ICAMAX( N-ILO+1, B( I, ILO ), LDB ) RAB = MAX( RAB, ABS( B( I, IRAB+ILO-1 ) ) ) LRAB = INT( LOG10( RAB+SFMIN ) / BASL+ONE ) - IR = LSCALE( I ) + SIGN( HALF, LSCALE( I ) ) + IR = INT( LSCALE( I ) + SIGN( HALF, LSCALE( I ) ) ) IR = MIN( MAX( IR, LSFMIN ), LSFMAX, LSFMAX-LRAB ) LSCALE( I ) = SCLFAC**IR ICAB = ICAMAX( IHI, A( 1, I ), 1 ) @@ -543,7 +543,7 @@ SUBROUTINE CGGBAL( JOB, N, A, LDA, B, LDB, ILO, IHI, LSCALE, ICAB = ICAMAX( IHI, B( 1, I ), 1 ) CAB = MAX( CAB, ABS( B( ICAB, I ) ) ) LCAB = INT( LOG10( CAB+SFMIN ) / BASL+ONE ) - JC = RSCALE( I ) + SIGN( HALF, RSCALE( I ) ) + JC = INT( RSCALE( I ) + SIGN( HALF, RSCALE( I ) ) ) JC = MIN( MAX( JC, LSFMIN ), LSFMAX, LSFMAX-LCAB ) RSCALE( I ) = SCLFAC**JC 360 CONTINUE diff --git a/lapack-netlib/SRC/cgges.c b/lapack-netlib/SRC/cgges.c index 03dbf28e60..1c1db9a87d 100644 --- a/lapack-netlib/SRC/cgges.c 
+++ b/lapack-netlib/SRC/cgges.c @@ -784,7 +784,7 @@ or GE matrices */ /* > \ingroup complexGEeigen */ /* ===================================================================== */ -/* Subroutine */ int cgges_(char *jobvsl, char *jobvsr, char *sort, L_fp +/* Subroutine */ void cgges_(char *jobvsl, char *jobvsr, char *sort, L_fp selctg, integer *n, complex *a, integer *lda, complex *b, integer * ldb, integer *sdim, complex *alpha, complex *beta, complex *vsl, integer *ldvsl, complex *vsr, integer *ldvsr, complex *work, integer * @@ -803,29 +803,30 @@ or GE matrices */ integer ileft, icols; logical cursl, ilvsl, ilvsr; integer irwrk, irows; - extern /* Subroutine */ int cggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cggbak_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, complex *, integer *, integer *), cggbal_(char *, integer *, complex *, integer *, complex *, integer *, integer *, integer *, real *, real *, real *, integer *), slabad_(real *, real *); extern real clange_(char *, integer *, integer *, complex *, integer *, real *); - extern /* Subroutine */ int cgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgghrd_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *), clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *); logical ilascl, ilbscl; - extern /* Subroutine */ int cgeqrf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqrf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, - integer *, integer *, complex *, complex *, complex *, 
integer *), xerbla_(char *, integer *, ftnlen); + integer *, integer *, complex *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); real bignum; - extern /* Subroutine */ int chgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void chgeqz_(char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, complex *, complex *, integer *, complex *, integer *, complex *, integer *, real *, integer *), @@ -839,7 +840,7 @@ or GE matrices */ integer lwkmin; logical lastsl; real bnrmto; - extern /* Subroutine */ int cungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void cungqr_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), cunmqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer @@ -964,16 +965,16 @@ or GE matrices */ if (*info != 0) { i__1 = -(*info); xerbla_("CGGES ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *sdim = 0; - return 0; + return; } /* Get machine constants */ @@ -1185,7 +1186,7 @@ or GE matrices */ work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CGGES */ diff --git a/lapack-netlib/SRC/cgges3.c b/lapack-netlib/SRC/cgges3.c index 046f7946b9..e7b2e877ec 100644 --- a/lapack-netlib/SRC/cgges3.c +++ b/lapack-netlib/SRC/cgges3.c @@ -783,7 +783,7 @@ f"> */ /* > \ingroup complexGEeigen */ /* ===================================================================== */ -/* Subroutine */ int cgges3_(char *jobvsl, char *jobvsr, char *sort, L_fp +/* Subroutine */ void cgges3_(char *jobvsl, char *jobvsr, char *sort, L_fp selctg, integer *n, complex *a, integer *lda, complex *b, integer * ldb, integer *sdim, 
complex *alpha, complex *beta, complex *vsl, integer *ldvsl, complex *vsr, integer *ldvsr, complex *work, integer * @@ -803,28 +803,29 @@ f"> */ integer ileft, icols; logical cursl, ilvsl, ilvsr; integer irwrk; - extern /* Subroutine */ int cgghd3_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgghd3_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); integer irows; - extern /* Subroutine */ int cggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cggbak_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, complex *, integer *, integer *), cggbal_(char *, integer *, complex *, integer *, complex *, integer *, integer *, integer *, real *, real *, real *, integer *), slabad_(real *, real *); extern real clange_(char *, integer *, integer *, complex *, integer *, real *); - extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *); logical ilascl, ilbscl; - extern /* Subroutine */ int cgeqrf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqrf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, - integer *, integer *, complex *, complex *, complex *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, integer *, complex *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real bignum; - extern /* Subroutine */ int chgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void chgeqz_(char 
*, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, complex *, complex *, integer *, complex *, integer *, complex *, integer *, real *, integer *), @@ -836,7 +837,7 @@ f"> */ integer ijobvl, iright, ijobvr; real anrmto, bnrmto; logical lastsl; - extern /* Subroutine */ int cungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void cungqr_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), cunmqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer @@ -979,16 +980,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGGES3 ", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *sdim = 0; - return 0; + return; } /* Get machine constants */ @@ -1194,7 +1195,7 @@ f"> */ q__1.r = (real) lwkopt, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; /* End of CGGES3 */ diff --git a/lapack-netlib/SRC/cggesx.c b/lapack-netlib/SRC/cggesx.c index 06102e1401..d33dad3391 100644 --- a/lapack-netlib/SRC/cggesx.c +++ b/lapack-netlib/SRC/cggesx.c @@ -843,7 +843,7 @@ f"> */ /* > \ingroup complexGEeigen */ /* ===================================================================== */ -/* Subroutine */ int cggesx_(char *jobvsl, char *jobvsr, char *sort, L_fp +/* Subroutine */ void cggesx_(char *jobvsl, char *jobvsr, char *sort, L_fp selctg, char *sense, integer *n, complex *a, integer *lda, complex *b, integer *ldb, integer *sdim, complex *alpha, complex *beta, complex * vsl, integer *ldvsl, complex *vsr, integer *ldvsr, real *rconde, real @@ -862,7 +862,7 @@ f"> */ integer ileft, icols; logical cursl, ilvsl, ilvsr; integer irwrk, irows; - extern /* Subroutine */ int cggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cggbak_(char *, char *, integer *, integer *, integer *, 
real *, real *, integer *, complex *, integer *, integer *), cggbal_(char *, integer *, complex *, integer *, complex *, integer *, integer *, integer *, real *, @@ -870,24 +870,24 @@ f"> */ extern real clange_(char *, integer *, integer *, complex *, integer *, real *); real pl; - extern /* Subroutine */ int cgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgghrd_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *), clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *); real pr; logical ilascl, ilbscl; - extern /* Subroutine */ int cgeqrf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqrf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), clacpy_( char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, integer *, integer *, complex - *, complex *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); extern real slamch_(char *); real bignum; - extern /* Subroutine */ int chgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void chgeqz_(char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, complex *, complex *, integer *, complex *, integer *, complex *, integer *, real *, integer *), @@ -901,12 +901,12 @@ f"> */ integer liwmin; logical wantse, lastsl; real anrmto, bnrmto; - extern /* Subroutine */ int cungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void cungqr_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); integer minwrk, maxwrk; logical wantsn; 
real smlnum; - extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); logical wantst, lquery, wantsv; @@ -1062,16 +1062,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGGESX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *sdim = 0; - return 0; + return; } /* Get machine constants */ @@ -1309,7 +1309,7 @@ f"> */ work[1].r = (real) maxwrk, work[1].i = 0.f; iwork[1] = liwmin; - return 0; + return; /* End of CGGESX */ diff --git a/lapack-netlib/SRC/cggev.c b/lapack-netlib/SRC/cggev.c index 6ef818dbb6..bb7b0b4f64 100644 --- a/lapack-netlib/SRC/cggev.c +++ b/lapack-netlib/SRC/cggev.c @@ -734,7 +734,7 @@ ices */ /* > \ingroup complexGEeigen */ /* ===================================================================== */ -/* Subroutine */ int cggev_(char *jobvl, char *jobvr, integer *n, complex *a, +/* Subroutine */ void cggev_(char *jobvl, char *jobvr, integer *n, complex *a, integer *lda, complex *b, integer *ldb, complex *alpha, complex *beta, complex *vl, integer *ldvl, complex *vr, integer *ldvr, complex * work, integer *lwork, real *rwork, integer *info) @@ -753,7 +753,7 @@ ices */ integer iwrk; extern logical lsame_(char *, char *); integer ileft, icols, irwrk, irows, jc; - extern /* Subroutine */ int cggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cggbak_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, complex *, integer *, integer *), cggbal_(char *, integer *, complex *, integer *, complex *, integer *, integer *, integer *, real *, @@ -762,25 +762,25 @@ ices */ extern real clange_(char *, integer *, integer *, complex *, integer *, real *); integer jr; - extern /* Subroutine */ int cgghrd_(char *, char *, integer *, 
integer *, + extern /* Subroutine */ void cgghrd_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *), clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *); logical ilascl, ilbscl; - extern /* Subroutine */ int cgeqrf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqrf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), clacpy_( char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, integer *, integer *, complex *, complex *, complex *, integer *), ctgevc_(char *, char *, logical *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *, integer - *, complex *, real *, integer *), xerbla_(char *, - integer *, ftnlen); + *, complex *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical ldumma[1]; char chtemp[1]; real bignum; - extern /* Subroutine */ int chgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void chgeqz_(char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, complex *, complex *, integer *, complex *, integer *, complex *, integer *, real *, integer *); @@ -788,12 +788,12 @@ ices */ integer *, integer *, ftnlen, ftnlen); extern real slamch_(char *); integer ijobvl, iright, ijobvr; - extern /* Subroutine */ int cungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void cungqr_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); real anrmto; integer lwkmin; real bnrmto; - extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer 
*, integer *); real smlnum; @@ -913,15 +913,15 @@ ices */ if (*info != 0) { i__1 = -(*info); xerbla_("CGGEV ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1164,7 +1164,7 @@ ices */ } work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CGGEV */ diff --git a/lapack-netlib/SRC/cggev3.c b/lapack-netlib/SRC/cggev3.c index a00f1a18cc..511635d3d4 100644 --- a/lapack-netlib/SRC/cggev3.c +++ b/lapack-netlib/SRC/cggev3.c @@ -733,7 +733,7 @@ f"> */ /* > \ingroup complexGEeigen */ /* ===================================================================== */ -/* Subroutine */ int cggev3_(char *jobvl, char *jobvr, integer *n, complex *a, +/* Subroutine */ void cggev3_(char *jobvl, char *jobvr, integer *n, complex *a, integer *lda, complex *b, integer *ldb, complex *alpha, complex * beta, complex *vl, integer *ldvl, complex *vr, integer *ldvr, complex *work, integer *lwork, real *rwork, integer *info) @@ -752,11 +752,11 @@ f"> */ integer iwrk; extern logical lsame_(char *, char *); integer ileft, icols, irwrk; - extern /* Subroutine */ int cgghd3_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgghd3_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); integer irows, jc; - extern /* Subroutine */ int cggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cggbak_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, complex *, integer *, integer *), cggbal_(char *, integer *, complex *, integer *, complex *, integer *, integer *, integer *, real *, @@ -765,31 +765,31 @@ f"> */ extern real clange_(char *, integer *, integer *, complex *, integer *, real *); integer jr; - extern /* Subroutine */ int clascl_(char *, integer *, integer *, 
real *, + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *); logical ilascl, ilbscl; - extern /* Subroutine */ int cgeqrf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqrf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), clacpy_( char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, integer *, integer *, complex *, complex *, complex *, integer *), ctgevc_(char *, char *, logical *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *, integer - *, complex *, real *, integer *), xerbla_(char *, - integer *, ftnlen); + *, complex *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical ldumma[1]; char chtemp[1]; real bignum; - extern /* Subroutine */ int chgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void chgeqz_(char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, complex *, complex *, integer *, complex *, integer *, complex *, integer *, real *, integer *); extern real slamch_(char *); integer ijobvl, iright, ijobvr; - extern /* Subroutine */ int cungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void cungqr_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); real anrmto, bnrmto; - extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); real smlnum; @@ -932,15 +932,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGGEV3 ", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - 
return 0; + return; } /* Get machine constants */ @@ -1178,7 +1178,7 @@ f"> */ q__1.r = (real) lwkopt, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; /* End of CGGEV3 */ diff --git a/lapack-netlib/SRC/cggevx.c b/lapack-netlib/SRC/cggevx.c index da6bf97346..4b72816d8c 100644 --- a/lapack-netlib/SRC/cggevx.c +++ b/lapack-netlib/SRC/cggevx.c @@ -888,7 +888,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cggevx_(char *balanc, char *jobvl, char *jobvr, char * +/* Subroutine */ void cggevx_(char *balanc, char *jobvl, char *jobvr, char * sense, integer *n, complex *a, integer *lda, complex *b, integer *ldb, complex *alpha, complex *beta, complex *vl, integer *ldvl, complex * vr, integer *ldvr, integer *ilo, integer *ihi, real *lscale, real * @@ -912,7 +912,7 @@ f"> */ integer icols; logical noscl; integer irows, jc; - extern /* Subroutine */ int cggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cggbak_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, complex *, integer *, integer *), cggbal_(char *, integer *, complex *, integer *, complex *, integer *, integer *, integer *, real *, @@ -921,13 +921,13 @@ f"> */ extern real clange_(char *, integer *, integer *, complex *, integer *, real *); integer jr; - extern /* Subroutine */ int cgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgghrd_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *), clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *); logical ilascl, ilbscl; - extern /* Subroutine */ int cgeqrf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqrf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), clacpy_( char 
*, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, integer *, integer *, complex @@ -935,7 +935,7 @@ f"> */ logical ldumma[1]; char chtemp[1]; real bignum; - extern /* Subroutine */ int chgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void chgeqz_(char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, complex *, complex *, integer *, complex *, integer *, complex *, integer *, real *, integer *), @@ -943,24 +943,24 @@ f"> */ , complex *, integer *, complex *, integer *, complex *, integer * , integer *, integer *, complex *, real *, integer *); integer ijobvl; - extern /* Subroutine */ int ctgsna_(char *, char *, logical *, integer *, + extern /* Subroutine */ void ctgsna_(char *, char *, logical *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *, integer *), slascl_(char *, integer *, integer *, real *, real *, integer *, - integer *, real *, integer *, integer *), xerbla_(char *, - integer *, ftnlen); + integer *, real *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); extern real slamch_(char *); integer ijobvr; logical wantsb; - extern /* Subroutine */ int cungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void cungqr_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); real anrmto; logical wantse; real bnrmto; - extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); integer minwrk, maxwrk; @@ -1106,15 +1106,15 @@ f"> */ if (*info != 0) { i__1 = 
-(*info); xerbla_("CGGEVX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1422,7 +1422,7 @@ f"> */ } work[1].r = (real) maxwrk, work[1].i = 0.f; - return 0; + return; /* End of CGGEVX */ diff --git a/lapack-netlib/SRC/cggglm.c b/lapack-netlib/SRC/cggglm.c index 056bd9142c..8165d277b0 100644 --- a/lapack-netlib/SRC/cggglm.c +++ b/lapack-netlib/SRC/cggglm.c @@ -699,7 +699,7 @@ f"> */ /* > \ingroup complexOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int cggglm_(integer *n, integer *m, integer *p, complex *a, +/* Subroutine */ void cggglm_(integer *n, integer *m, integer *p, complex *a, integer *lda, complex *b, integer *ldb, complex *d__, complex *x, complex *y, complex *work, integer *lwork, integer *info) { @@ -709,18 +709,19 @@ f"> */ /* Local variables */ integer lopt, i__; - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *), ccopy_(integer *, complex *, integer *, complex *, integer *); integer nb, np; - extern /* Subroutine */ int cggqrf_(integer *, integer *, integer *, + extern /* Subroutine */ void cggqrf_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, - complex *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + complex *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer lwkmin, nb1, nb2, nb3, nb4; - extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmqr_(char *, char *, integer *, integer *, integer *, complex *, 
integer *, complex *, complex *, integer *, complex *, integer *, integer *), cunmrq_(char *, char *, integer *, integer *, integer *, complex *, integer *, @@ -801,9 +802,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGGGLM", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -819,7 +820,7 @@ f"> */ i__2 = i__; y[i__2].r = 0.f, y[i__2].i = 0.f; } - return 0; + return; } /* Compute the GQR factorization of matrices A and B: */ @@ -860,7 +861,7 @@ f"> */ if (*info > 0) { *info = 1; - return 0; + return; } i__1 = *n - *m; @@ -891,7 +892,7 @@ f"> */ if (*info > 0) { *info = 2; - return 0; + return; } /* Copy D to X */ @@ -914,7 +915,7 @@ f"> */ i__1 = *m + np + f2cmax(i__2,i__3); work[1].r = (real) i__1, work[1].i = 0.f; - return 0; + return; /* End of CGGGLM */ diff --git a/lapack-netlib/SRC/cggglm.f b/lapack-netlib/SRC/cggglm.f index 3efca1e713..fb384b6518 100644 --- a/lapack-netlib/SRC/cggglm.f +++ b/lapack-netlib/SRC/cggglm.f @@ -289,7 +289,7 @@ SUBROUTINE CGGGLM( N, M, P, A, LDA, B, LDB, D, X, Y, WORK, LWORK, * CALL CGGQRF( N, M, P, A, LDA, WORK, B, LDB, WORK( M+1 ), $ WORK( M+NP+1 ), LWORK-M-NP, INFO ) - LOPT = REAL( WORK( M+NP+1 ) ) + LOPT = INT( WORK( M+NP+1 ) ) * * Update left-hand-side vector d = Q**H*d = ( d1 ) M * ( d2 ) N-M diff --git a/lapack-netlib/SRC/cgghd3.c b/lapack-netlib/SRC/cgghd3.c index a795b803d5..71f56c6bbc 100644 --- a/lapack-netlib/SRC/cgghd3.c +++ b/lapack-netlib/SRC/cgghd3.c @@ -749,7 +749,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgghd3_(char *compq, char *compz, integer *n, integer * +/* Subroutine */ void cgghd3_(char *compq, char *compz, integer *n, integer * ilo, integer *ihi, complex *a, integer *lda, complex *b, integer *ldb, complex *q, integer *ldq, complex *z__, integer *ldz, complex *work, integer *lwork, integer *info) @@ -763,22 +763,22 @@ f"> */ 
logical blk22; integer cola, jcol, ierr; complex temp; - extern /* Subroutine */ int crot_(integer *, complex *, integer *, + extern /* Subroutine */ void crot_(integer *, complex *, integer *, complex *, integer *, real *, complex *); integer jrow, topq, ppwo; complex temp1, temp2, temp3; real c__; integer kacc22, i__, j, k; complex s; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *); integer nbmin; - extern /* Subroutine */ int cunm22_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunm22_(char *, char *, integer *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); complex ctemp; @@ -787,17 +787,17 @@ f"> */ complex c1, c2; logical wantq; integer j0; - extern /* Subroutine */ int ctrmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctrmv_(char *, char *, char *, integer *, complex *, integer *, complex *, integer *); logical initz, wantz; complex s1, s2; char compq2[1], compz2[1]; integer nb, jj, nh; - extern /* Subroutine */ int cgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgghrd_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); integer nx, pw; - extern /* Subroutine */ int claset_(char *, integer *, integer *, complex + extern /* Subroutine */ void claset_(char *, integer *, integer *, complex *, complex *, complex *, integer *), clartg_(complex *, complex *, real 
*, complex *, complex *), clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *); @@ -876,9 +876,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGGHD3", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Initialize Q and Z if desired. */ @@ -903,7 +903,7 @@ f"> */ nh = *ihi - *ilo + 1; if (nh <= 1) { work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } /* Determine the blocksize. */ @@ -1760,7 +1760,7 @@ f"> */ q__1.r = (real) lwkopt, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; /* End of CGGHD3 */ diff --git a/lapack-netlib/SRC/cgghd3.f b/lapack-netlib/SRC/cgghd3.f index 76d7de4ce0..1074b4828e 100644 --- a/lapack-netlib/SRC/cgghd3.f +++ b/lapack-netlib/SRC/cgghd3.f @@ -511,7 +511,7 @@ SUBROUTINE CGGHD3( COMPQ, COMPZ, N, ILO, IHI, A, LDA, B, LDB, Q, * IF( JJ.GT.0 ) THEN DO I = JJ, 1, -1 - C = DBLE( A( J+1+I, J ) ) + C = REAL( A( J+1+I, J ) ) CALL CROT( IHI-TOP, A( TOP+1, J+I+1 ), 1, $ A( TOP+1, J+I ), 1, C, $ -CONJG( B( J+1+I, J ) ) ) diff --git a/lapack-netlib/SRC/cgghrd.c b/lapack-netlib/SRC/cgghrd.c index deddb5b25c..f67b1460b0 100644 --- a/lapack-netlib/SRC/cgghrd.c +++ b/lapack-netlib/SRC/cgghrd.c @@ -718,7 +718,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cgghrd_(char *compq, char *compz, integer *n, integer * +/* Subroutine */ void cgghrd_(char *compq, char *compz, integer *n, integer * ilo, integer *ihi, complex *a, integer *lda, complex *b, integer *ldb, complex *q, integer *ldq, complex *z__, integer *ldz, integer *info) { @@ -729,17 +729,17 @@ f"> */ /* Local variables */ integer jcol; - extern /* Subroutine */ int crot_(integer *, complex *, integer *, + extern /* Subroutine */ void crot_(integer *, complex *, integer *, complex *, integer *, real *, complex *); integer jrow; real c__; complex s; extern logical lsame_(char *, char *); complex ctemp; - 
extern /* Subroutine */ int claset_(char *, integer *, integer *, complex + extern /* Subroutine */ void claset_(char *, integer *, integer *, complex *, complex *, complex *, integer *), clartg_(complex *, - complex *, real *, complex *, complex *), xerbla_(char *, integer - *, ftnlen); + complex *, real *, complex *, complex *); + extern int xerbla_(char *, integer *, ftnlen); integer icompq, icompz; logical ilq, ilz; @@ -823,7 +823,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGGHRD", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize Q and Z if desired. */ @@ -838,7 +838,7 @@ f"> */ /* Quick return if possible */ if (*n <= 1) { - return 0; + return; } /* Zero out lower triangle of B */ @@ -904,7 +904,7 @@ f"> */ /* L40: */ } - return 0; + return; /* End of CGGHRD */ diff --git a/lapack-netlib/SRC/cgglse.c b/lapack-netlib/SRC/cgglse.c index 80b959a4df..7b35f8fe49 100644 --- a/lapack-netlib/SRC/cgglse.c +++ b/lapack-netlib/SRC/cgglse.c @@ -694,7 +694,7 @@ f"> */ /* > \ingroup complexOTHERsolve */ /* ===================================================================== */ -/* Subroutine */ int cgglse_(integer *m, integer *n, integer *p, complex *a, +/* Subroutine */ void cgglse_(integer *m, integer *n, integer *p, complex *a, integer *lda, complex *b, integer *ldb, complex *c__, complex *d__, complex *x, complex *work, integer *lwork, integer *info) { @@ -704,20 +704,21 @@ f"> */ /* Local variables */ integer lopt; - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *), ccopy_(integer *, complex *, integer *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer *, complex *, integer *), ctrmv_(char *, char *, char *, integer *, complex *, integer *, complex *, integer *); integer nb, mn, nr; - extern /* Subroutine */ int cggrqf_(integer *, integer *, 
integer *, + extern /* Subroutine */ void cggrqf_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, - complex *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + complex *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer lwkmin, nb1, nb2, nb3, nb4; - extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *), cunmrq_(char *, char *, integer *, integer *, integer *, complex *, integer *, @@ -798,15 +799,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGGLSE", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Compute the GRQ factorization of matrices B and A: */ @@ -844,7 +845,7 @@ f"> */ if (*info > 0) { *info = 1; - return 0; + return; } /* Put the solution in X */ @@ -869,7 +870,7 @@ f"> */ if (*info > 0) { *info = 2; - return 0; + return; } /* Put the solutions in X */ @@ -910,7 +911,7 @@ f"> */ i__1 = *p + mn + f2cmax(i__2,i__3); work[1].r = (real) i__1, work[1].i = 0.f; - return 0; + return; /* End of CGGLSE */ diff --git a/lapack-netlib/SRC/cgglse.f b/lapack-netlib/SRC/cgglse.f index 4785941dbe..cca20dfed9 100644 --- a/lapack-netlib/SRC/cgglse.f +++ b/lapack-netlib/SRC/cgglse.f @@ -276,7 +276,7 @@ SUBROUTINE CGGLSE( M, N, P, A, LDA, B, LDB, C, D, X, WORK, LWORK, * CALL CGGRQF( P, M, N, B, LDB, WORK, A, LDA, WORK( P+1 ), $ WORK( P+MN+1 ), LWORK-P-MN, INFO ) - LOPT = REAL( WORK( P+MN+1 ) ) + LOPT = INT( WORK( P+MN+1 ) ) * * Update c = Z**H *c = ( c1 ) N-P * ( c2 ) M+P-N diff --git a/lapack-netlib/SRC/cggqrf.c b/lapack-netlib/SRC/cggqrf.c index 
f1e873b669..27da14abe8 100644 --- a/lapack-netlib/SRC/cggqrf.c +++ b/lapack-netlib/SRC/cggqrf.c @@ -728,7 +728,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cggqrf_(integer *n, integer *m, integer *p, complex *a, +/* Subroutine */ void cggqrf_(integer *n, integer *m, integer *p, complex *a, integer *lda, complex *taua, complex *b, integer *ldb, complex *taub, complex *work, integer *lwork, integer *info) { @@ -737,14 +737,15 @@ f"> */ /* Local variables */ integer lopt, nb; - extern /* Subroutine */ int cgeqrf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqrf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), cgerqf_( integer *, integer *, complex *, integer *, complex *, complex *, - integer *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer nb1, nb2, nb3; - extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); integer lwkopt; @@ -808,9 +809,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGGQRF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* QR factorization of N-by-M matrix A: A = Q*R */ @@ -835,7 +836,7 @@ f"> */ i__1 = f2cmax(i__2,i__3); work[1].r = (real) i__1, work[1].i = 0.f; - return 0; + return; /* End of CGGQRF */ diff --git a/lapack-netlib/SRC/cggqrf.f b/lapack-netlib/SRC/cggqrf.f index febd9be8de..0185f4e0d9 100644 --- a/lapack-netlib/SRC/cggqrf.f +++ b/lapack-netlib/SRC/cggqrf.f @@ -276,7 +276,7 @@ SUBROUTINE CGGQRF( N, M, P, A, LDA, TAUA, B, LDB, TAUB, WORK, * QR 
factorization of N-by-M matrix A: A = Q*R * CALL CGEQRF( N, M, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = REAL( WORK( 1 ) ) + LOPT = INT( WORK( 1 ) ) * * Update B := Q**H*B. * diff --git a/lapack-netlib/SRC/cggrqf.c b/lapack-netlib/SRC/cggrqf.c index 415373382a..d499d70831 100644 --- a/lapack-netlib/SRC/cggrqf.c +++ b/lapack-netlib/SRC/cggrqf.c @@ -727,7 +727,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cggrqf_(integer *m, integer *p, integer *n, complex *a, +/* Subroutine */ void cggrqf_(integer *m, integer *p, integer *n, complex *a, integer *lda, complex *taua, complex *b, integer *ldb, complex *taub, complex *work, integer *lwork, integer *info) { @@ -736,14 +736,15 @@ f"> */ /* Local variables */ integer lopt, nb; - extern /* Subroutine */ int cgeqrf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqrf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), cgerqf_( integer *, integer *, complex *, integer *, complex *, complex *, - integer *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer nb1, nb2, nb3; - extern /* Subroutine */ int cunmrq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmrq_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); integer lwkopt; @@ -807,9 +808,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGGRQF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* RQ factorization of M-by-N matrix A: A = R*Q */ @@ -836,7 +837,7 @@ f"> */ i__1 = f2cmax(i__2,i__3); work[1].r = (real) i__1, work[1].i = 0.f; - return 0; + return; /* End of CGGRQF */ diff --git 
a/lapack-netlib/SRC/cggrqf.f b/lapack-netlib/SRC/cggrqf.f index b43febc1f5..5227100dad 100644 --- a/lapack-netlib/SRC/cggrqf.f +++ b/lapack-netlib/SRC/cggrqf.f @@ -275,7 +275,7 @@ SUBROUTINE CGGRQF( M, P, N, A, LDA, TAUA, B, LDB, TAUB, WORK, * RQ factorization of M-by-N matrix A: A = R*Q * CALL CGERQF( M, N, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = REAL( WORK( 1 ) ) + LOPT = INT( WORK( 1 ) ) * * Update B := B*Q**H * diff --git a/lapack-netlib/SRC/cggsvd3.c b/lapack-netlib/SRC/cggsvd3.c index 0e9e0e42dc..b467a606ce 100644 --- a/lapack-netlib/SRC/cggsvd3.c +++ b/lapack-netlib/SRC/cggsvd3.c @@ -866,7 +866,7 @@ static integer c__1 = 1; /* > CGGSVD3 replaces the deprecated subroutine CGGSVD. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cggsvd3_(char *jobu, char *jobv, char *jobq, integer *m, +/* Subroutine */ void cggsvd3_(char *jobu, char *jobv, char *jobq, integer *m, integer *n, integer *p, integer *k, integer *l, complex *a, integer * lda, complex *b, integer *ldb, real *alpha, real *beta, complex *u, integer *ldu, complex *v, integer *ldv, complex *q, integer *ldq, @@ -887,20 +887,20 @@ static integer c__1 = 1; extern logical lsame_(char *, char *); real anorm, bnorm; logical wantq; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); logical wantu, wantv; extern real clange_(char *, integer *, integer *, complex *, integer *, real *), slamch_(char *); - extern /* Subroutine */ int ctgsja_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctgsja_(char *, char *, char *, integer *, integer *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, real *, real *, real *, real *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, - integer *, integer *), xerbla_(char *, - integer *, ftnlen); + integer *, integer *); + extern int 
xerbla_(char *, integer *, ftnlen); integer lwkopt; logical lquery; - extern /* Subroutine */ int cggsvp3_(char *, char *, char *, integer *, + extern /* Subroutine */ void cggsvp3_(char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, real *, real *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *, real *, @@ -996,10 +996,10 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("CGGSVD3", &i__1, (ftnlen)7); - return 0; + return; } if (lquery) { - return 0; + return; } /* Compute the Frobenius norm of matrices A and B */ @@ -1062,7 +1062,7 @@ static integer c__1 = 1; q__1.r = (real) lwkopt, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; /* End of CGGSVD3 */ diff --git a/lapack-netlib/SRC/cggsvp3.c b/lapack-netlib/SRC/cggsvp3.c index 6f5f3ac75d..a3fc3e4e7b 100644 --- a/lapack-netlib/SRC/cggsvp3.c +++ b/lapack-netlib/SRC/cggsvp3.c @@ -791,7 +791,7 @@ static integer c_n1 = -1; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cggsvp3_(char *jobu, char *jobv, char *jobq, integer *m, +/* Subroutine */ void cggsvp3_(char *jobu, char *jobv, char *jobq, integer *m, integer *p, integer *n, complex *a, integer *lda, complex *b, integer *ldb, real *tola, real *tolb, integer *k, integer *l, complex *u, integer *ldu, complex *v, integer *ldv, complex *q, integer *ldq, @@ -807,7 +807,7 @@ static integer c_n1 = -1; integer i__, j; extern logical lsame_(char *, char *); logical wantq, wantu, wantv; - extern /* Subroutine */ int cgeqp3_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqp3_(integer *, integer *, complex *, integer *, integer *, complex *, complex *, integer *, real *, integer *), cgeqr2_(integer *, integer *, complex *, integer *, complex *, complex *, integer *), cgerq2_(integer *, integer *, @@ -820,8 +820,9 @@ static integer c_n1 
= -1; complex *, integer *, complex *, integer *), clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, integer *, integer - *, complex *, complex *, complex *, integer *), xerbla_( - char *, integer *, ftnlen), clapmt_(logical *, integer *, integer + *, complex *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void clapmt_(logical *, integer *, integer *, complex *, integer *, integer *); logical forwrd; integer lwkopt; @@ -927,10 +928,10 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("CGGSVP3", &i__1, (ftnlen)7); - return 0; + return; } if (lquery) { - return 0; + return; } /* QR with column pivoting of B: B*P = V*( S11 S12 ) */ @@ -1186,7 +1187,7 @@ static integer c_n1 = -1; q__1.r = (real) lwkopt, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; /* End of CGGSVP3 */ diff --git a/lapack-netlib/SRC/cgsvj0.c b/lapack-netlib/SRC/cgsvj0.c index 4cb871737b..50c00aa638 100644 --- a/lapack-netlib/SRC/cgsvj0.c +++ b/lapack-netlib/SRC/cgsvj0.c @@ -732,7 +732,7 @@ f"> */ /* > drmac@math.hr. Thank you. 
*/ /* ===================================================================== */ -/* Subroutine */ int cgsvj0_(char *jobv, integer *m, integer *n, complex *a, +/* Subroutine */ void cgsvj0_(char *jobv, integer *m, integer *n, complex *a, integer *lda, complex *d__, real *sva, integer *mv, complex *v, integer *ldv, real *eps, real *sfmin, real *tol, integer *nsweep, complex *work, integer *lwork, integer *info) @@ -749,7 +749,7 @@ f"> */ real aaqq; integer ierr; real bigtheta; - extern /* Subroutine */ int crot_(integer *, complex *, integer *, + extern /* Subroutine */ void crot_(integer *, complex *, integer *, complex *, integer *, real *, complex *); complex ompq; integer pskipped; @@ -761,22 +761,23 @@ f"> */ real apoaq, aqoap; extern logical lsame_(char *, char *); real theta, small; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), cswap_(integer *, complex *, integer *, complex *, integer *); logical applv, rsvec; - extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, + extern /* Subroutine */ void caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); logical rotok; real rootsfmin; extern real scnrm2_(integer *, complex *, integer *); real cs, sn; - extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, - real *, integer *, integer *, complex *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, complex *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer ijblsk, swband; extern integer isamax_(integer *, real *, integer *); integer blskip; - extern /* Subroutine */ int classq_(integer *, complex *, integer *, real + extern /* Subroutine */ void classq_(integer *, complex *, integer *, real *, real *); real mxaapq, thsign, mxsinj; integer ir1, emptsw, 
notrot, iswrot, jbc; @@ -840,7 +841,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGSVJ0", &i__1, (ftnlen)6); - return 0; + return; } if (rsvec) { @@ -1661,6 +1662,6 @@ f"> */ /* L5991: */ } - return 0; + return; } /* cgsvj0_ */ diff --git a/lapack-netlib/SRC/cgsvj1.c b/lapack-netlib/SRC/cgsvj1.c index 71c9a063a6..9ee9d043a4 100644 --- a/lapack-netlib/SRC/cgsvj1.c +++ b/lapack-netlib/SRC/cgsvj1.c @@ -751,7 +751,7 @@ f"> */ /* > Zlatko Drmac (Zagreb, Croatia) */ /* ===================================================================== */ -/* Subroutine */ int cgsvj1_(char *jobv, integer *m, integer *n, integer *n1, +/* Subroutine */ void cgsvj1_(char *jobv, integer *m, integer *n, integer *n1, complex *a, integer *lda, complex *d__, real *sva, integer *mv, complex *v, integer *ldv, real *eps, real *sfmin, real *tol, integer * nsweep, complex *work, integer *lwork, integer *info) @@ -769,7 +769,7 @@ f"> */ real aaqq; integer nblr, ierr; real bigtheta; - extern /* Subroutine */ int crot_(integer *, complex *, integer *, + extern /* Subroutine */ void crot_(integer *, complex *, integer *, complex *, integer *, real *, complex *); complex ompq; integer pskipped; @@ -781,22 +781,23 @@ f"> */ real apoaq, aqoap; extern logical lsame_(char *, char *); real theta, small; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), cswap_(integer *, complex *, integer *, complex *, integer *); logical applv, rsvec; - extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, + extern /* Subroutine */ void caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); logical rotok; real rootsfmin; extern real scnrm2_(integer *, complex *, integer *); real cs, sn; - extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, - real *, integer *, integer *, complex *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + 
extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, complex *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer ijblsk, swband; extern integer isamax_(integer *, real *, integer *); integer blskip; - extern /* Subroutine */ int classq_(integer *, complex *, integer *, real + extern /* Subroutine */ void classq_(integer *, complex *, integer *, real *, real *); real mxaapq, thsign, mxsinj; integer emptsw, notrot, iswrot, jbc; @@ -860,7 +861,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGSVJ1", &i__1, (ftnlen)6); - return 0; + return; } if (rsvec) { @@ -1345,6 +1346,6 @@ f"> */ } - return 0; + return; } /* cgsvj1_ */ diff --git a/lapack-netlib/SRC/cgtcon.c b/lapack-netlib/SRC/cgtcon.c index 5e636caa76..284fc2584c 100644 --- a/lapack-netlib/SRC/cgtcon.c +++ b/lapack-netlib/SRC/cgtcon.c @@ -653,7 +653,7 @@ f"> */ /* > \ingroup complexGTcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgtcon_(char *norm, integer *n, complex *dl, complex * +/* Subroutine */ void cgtcon_(char *norm, integer *n, complex *dl, complex * d__, complex *du, complex *du2, integer *ipiv, real *anorm, real * rcond, complex *work, integer *info) { @@ -664,11 +664,12 @@ f"> */ integer kase, kase1, i__; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real - *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real + *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real ainvnm; logical onenrm; - extern /* Subroutine */ int cgttrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void cgttrs_(char *, integer *, integer *, complex *, complex *, complex *, complex *, integer *, complex *, integer *, integer *); @@ -705,7 +706,7 @@ f"> */ if 
(*info != 0) { i__1 = -(*info); xerbla_("CGTCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -713,9 +714,9 @@ f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm == 0.f) { - return 0; + return; } /* Check that D(1:N) is non-zero. */ @@ -724,7 +725,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__; if (d__[i__2].r == 0.f && d__[i__2].i == 0.f) { - return 0; + return; } /* L10: */ } @@ -761,7 +762,7 @@ f"> */ *rcond = 1.f / ainvnm / *anorm; } - return 0; + return; /* End of CGTCON */ diff --git a/lapack-netlib/SRC/cgtrfs.c b/lapack-netlib/SRC/cgtrfs.c index 6010d834ef..dd0316af41 100644 --- a/lapack-netlib/SRC/cgtrfs.c +++ b/lapack-netlib/SRC/cgtrfs.c @@ -724,7 +724,7 @@ f"> */ /* > \ingroup complexGTcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgtrfs_(char *trans, integer *n, integer *nrhs, complex * +/* Subroutine */ void cgtrfs_(char *trans, integer *n, integer *nrhs, complex * dl, complex *d__, complex *du, complex *dlf, complex *df, complex * duf, complex *du2, integer *ipiv, complex *b, integer *ldb, complex * x, integer *ldx, real *ferr, real *berr, complex *work, real *rwork, @@ -744,11 +744,11 @@ f"> */ real s; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); integer count; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *), clagtm_(char *, integer *, integer *, real *, complex *, complex *, complex *, complex *, integer *, real *, complex *, integer *); @@ -758,7 +758,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); 
logical notran; char transn[1]; - extern /* Subroutine */ int cgttrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void cgttrs_(char *, integer *, integer *, complex *, complex *, complex *, complex *, integer *, complex *, integer *, integer *); char transt[1]; @@ -814,7 +814,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGTRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -826,7 +826,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } if (notran) { @@ -1131,7 +1131,7 @@ f"> */ /* L110: */ } - return 0; + return; /* End of CGTRFS */ diff --git a/lapack-netlib/SRC/cgtsv.c b/lapack-netlib/SRC/cgtsv.c index 1bbbb35d04..acff81cdf8 100644 --- a/lapack-netlib/SRC/cgtsv.c +++ b/lapack-netlib/SRC/cgtsv.c @@ -633,7 +633,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexGTsolve */ /* ===================================================================== */ -/* Subroutine */ int cgtsv_(integer *n, integer *nrhs, complex *dl, complex * +/* Subroutine */ void cgtsv_(integer *n, integer *nrhs, complex *dl, complex * d__, complex *du, complex *b, integer *ldb, integer *info) { /* System generated locals */ @@ -676,11 +676,11 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CGTSV ", &i__1, (ftnlen)6); - return 0; + return; } if (*n == 0) { - return 0; + return; } i__1 = *n - 1; @@ -697,7 +697,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* solution can not be found. */ *info = k; - return 0; + return; } } else /* if(complicated condition) */ { i__2 = k; @@ -785,7 +785,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ i__1 = *n; if (d__[i__1].r == 0.f && d__[i__1].i == 0.f) { *info = *n; - return 0; + return; } /* Back solve with the matrix U from the factorization. 
*/ @@ -826,7 +826,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* L50: */ } - return 0; + return; /* End of CGTSV */ diff --git a/lapack-netlib/SRC/cgtsvx.c b/lapack-netlib/SRC/cgtsvx.c index 2bde80f9d8..93956c7678 100644 --- a/lapack-netlib/SRC/cgtsvx.c +++ b/lapack-netlib/SRC/cgtsvx.c @@ -805,7 +805,7 @@ f"> */ /* > \ingroup complexGTsolve */ /* ===================================================================== */ -/* Subroutine */ int cgtsvx_(char *fact, char *trans, integer *n, integer * +/* Subroutine */ void cgtsvx_(char *fact, char *trans, integer *n, integer * nrhs, complex *dl, complex *d__, complex *du, complex *dlf, complex * df, complex *duf, complex *du2, integer *ipiv, complex *b, integer * ldb, complex *x, integer *ldx, real *rcond, real *ferr, real *berr, @@ -818,22 +818,23 @@ f"> */ char norm[1]; extern logical lsame_(char *, char *); real anorm; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); extern real slamch_(char *), clangt_(char *, integer *, complex *, complex *, complex *); logical nofact; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), cgtcon_(char *, integer *, complex *, complex *, complex *, complex *, integer *, - real *, real *, complex *, integer *), xerbla_(char *, - integer *, ftnlen), cgtrfs_(char *, integer *, integer *, complex + real *, real *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void cgtrfs_(char *, integer *, integer *, complex *, complex *, complex *, complex *, complex *, complex *, complex *, integer *, complex *, integer *, complex *, integer *, real *, real *, complex *, real *, integer *), cgttrf_(integer *, complex *, complex *, complex *, complex *, integer *, integer *); 
logical notran; - extern /* Subroutine */ int cgttrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void cgttrs_(char *, integer *, integer *, complex *, complex *, complex *, complex *, integer *, complex *, integer *, integer *); @@ -888,7 +889,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGTSVX", &i__1, (ftnlen)6); - return 0; + return; } if (nofact) { @@ -908,7 +909,7 @@ f"> */ if (*info > 0) { *rcond = 0.f; - return 0; + return; } } @@ -945,7 +946,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of CGTSVX */ diff --git a/lapack-netlib/SRC/cgttrf.c b/lapack-netlib/SRC/cgttrf.c index fdd5c0206d..2b6aa9ce8a 100644 --- a/lapack-netlib/SRC/cgttrf.c +++ b/lapack-netlib/SRC/cgttrf.c @@ -633,7 +633,7 @@ f"> */ /* > \ingroup complexGTcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgttrf_(integer *n, complex *dl, complex *d__, complex * +/* Subroutine */ void cgttrf_(integer *n, complex *dl, complex *d__, complex * du, complex *du2, integer *ipiv, integer *info) { /* System generated locals */ @@ -669,13 +669,13 @@ f"> */ *info = -1; i__1 = -(*info); xerbla_("CGTTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Initialize IPIV(i) = i and DU2(i) = 0 */ @@ -811,7 +811,7 @@ f"> */ } L50: - return 0; + return; /* End of CGTTRF */ diff --git a/lapack-netlib/SRC/cgttrs.c b/lapack-netlib/SRC/cgttrs.c index 5d6de8f0ff..c846fae8b7 100644 --- a/lapack-netlib/SRC/cgttrs.c +++ b/lapack-netlib/SRC/cgttrs.c @@ -651,7 +651,7 @@ f"> */ /* > \ingroup complexGTcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgttrs_(char *trans, integer *n, integer *nrhs, complex * +/* Subroutine */ void cgttrs_(char *trans, integer *n, integer *nrhs, complex * dl, complex *d__, complex *du, complex *du2, integer *ipiv, complex * b, integer *ldb, integer *info) { 
@@ -660,7 +660,7 @@ f"> */ /* Local variables */ integer j; - extern /* Subroutine */ int cgtts2_(integer *, integer *, integer *, + extern /* Subroutine */ void cgtts2_(integer *, integer *, integer *, complex *, complex *, complex *, complex *, integer *, complex *, integer *); integer jb, nb; @@ -707,13 +707,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CGTTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* Decode TRANS */ @@ -756,6 +756,6 @@ f"> */ /* End of CGTTRS */ - return 0; + return; } /* cgttrs_ */ diff --git a/lapack-netlib/SRC/cgtts2.c b/lapack-netlib/SRC/cgtts2.c index 25e5245903..153537493d 100644 --- a/lapack-netlib/SRC/cgtts2.c +++ b/lapack-netlib/SRC/cgtts2.c @@ -638,7 +638,7 @@ f"> */ /* > \ingroup complexGTcomputational */ /* ===================================================================== */ -/* Subroutine */ int cgtts2_(integer *itrans, integer *n, integer *nrhs, +/* Subroutine */ void cgtts2_(integer *itrans, integer *n, integer *nrhs, complex *dl, complex *d__, complex *du, complex *du2, integer *ipiv, complex *b, integer *ldb) { @@ -674,7 +674,7 @@ f"> */ /* Function Body */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (*itrans == 0) { @@ -1142,6 +1142,6 @@ f"> */ /* End of CGTTS2 */ - return 0; + return; } /* cgtts2_ */ diff --git a/lapack-netlib/SRC/chb2st_kernels.c b/lapack-netlib/SRC/chb2st_kernels.c index 99e54dfce9..58f3d3d942 100644 --- a/lapack-netlib/SRC/chb2st_kernels.c +++ b/lapack-netlib/SRC/chb2st_kernels.c @@ -680,7 +680,7 @@ kernels.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int chb2st_kernels_(char *uplo, logical *wantz, integer * +/* Subroutine */ void chb2st_kernels_(char *uplo, logical *wantz, integer * ttype, integer *st, integer *ed, integer *sweep, integer *n, integer * nb, integer *ib, complex *a, integer *lda, complex *v, 
complex *tau, integer *ldvt, complex *work) @@ -695,10 +695,10 @@ kernels.f"> */ extern logical lsame_(char *, char *); logical upper; integer j1, j2, lm, ln; - extern /* Subroutine */ int clarfg_(integer *, complex *, complex *, + extern /* Subroutine */ void clarfg_(integer *, complex *, complex *, integer *, complex *); integer ajeter; - extern /* Subroutine */ int clarfx_(char *, integer *, integer *, complex + extern /* Subroutine */ void clarfx_(char *, integer *, integer *, complex *, complex *, complex *, integer *, complex *), clarfy_( char *, integer *, complex *, integer *, complex *, complex *, integer *, complex *); @@ -917,7 +917,7 @@ kernels.f"> */ } } - return 0; + return; /* END OF CHB2ST_KERNELS */ diff --git a/lapack-netlib/SRC/chbev.c b/lapack-netlib/SRC/chbev.c index 228212d68c..d5a40cc93a 100644 --- a/lapack-netlib/SRC/chbev.c +++ b/lapack-netlib/SRC/chbev.c @@ -666,7 +666,7 @@ atrices */ /* > \ingroup complexOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int chbev_(char *jobz, char *uplo, integer *n, integer *kd, +/* Subroutine */ void chbev_(char *jobz, char *uplo, integer *n, integer *kd, complex *ab, integer *ldab, real *w, complex *z__, integer *ldz, complex *work, real *rwork, integer *info) { @@ -681,12 +681,12 @@ atrices */ real rmin, rmax, sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); logical lower, wantz; extern real clanhb_(char *, char *, integer *, integer *, complex *, integer *, real *); integer iscale; - extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *), chbtrd_(char *, char *, integer *, integer *, complex *, integer *, real *, real *, 
complex *, integer *, complex *, integer *); @@ -695,7 +695,7 @@ atrices */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; integer indrwk; - extern /* Subroutine */ int csteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void csteqr_(char *, integer *, real *, real *, complex *, integer *, real *, integer *), ssterf_(integer *, real *, real *, integer *); real smlnum, eps; @@ -745,13 +745,13 @@ atrices */ if (*info != 0) { i__1 = -(*info); xerbla_("CHBEV ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -766,7 +766,7 @@ atrices */ i__1 = z_dim1 + 1; z__[i__1].r = 1.f, z__[i__1].i = 0.f; } - return 0; + return; } /* Get machine constants. */ @@ -827,7 +827,7 @@ atrices */ sscal_(&imax, &r__1, &w[1], &c__1); } - return 0; + return; /* End of CHBEV */ diff --git a/lapack-netlib/SRC/chbev_2stage.c b/lapack-netlib/SRC/chbev_2stage.c index e39b8ee5c3..6e75065d8e 100644 --- a/lapack-netlib/SRC/chbev_2stage.c +++ b/lapack-netlib/SRC/chbev_2stage.c @@ -729,7 +729,7 @@ stage.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int chbev_2stage_(char *jobz, char *uplo, integer *n, +/* Subroutine */ void chbev_2stage_(char *jobz, char *uplo, integer *n, integer *kd, complex *ab, integer *ldab, real *w, complex *z__, integer *ldz, complex *work, integer *lwork, real *rwork, integer * info) @@ -739,7 +739,7 @@ stage.f"> */ real r__1; /* Local variables */ - extern /* Subroutine */ int chetrd_hb2st_(char *, char *, char *, + extern /* Subroutine */ void chetrd_hb2st_(char *, char *, char *, integer *, integer *, complex *, integer *, real *, real *, complex *, integer *, complex *, integer *, integer *); integer inde; @@ -750,7 +750,7 @@ stage.f"> */ real rmin, rmax, sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, 
real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer lhtrd, lwmin; logical lower; integer lwtrd; @@ -759,14 +759,14 @@ stage.f"> */ extern real clanhb_(char *, char *, integer *, integer *, complex *, integer *, real *); integer iscale; - extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *); extern real slamch_(char *); real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; integer indwrk, indrwk; - extern /* Subroutine */ int csteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void csteqr_(char *, integer *, real *, real *, complex *, integer *, real *, integer *), ssterf_(integer *, real *, real *, integer *); integer llwork; @@ -842,15 +842,15 @@ stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHBEV_2STAGE ", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -865,7 +865,7 @@ stage.f"> */ i__1 = z_dim1 + 1; z__[i__1].r = 1.f, z__[i__1].i = 0.f; } - return 0; + return; } /* Get machine constants. 
*/ @@ -935,7 +935,7 @@ stage.f"> */ work[1].r = (real) lwmin, work[1].i = 0.f; - return 0; + return; /* End of CHBEV_2STAGE */ diff --git a/lapack-netlib/SRC/chbevd.c b/lapack-netlib/SRC/chbevd.c index 9b0c599db7..a15df5cf48 100644 --- a/lapack-netlib/SRC/chbevd.c +++ b/lapack-netlib/SRC/chbevd.c @@ -731,7 +731,7 @@ f"> */ /* > \ingroup complexOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int chbevd_(char *jobz, char *uplo, integer *n, integer *kd, +/* Subroutine */ void chbevd_(char *jobz, char *uplo, integer *n, integer *kd, complex *ab, integer *ldab, real *w, complex *z__, integer *ldz, complex *work, integer *lwork, real *rwork, integer *lrwork, integer * iwork, integer *liwork, integer *info) @@ -746,13 +746,13 @@ f"> */ integer imax; real rmin, rmax; integer llwk2; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); real sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer lwmin; logical lower; integer llrwk; @@ -761,20 +761,20 @@ f"> */ extern real clanhb_(char *, char *, integer *, integer *, complex *, integer *, real *); integer iscale; - extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *), cstedc_(char *, integer *, real *, real *, complex *, integer *, complex *, integer *, real *, integer *, integer *, integer *, integer *), chbtrd_(char *, char *, integer *, integer *, complex *, integer *, real *, real *, complex *, integer *, complex *, integer *); extern real 
slamch_(char *); - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *); real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; integer indwrk, liwmin; - extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *); + extern /* Subroutine */ void ssterf_(integer *, real *, real *, integer *); integer lrwmin; real smlnum; logical lquery; @@ -860,15 +860,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHBEVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -878,7 +878,7 @@ f"> */ i__1 = z_dim1 + 1; z__[i__1].r = 1.f, z__[i__1].i = 0.f; } - return 0; + return; } /* Get machine constants. */ @@ -948,7 +948,7 @@ f"> */ work[1].r = (real) lwmin, work[1].i = 0.f; rwork[1] = (real) lrwmin; iwork[1] = liwmin; - return 0; + return; /* End of CHBEVD */ diff --git a/lapack-netlib/SRC/chbevd_2stage.c b/lapack-netlib/SRC/chbevd_2stage.c index 176ff6d7f6..d5dbd357cd 100644 --- a/lapack-netlib/SRC/chbevd_2stage.c +++ b/lapack-netlib/SRC/chbevd_2stage.c @@ -779,7 +779,7 @@ static integer c__1 = 1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int chbevd_2stage_(char *jobz, char *uplo, integer *n, +/* Subroutine */ void chbevd_2stage_(char *jobz, char *uplo, integer *n, integer *kd, complex *ab, integer *ldab, real *w, complex *z__, integer *ldz, complex *work, integer *lwork, real *rwork, integer * lrwork, integer *iwork, integer *liwork, integer *info) @@ -789,7 +789,7 @@ static integer c__1 = 1; real r__1; /* Local variables */ - extern /* Subroutine */ int chetrd_hb2st_(char *, char *, char *, + extern /* Subroutine */ void chetrd_hb2st_(char *, char *, char *, integer *, integer *, complex *, 
integer *, real *, real *, complex *, integer *, complex *, integer *, integer *); integer inde; @@ -799,13 +799,13 @@ static integer c__1 = 1; integer imax; real rmin, rmax; integer llwk2; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); real sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer indwk, lhtrd, lwmin; logical lower; integer lwtrd, llrwk; @@ -814,18 +814,18 @@ static integer c__1 = 1; extern real clanhb_(char *, char *, integer *, integer *, complex *, integer *, real *); integer iscale; - extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *), cstedc_(char *, integer *, real *, real *, complex *, integer *, complex *, integer *, real *, integer *, integer *, integer *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *); real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; integer indrwk, liwmin; - extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *); + extern /* Subroutine */ void ssterf_(integer *, real *, real *, integer *); integer lrwmin, llwork; real smlnum; logical lquery; @@ -918,15 +918,15 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("CHBEVD_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible 
*/ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -936,7 +936,7 @@ static integer c__1 = 1; i__1 = z_dim1 + 1; z__[i__1].r = 1.f, z__[i__1].i = 0.f; } - return 0; + return; } /* Get machine constants. */ @@ -1011,7 +1011,7 @@ static integer c__1 = 1; work[1].r = (real) lwmin, work[1].i = 0.f; rwork[1] = (real) lrwmin; iwork[1] = liwmin; - return 0; + return; /* End of CHBEVD_2STAGE */ diff --git a/lapack-netlib/SRC/chbevx.c b/lapack-netlib/SRC/chbevx.c index f9c8678537..e854802569 100644 --- a/lapack-netlib/SRC/chbevx.c +++ b/lapack-netlib/SRC/chbevx.c @@ -782,7 +782,7 @@ f"> */ /* > \ingroup complexOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int chbevx_(char *jobz, char *range, char *uplo, integer *n, +/* Subroutine */ void chbevx_(char *jobz, char *range, char *uplo, integer *n, integer *kd, complex *ab, integer *ldab, complex *q, integer *ldq, real *vl, real *vu, integer *il, integer *iu, real *abstol, integer * m, real *w, complex *z__, integer *ldz, complex *work, real *rwork, @@ -803,17 +803,17 @@ f"> */ integer itmp1, i__, j, indee; real sigma; extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); char order[1]; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), cswap_(integer *, complex *, integer *, complex *, integer *); logical lower; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, 
integer *); logical wantz; integer jj; @@ -821,27 +821,27 @@ f"> */ integer *, real *); logical alleig, indeig; integer iscale, indibl; - extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *), chbtrd_(char *, char *, integer *, integer *, complex *, integer *, real *, real *, complex *, integer *, complex *, integer *); logical valeig; extern real slamch_(char *); - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *); real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real abstll, bignum; integer indiwk, indisp; - extern /* Subroutine */ int cstein_(integer *, real *, real *, integer *, + extern /* Subroutine */ void cstein_(integer *, real *, real *, integer *, real *, integer *, integer *, complex *, integer *, real *, integer *, integer *, integer *); integer indrwk, indwrk; - extern /* Subroutine */ int csteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void csteqr_(char *, integer *, real *, real *, complex *, integer *, real *, integer *), ssterf_(integer *, real *, real *, integer *); integer nsplit; - extern /* Subroutine */ int sstebz_(char *, char *, integer *, real *, + extern /* Subroutine */ void sstebz_(char *, char *, integer *, real *, real *, integer *, integer *, real *, real *, real *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); @@ -919,14 +919,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHBEVX", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -951,7 +951,7 @@ f"> */ z__[i__1].r = 1.f, z__[i__1].i = 0.f; } } - return 0; + return; } /* Get machine constants. 
*/ @@ -1127,7 +1127,7 @@ f"> */ } } - return 0; + return; /* End of CHBEVX */ diff --git a/lapack-netlib/SRC/chbevx_2stage.c b/lapack-netlib/SRC/chbevx_2stage.c index 96f3aa7a2e..1572222572 100644 --- a/lapack-netlib/SRC/chbevx_2stage.c +++ b/lapack-netlib/SRC/chbevx_2stage.c @@ -845,7 +845,7 @@ static integer c__1 = 1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int chbevx_2stage_(char *jobz, char *range, char *uplo, +/* Subroutine */ void chbevx_2stage_(char *jobz, char *range, char *uplo, integer *n, integer *kd, complex *ab, integer *ldab, complex *q, integer *ldq, real *vl, real *vu, integer *il, integer *iu, real * abstol, integer *m, real *w, complex *z__, integer *ldz, complex * @@ -858,7 +858,7 @@ static integer c__1 = 1; real r__1, r__2; /* Local variables */ - extern /* Subroutine */ int chetrd_hb2st_(char *, char *, char *, + extern /* Subroutine */ void chetrd_hb2st_(char *, char *, char *, integer *, integer *, complex *, integer *, real *, real *, complex *, integer *, complex *, integer *, integer *); integer indd, inde; @@ -872,20 +872,20 @@ static integer c__1 = 1; integer itmp1, i__, j, indee; real sigma; extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); char order[1]; integer lhtrd; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), cswap_(integer *, complex *, integer *, complex *, integer *); integer lwmin; logical lower; integer lwtrd; - extern /* Subroutine */ int 
scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); logical wantz; integer ib, jj; @@ -893,26 +893,26 @@ static integer c__1 = 1; integer *, real *); logical alleig, indeig; integer iscale, indibl; - extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *); logical valeig; extern real slamch_(char *); - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *); real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real abstll, bignum; integer indiwk, indisp; - extern /* Subroutine */ int cstein_(integer *, real *, real *, integer *, + extern /* Subroutine */ void cstein_(integer *, real *, real *, integer *, real *, integer *, integer *, complex *, integer *, real *, integer *, integer *, integer *); integer indrwk, indwrk; - extern /* Subroutine */ int csteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void csteqr_(char *, integer *, real *, real *, complex *, integer *, real *, integer *), ssterf_(integer *, real *, real *, integer *); integer nsplit, llwork; real smlnum; - extern /* Subroutine */ int sstebz_(char *, char *, integer *, real *, + extern /* Subroutine */ void sstebz_(char *, char *, integer *, real *, real *, integer *, integer *, real *, real *, real *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); @@ -1015,16 +1015,16 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("CHBEVX_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -1049,7 +1049,7 @@ static 
integer c__1 = 1; z__[i__1].r = 1.f, z__[i__1].i = 0.f; } } - return 0; + return; } /* Get machine constants. */ @@ -1234,7 +1234,7 @@ static integer c__1 = 1; work[1].r = (real) lwmin, work[1].i = 0.f; - return 0; + return; /* End of CHBEVX_2STAGE */ diff --git a/lapack-netlib/SRC/chbgst.c b/lapack-netlib/SRC/chbgst.c index 6836059ca5..416ce9ec4c 100644 --- a/lapack-netlib/SRC/chbgst.c +++ b/lapack-netlib/SRC/chbgst.c @@ -679,7 +679,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int chbgst_(char *vect, char *uplo, integer *n, integer *ka, +/* Subroutine */ void chbgst_(char *vect, char *uplo, integer *n, integer *ka, integer *kb, complex *ab, integer *ldab, complex *bb, integer *ldbb, complex *x, integer *ldx, complex *work, real *rwork, integer *info) { @@ -691,32 +691,33 @@ f"> */ /* Local variables */ integer inca; - extern /* Subroutine */ int crot_(integer *, complex *, integer *, + extern /* Subroutine */ void crot_(integer *, complex *, integer *, complex *, integer *, real *, complex *); integer i__, j, k, l, m; - extern /* Subroutine */ int cgerc_(integer *, integer *, complex *, + extern /* Subroutine */ void cgerc_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *); complex t; extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgeru_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeru_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *); integer i0, i1; logical upper; integer i2, j1, j2; logical wantx; - extern /* Subroutine */ int clar2v_(integer *, complex *, complex *, + extern /* Subroutine */ void clar2v_(integer *, complex *, complex *, complex *, integer *, real *, complex *, integer *); complex ra; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *); + extern /* Subroutine */ void clacgv_(integer 
*, complex *, integer *); integer nr, nx; - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *), claset_(char *, integer *, integer *, complex *, complex *, complex *, integer *), clartg_(complex *, complex *, real - *, complex *, complex *), xerbla_(char *, integer *, ftnlen), - clargv_(integer *, complex *, integer *, complex *, integer *, + *, complex *, complex *); + extern int xerbla_(char *, integer *, ftnlen); + extern void clargv_(integer *, complex *, integer *, complex *, integer *, real *, integer *); logical update; - extern /* Subroutine */ int clartv_(integer *, complex *, integer *, + extern /* Subroutine */ void clartv_(integer *, complex *, integer *, complex *, integer *, real *, complex *, integer *); integer ka1, kb1; complex ra1; @@ -775,13 +776,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHBGST", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } inca = *ldab * ka1; @@ -1811,14 +1812,14 @@ f"> */ --i__; i0 = m + 1; if (*ka == 0) { - return 0; + return; } goto L490; } } else { i__ -= *ka; if (i__ < 2) { - return 0; + return; } } diff --git a/lapack-netlib/SRC/chbgv.c b/lapack-netlib/SRC/chbgv.c index 4d5cbc1cf8..1a5c67c606 100644 --- a/lapack-netlib/SRC/chbgv.c +++ b/lapack-netlib/SRC/chbgv.c @@ -691,7 +691,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int chbgv_(char *jobz, char *uplo, integer *n, integer *ka, +/* Subroutine */ void chbgv_(char *jobz, char *uplo, integer *n, integer *ka, integer *kb, complex *ab, integer *ldab, complex *bb, integer *ldbb, real *w, complex *z__, integer *ldz, complex *work, real *rwork, integer *info) @@ -705,14 +705,16 @@ static inline void zdotu_(doublecomplex *z, 
integer *n_, doublecomplex *x, integ extern logical lsame_(char *, char *); integer iinfo; logical upper, wantz; - extern /* Subroutine */ int chbtrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void chbtrd_(char *, char *, integer *, integer *, complex *, integer *, real *, real *, complex *, integer *, complex *, integer *), chbgst_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, - integer *, complex *, integer *, complex *, real *, integer *), xerbla_(char *, integer *, ftnlen), cpbstf_(char + integer *, complex *, integer *, complex *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void cpbstf_(char *, integer *, integer *, complex *, integer *, integer *); integer indwrk; - extern /* Subroutine */ int csteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void csteqr_(char *, integer *, real *, real *, complex *, integer *, real *, integer *), ssterf_(integer *, real *, real *, integer *); @@ -767,13 +769,13 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CHBGV ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a split Cholesky factorization of B. */ @@ -781,7 +783,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ cpbstf_(uplo, n, kb, &bb[bb_offset], ldbb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem. 
*/ @@ -809,7 +811,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ csteqr_(jobz, n, &w[1], &rwork[inde], &z__[z_offset], ldz, &rwork[ indwrk], info); } - return 0; + return; /* End of CHBGV */ diff --git a/lapack-netlib/SRC/chbgvd.c b/lapack-netlib/SRC/chbgvd.c index 6fb7e6795c..c1aada39fe 100644 --- a/lapack-netlib/SRC/chbgvd.c +++ b/lapack-netlib/SRC/chbgvd.c @@ -764,7 +764,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int chbgvd_(char *jobz, char *uplo, integer *n, integer *ka, +/* Subroutine */ void chbgvd_(char *jobz, char *uplo, integer *n, integer *ka, integer *kb, complex *ab, integer *ldab, complex *bb, integer *ldbb, real *w, complex *z__, integer *ldz, complex *work, integer *lwork, real *rwork, integer *lrwork, integer *iwork, integer *liwork, @@ -777,7 +777,7 @@ f"> */ integer inde; char vect[1]; integer llwk2; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); @@ -786,7 +786,7 @@ f"> */ integer llrwk; logical wantz; integer indwk2; - extern /* Subroutine */ int cstedc_(char *, integer *, real *, real *, + extern /* Subroutine */ void cstedc_(char *, integer *, real *, real *, complex *, integer *, complex *, integer *, real *, integer *, integer *, integer *, integer *), chbtrd_(char *, char *, integer *, integer *, complex *, integer *, real *, real *, @@ -794,11 +794,12 @@ f"> */ chbgst_(char *, char *, integer *, integer *, integer *, complex * , integer *, complex *, integer *, complex *, integer *, complex * , real *, integer *), clacpy_(char *, integer *, - integer *, complex *, integer *, complex *, integer *), - xerbla_(char *, integer *, 
ftnlen), cpbstf_(char *, integer *, + integer *, complex *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void cpbstf_(char *, integer *, integer *, complex *, integer *, integer *); integer indwrk, liwmin; - extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *); + extern /* Subroutine */ void ssterf_(integer *, real *, real *, integer *); integer lrwmin; logical lquery; @@ -887,15 +888,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHBGVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a split Cholesky factorization of B. */ @@ -903,7 +904,7 @@ f"> */ cpbstf_(uplo, n, kb, &bb[bb_offset], ldbb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem. */ @@ -941,7 +942,7 @@ f"> */ work[1].r = (real) lwmin, work[1].i = 0.f; rwork[1] = (real) lrwmin; iwork[1] = liwmin; - return 0; + return; /* End of CHBGVD */ diff --git a/lapack-netlib/SRC/chbgvx.c b/lapack-netlib/SRC/chbgvx.c index eb81c8a2f1..191399fa57 100644 --- a/lapack-netlib/SRC/chbgvx.c +++ b/lapack-netlib/SRC/chbgvx.c @@ -813,7 +813,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. 
of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int chbgvx_(char *jobz, char *range, char *uplo, integer *n, +/* Subroutine */ void chbgvx_(char *jobz, char *range, char *uplo, integer *n, integer *ka, integer *kb, complex *ab, integer *ldab, complex *bb, integer *ldbb, complex *q, integer *ldq, real *vl, real *vu, integer * il, integer *iu, real *abstol, integer *m, real *w, complex *z__, @@ -830,41 +830,43 @@ f"> */ logical test; integer itmp1, i__, j, indee; extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *); integer iinfo; char order[1]; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), cswap_(integer *, complex *, integer *, complex *, integer *); logical upper; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); logical wantz; integer jj; logical alleig, indeig; integer indibl; - extern /* Subroutine */ int chbtrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void chbtrd_(char *, char *, integer *, integer *, complex *, integer *, real *, real *, complex *, integer *, complex *, integer *); logical valeig; - extern /* Subroutine */ int chbgst_(char *, char *, integer *, integer *, + extern /* Subroutine */ void chbgst_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, real *, integer *), clacpy_( char *, integer *, integer *, complex *, integer *, complex *, - integer *), xerbla_(char *, integer *, ftnlen), cpbstf_( + integer *); + extern int xerbla_(char *, 
integer *, ftnlen); + extern void cpbstf_( char *, integer *, integer *, complex *, integer *, integer *); integer indiwk, indisp; - extern /* Subroutine */ int cstein_(integer *, real *, real *, integer *, + extern /* Subroutine */ void cstein_(integer *, real *, real *, integer *, real *, integer *, integer *, complex *, integer *, real *, integer *, integer *, integer *); integer indrwk, indwrk; - extern /* Subroutine */ int csteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void csteqr_(char *, integer *, real *, real *, complex *, integer *, real *, integer *), ssterf_(integer *, real *, real *, integer *); integer nsplit; - extern /* Subroutine */ int sstebz_(char *, char *, integer *, real *, + extern /* Subroutine */ void sstebz_(char *, char *, integer *, real *, real *, integer *, integer *, real *, real *, real *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); @@ -949,14 +951,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHBGVX", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } /* Form a split Cholesky factorization of B. */ @@ -964,7 +966,7 @@ f"> */ cpbstf_(uplo, n, kb, &bb[bb_offset], ldbb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem. */ @@ -1092,7 +1094,7 @@ f"> */ } } - return 0; + return; /* End of CHBGVX */ diff --git a/lapack-netlib/SRC/chbgvx.f b/lapack-netlib/SRC/chbgvx.f index 57cf51a551..6b37a4127a 100644 --- a/lapack-netlib/SRC/chbgvx.f +++ b/lapack-netlib/SRC/chbgvx.f @@ -327,7 +327,7 @@ SUBROUTINE CHBGVX( JOBZ, RANGE, UPLO, N, KA, KB, AB, LDAB, BB, * .. Local Scalars .. LOGICAL ALLEIG, INDEIG, TEST, UPPER, VALEIG, WANTZ CHARACTER ORDER, VECT - INTEGER I, IINFO, INDD, INDE, INDEE, INDIBL, INDISP, + INTEGER I, IINFO, INDD, INDE, INDEE, INDISP, $ INDIWK, INDRWK, INDWRK, ITMP1, J, JJ, NSPLIT REAL TMP1 * .. 
@@ -470,17 +470,16 @@ SUBROUTINE CHBGVX( JOBZ, RANGE, UPLO, N, KA, KB, AB, LDAB, BB, ELSE ORDER = 'E' END IF - INDIBL = 1 - INDISP = INDIBL + N + INDISP = 1 + N INDIWK = INDISP + N CALL SSTEBZ( RANGE, ORDER, N, VL, VU, IL, IU, ABSTOL, $ RWORK( INDD ), RWORK( INDE ), M, NSPLIT, W, - $ IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ), + $ IWORK( 1 ), IWORK( INDISP ), RWORK( INDRWK ), $ IWORK( INDIWK ), INFO ) * IF( WANTZ ) THEN CALL CSTEIN( N, RWORK( INDD ), RWORK( INDE ), M, W, - $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ, + $ IWORK( 1 ), IWORK( INDISP ), Z, LDZ, $ RWORK( INDRWK ), IWORK( INDIWK ), IFAIL, INFO ) * * Apply unitary matrix used in reduction to tridiagonal @@ -510,11 +509,11 @@ SUBROUTINE CHBGVX( JOBZ, RANGE, UPLO, N, KA, KB, AB, LDAB, BB, 40 CONTINUE * IF( I.NE.0 ) THEN - ITMP1 = IWORK( INDIBL+I-1 ) + ITMP1 = IWORK( 1 + I-1 ) W( I ) = W( J ) - IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 ) + IWORK( 1 + I-1 ) = IWORK( 1 + J-1 ) W( J ) = TMP1 - IWORK( INDIBL+J-1 ) = ITMP1 + IWORK( 1 + J-1 ) = ITMP1 CALL CSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) IF( INFO.NE.0 ) THEN ITMP1 = IFAIL( I ) diff --git a/lapack-netlib/SRC/chbtrd.c b/lapack-netlib/SRC/chbtrd.c index 2c91004e39..d748da9172 100644 --- a/lapack-netlib/SRC/chbtrd.c +++ b/lapack-netlib/SRC/chbtrd.c @@ -677,7 +677,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int chbtrd_(char *vect, char *uplo, integer *n, integer *kd, +/* Subroutine */ void chbtrd_(char *vect, char *uplo, integer *n, integer *kd, complex *ab, integer *ldab, real *d__, real *e, complex *q, integer * ldq, complex *work, integer *info) { @@ -692,24 +692,25 @@ f"> */ real abst; integer incx, last; complex temp; - extern /* Subroutine */ int crot_(integer *, complex *, integer *, + extern /* Subroutine */ void crot_(integer *, complex *, integer *, complex *, integer *, real *, complex *); integer j1end, j1inc, i__, j, k, l; complex t; - extern /* Subroutine 
*/ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); integer iqend; extern logical lsame_(char *, char *); logical initq, wantq, upper; integer i2, j1, j2; - extern /* Subroutine */ int clar2v_(integer *, complex *, complex *, + extern /* Subroutine */ void clar2v_(integer *, complex *, complex *, complex *, integer *, real *, complex *, integer *); integer nq, nr, iqaend; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *), + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *), claset_(char *, integer *, integer *, complex *, complex *, complex *, integer *), clartg_(complex *, complex *, real - *, complex *, complex *), xerbla_(char *, integer *, ftnlen), - clargv_(integer *, complex *, integer *, complex *, integer *, + *, complex *, complex *); + extern int xerbla_(char *, integer *, ftnlen); + extern void clargv_(integer *, complex *, integer *, complex *, integer *, real *, integer *), clartv_(integer *, complex *, integer *, complex *, integer *, real *, complex *, integer *); integer kd1, ibl, iqb, kdn, jin, nrt, kdm1; @@ -763,13 +764,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHBTRD", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Initialize Q to the unit matrix, if needed */ @@ -1367,7 +1368,7 @@ f"> */ } } - return 0; + return; /* End of CHBTRD */ diff --git a/lapack-netlib/SRC/checon.c b/lapack-netlib/SRC/checon.c index 4cfcea0fdf..8539008965 100644 --- a/lapack-netlib/SRC/checon.c +++ b/lapack-netlib/SRC/checon.c @@ -637,7 +637,7 @@ f"> */ /* > \ingroup complexHEcomputational */ /* ===================================================================== */ -/* Subroutine */ int checon_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void checon_(char *uplo, integer *n, complex *a, integer *lda, integer *ipiv, real *anorm, real *rcond, complex 
*work, integer * info) { @@ -649,10 +649,11 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real - *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real + *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int chetrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void chetrs_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); @@ -689,7 +690,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHECON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -697,9 +698,9 @@ f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm <= 0.f) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -711,7 +712,7 @@ f"> */ for (i__ = *n; i__ >= 1; --i__) { i__1 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__1].r == 0.f && a[i__1].i == 0.f)) { - return 0; + return; } /* L10: */ } @@ -723,7 +724,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__2].r == 0.f && a[i__2].i == 0.f)) { - return 0; + return; } /* L20: */ } @@ -749,7 +750,7 @@ f"> */ *rcond = 1.f / ainvnm / *anorm; } - return 0; + return; /* End of CHECON */ diff --git a/lapack-netlib/SRC/checon_3.c b/lapack-netlib/SRC/checon_3.c index 5794b8c7da..b5318afc46 100644 --- a/lapack-netlib/SRC/checon_3.c +++ b/lapack-netlib/SRC/checon_3.c @@ -678,7 +678,7 @@ static integer c__1 = 1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int checon_3_(char *uplo, integer *n, complex *a, integer * +/* Subroutine */ void checon_3_(char *uplo, integer *n, complex *a, integer * lda, complex *e, integer *ipiv, real 
*anorm, real *rcond, complex * work, integer *info) { @@ -690,10 +690,11 @@ static integer c__1 = 1; extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real - *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real + *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int chetrs_3_(char *, integer *, integer *, + extern /* Subroutine */ void chetrs_3_(char *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); @@ -732,7 +733,7 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("CHECON_3", &i__1, (ftnlen)8); - return 0; + return; } /* Quick return if possible */ @@ -740,9 +741,9 @@ static integer c__1 = 1; *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm <= 0.f) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. 
*/ @@ -754,7 +755,7 @@ static integer c__1 = 1; for (i__ = *n; i__ >= 1; --i__) { i__1 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__1].r == 0.f && a[i__1].i == 0.f)) { - return 0; + return; } } } else { @@ -765,7 +766,7 @@ static integer c__1 = 1; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__2].r == 0.f && a[i__2].i == 0.f)) { - return 0; + return; } } } @@ -790,7 +791,7 @@ static integer c__1 = 1; *rcond = 1.f / ainvnm / *anorm; } - return 0; + return; /* End of CHECON_3 */ diff --git a/lapack-netlib/SRC/checon_rook.c b/lapack-netlib/SRC/checon_rook.c index 88a8c80bb6..fda6cc6315 100644 --- a/lapack-netlib/SRC/checon_rook.c +++ b/lapack-netlib/SRC/checon_rook.c @@ -652,7 +652,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int checon_rook_(char *uplo, integer *n, complex *a, +/* Subroutine */ void checon_rook_(char *uplo, integer *n, complex *a, integer *lda, integer *ipiv, real *anorm, real *rcond, complex *work, integer *info) { @@ -664,10 +664,11 @@ rook.f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real - *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real + *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int chetrs_rook_(char *, integer *, integer *, + extern /* Subroutine */ void chetrs_rook_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); @@ -704,7 +705,7 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHECON_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Quick return if possible */ @@ -712,9 +713,9 @@ rook.f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm <= 0.f) { - 
return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -726,7 +727,7 @@ rook.f"> */ for (i__ = *n; i__ >= 1; --i__) { i__1 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__1].r == 0.f && a[i__1].i == 0.f)) { - return 0; + return; } /* L10: */ } @@ -738,7 +739,7 @@ rook.f"> */ for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__2].r == 0.f && a[i__2].i == 0.f)) { - return 0; + return; } /* L20: */ } @@ -764,7 +765,7 @@ rook.f"> */ *rcond = 1.f / ainvnm / *anorm; } - return 0; + return; /* End of CHECON_ROOK */ diff --git a/lapack-netlib/SRC/cheequb.c b/lapack-netlib/SRC/cheequb.c index 53d07b9742..141ceee143 100644 --- a/lapack-netlib/SRC/cheequb.c +++ b/lapack-netlib/SRC/cheequb.c @@ -645,7 +645,7 @@ static integer c__1 = 1; /* > Tech report version: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.3.1679 */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cheequb_(char *uplo, integer *n, complex *a, integer * +/* Subroutine */ void cheequb_(char *uplo, integer *n, complex *a, integer * lda, real *s, real *scond, real *amax, complex *work, integer *info) { /* System generated locals */ @@ -666,7 +666,7 @@ static integer c__1 = 1; extern real slamch_(char *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; - extern /* Subroutine */ int classq_(integer *, complex *, integer *, real + extern /* Subroutine */ void classq_(integer *, complex *, integer *, real *, real *); real smlnum, avg, std, tol; @@ -701,7 +701,7 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("CHEEQUB", &i__1, (ftnlen)7); - return 0; + return; } up = lsame_(uplo, "U"); *amax = 0.f; @@ -710,7 +710,7 @@ static integer c__1 = 1; if (*n == 0) { *scond = 1.f; - return 0; + return; } i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { @@ -905,7 +905,7 @@ static integer c__1 = 1; d__ = c1 * c1 - c0 * 4 * c2; if (d__ <= 0.f) { *info = 
-1; - return 0; + return; } si = c0 * -2 / (c1 + sqrt(d__)); d__ = si - s[i__]; @@ -992,6 +992,6 @@ static integer c__1 = 1; } *scond = f2cmax(smin,smlnum) / f2cmin(smax,bignum); - return 0; + return; } /* cheequb_ */ diff --git a/lapack-netlib/SRC/cheev.c b/lapack-netlib/SRC/cheev.c index 9ef5c529e7..03e05dc60c 100644 --- a/lapack-netlib/SRC/cheev.c +++ b/lapack-netlib/SRC/cheev.c @@ -656,7 +656,7 @@ ices */ /* > \ingroup complexHEeigen */ /* ===================================================================== */ -/* Subroutine */ int cheev_(char *jobz, char *uplo, integer *n, complex *a, +/* Subroutine */ void cheev_(char *jobz, char *uplo, integer *n, complex *a, integer *lda, real *w, complex *work, integer *lwork, real *rwork, integer *info) { @@ -671,16 +671,16 @@ ices */ real rmin, rmax, sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); logical lower, wantz; integer nb; extern real clanhe_(char *, char *, integer *, complex *, integer *, real *); integer iscale; - extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int chetrd_(char *, integer *, complex *, integer + extern /* Subroutine */ void chetrd_(char *, integer *, complex *, integer *, real *, real *, complex *, complex *, integer *, integer *); real safmin; extern integer ilaenv_(integer *, char *, char *, integer *, integer *, @@ -688,7 +688,7 @@ ices */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; integer indtau, indwrk; - extern /* Subroutine */ int csteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void csteqr_(char *, integer *, real *, real *, complex *, integer *, real 
*, integer *), cungtr_(char *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), ssterf_(integer *, real *, real *, integer *); @@ -752,15 +752,15 @@ ices */ if (*info != 0) { i__1 = -(*info); xerbla_("CHEEV ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -771,7 +771,7 @@ ices */ i__1 = a_dim1 + 1; a[i__1].r = 1.f, a[i__1].i = 0.f; } - return 0; + return; } /* Get machine constants. */ @@ -837,7 +837,7 @@ ices */ work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CHEEV */ diff --git a/lapack-netlib/SRC/cheev_2stage.c b/lapack-netlib/SRC/cheev_2stage.c index e1ab3e60f2..4aec6a9335 100644 --- a/lapack-netlib/SRC/cheev_2stage.c +++ b/lapack-netlib/SRC/cheev_2stage.c @@ -708,7 +708,7 @@ stage.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int cheev_2stage_(char *jobz, char *uplo, integer *n, +/* Subroutine */ void cheev_2stage_(char *jobz, char *uplo, integer *n, complex *a, integer *lda, real *w, complex *work, integer *lwork, real *rwork, integer *info) { @@ -723,13 +723,13 @@ stage.f"> */ real anrm; integer imax; real rmin, rmax; - extern /* Subroutine */ int chetrd_2stage_(char *, char *, integer *, + extern /* Subroutine */ void chetrd_2stage_(char *, char *, integer *, complex *, integer *, real *, real *, complex *, complex *, integer *, complex *, integer *, integer *); real sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer lhtrd, lwmin; logical lower; integer lwtrd; @@ -738,14 +738,14 @@ stage.f"> */ extern real clanhe_(char *, char *, integer *, complex *, integer *, real *); integer iscale; - extern /* Subroutine */ int clascl_(char *, 
integer *, integer *, real *, + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *); extern real slamch_(char *); real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; integer indtau, indwrk; - extern /* Subroutine */ int csteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void csteqr_(char *, integer *, real *, real *, complex *, integer *, real *, integer *), cungtr_(char *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), ssterf_(integer *, real *, real *, integer *); @@ -811,15 +811,15 @@ stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHEEV_2STAGE ", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -830,7 +830,7 @@ stage.f"> */ i__1 = a_dim1 + 1; a[i__1].r = 1.f, a[i__1].i = 0.f; } - return 0; + return; } /* Get machine constants. 
*/ @@ -899,7 +899,7 @@ stage.f"> */ work[1].r = (real) lwmin, work[1].i = 0.f; - return 0; + return; /* End of CHEEV_2STAGE */ diff --git a/lapack-netlib/SRC/cheevd.c b/lapack-netlib/SRC/cheevd.c index a6a4e33a7a..0f494f4266 100644 --- a/lapack-netlib/SRC/cheevd.c +++ b/lapack-netlib/SRC/cheevd.c @@ -721,7 +721,7 @@ f"> */ /* > at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cheevd_(char *jobz, char *uplo, integer *n, complex *a, +/* Subroutine */ void cheevd_(char *jobz, char *uplo, integer *n, complex *a, integer *lda, real *w, complex *work, integer *lwork, real *rwork, integer *lrwork, integer *iwork, integer *liwork, integer *info) { @@ -738,7 +738,7 @@ f"> */ real sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer lwmin, liopt; logical lower; integer llrwk, lropt; @@ -747,12 +747,12 @@ f"> */ extern real clanhe_(char *, char *, integer *, complex *, integer *, real *); integer iscale; - extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *), cstedc_(char *, integer *, real *, real *, complex *, integer *, complex *, integer *, real *, integer *, integer *, integer *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int chetrd_(char *, integer *, complex *, integer + extern /* Subroutine */ void chetrd_(char *, integer *, complex *, integer *, real *, real *, complex *, complex *, integer *, integer *), clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *); real safmin; @@ -761,9 +761,9 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; integer indtau, indrwk, indwrk, liwmin; - 
extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *); + extern /* Subroutine */ void ssterf_(integer *, real *, real *, integer *); integer lrwmin; - extern /* Subroutine */ int cunmtr_(char *, char *, char *, integer *, + extern /* Subroutine */ void cunmtr_(char *, char *, char *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); integer llwork; @@ -851,15 +851,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHEEVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -869,7 +869,7 @@ f"> */ i__1 = a_dim1 + 1; a[i__1].r = 1.f, a[i__1].i = 0.f; } - return 0; + return; } /* Get machine constants. */ @@ -942,7 +942,7 @@ f"> */ rwork[1] = (real) lropt; iwork[1] = liopt; - return 0; + return; /* End of CHEEVD */ diff --git a/lapack-netlib/SRC/cheevd.f b/lapack-netlib/SRC/cheevd.f index 9a4a1efb7d..2ddf74b985 100644 --- a/lapack-netlib/SRC/cheevd.f +++ b/lapack-netlib/SRC/cheevd.f @@ -284,7 +284,7 @@ SUBROUTINE CHEEVD( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, RWORK, LIWMIN = 1 END IF LOPT = MAX( LWMIN, N + - $ ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) ) + $ N*ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) ) LROPT = LRWMIN LIOPT = LIWMIN END IF diff --git a/lapack-netlib/SRC/cheevd_2stage.c b/lapack-netlib/SRC/cheevd_2stage.c index 8bc56d0926..8ded0446c5 100644 --- a/lapack-netlib/SRC/cheevd_2stage.c +++ b/lapack-netlib/SRC/cheevd_2stage.c @@ -772,7 +772,7 @@ static real c_b28 = 1.f; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int cheevd_2stage_(char *jobz, char *uplo, integer *n, +/* Subroutine */ void cheevd_2stage_(char *jobz, char *uplo, integer *n, complex *a, integer *lda, real *w, complex *work, integer *lwork, real *rwork, integer *lrwork, integer *iwork, integer *liwork, integer *info) 
@@ -788,13 +788,13 @@ static real c_b28 = 1.f; real anrm; integer imax; real rmin, rmax; - extern /* Subroutine */ int chetrd_2stage_(char *, char *, integer *, + extern /* Subroutine */ void chetrd_2stage_(char *, char *, integer *, complex *, integer *, real *, real *, complex *, complex *, integer *, complex *, integer *, integer *); real sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer lhtrd, lwmin; logical lower; integer llrwk, lwtrd; @@ -803,20 +803,20 @@ static real c_b28 = 1.f; extern real clanhe_(char *, char *, integer *, complex *, integer *, real *); integer iscale; - extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *), cstedc_(char *, integer *, real *, real *, complex *, integer *, complex *, integer *, real *, integer *, integer *, integer *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *); real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; integer indtau, indrwk, indwrk, liwmin; - extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *); + extern /* Subroutine */ void ssterf_(integer *, real *, real *, integer *); integer lrwmin; - extern /* Subroutine */ int cunmtr_(char *, char *, char *, integer *, + extern /* Subroutine */ void cunmtr_(char *, char *, char *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); integer llwork; @@ -905,15 +905,15 @@ static real c_b28 = 1.f; if (*info != 0) { i__1 = -(*info); 
xerbla_("CHEEVD_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -923,7 +923,7 @@ static real c_b28 = 1.f; i__1 = a_dim1 + 1; a[i__1].r = 1.f, a[i__1].i = 0.f; } - return 0; + return; } /* Get machine constants. */ @@ -999,7 +999,7 @@ static real c_b28 = 1.f; rwork[1] = (real) lrwmin; iwork[1] = liwmin; - return 0; + return; /* End of CHEEVD_2STAGE */ diff --git a/lapack-netlib/SRC/cheevr.c b/lapack-netlib/SRC/cheevr.c index bf99e339b9..0799305adf 100644 --- a/lapack-netlib/SRC/cheevr.c +++ b/lapack-netlib/SRC/cheevr.c @@ -874,7 +874,7 @@ f"> */ /* > California at Berkeley, USA \n */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cheevr_(char *jobz, char *range, char *uplo, integer *n, +/* Subroutine */ void cheevr_(char *jobz, char *range, char *uplo, integer *n, complex *a, integer *lda, real *vl, real *vu, integer *il, integer * iu, real *abstol, integer *m, real *w, complex *z__, integer *ldz, integer *isuppz, complex *work, integer *lwork, real *rwork, integer * @@ -893,14 +893,14 @@ f"> */ real sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); char order[1]; integer indwk; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer lwmin; logical lower; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); logical wantz; integer nb, jj; @@ -908,7 +908,7 @@ f"> */ integer iscale, ieeeok, indibl, indrdd, indifl, indree; logical valeig; extern real slamch_(char *); - extern /* Subroutine */ int 
chetrd_(char *, integer *, complex *, integer + extern /* Subroutine */ void chetrd_(char *, integer *, complex *, integer *, real *, real *, complex *, complex *, integer *, integer *), csscal_(integer *, real *, complex *, integer *); real safmin; extern integer ilaenv_(integer *, char *, char *, integer *, integer *, @@ -916,22 +916,22 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real abstll, bignum; integer indtau, indisp; - extern /* Subroutine */ int cstein_(integer *, real *, real *, integer *, + extern /* Subroutine */ void cstein_(integer *, real *, real *, integer *, real *, integer *, integer *, complex *, integer *, real *, integer *, integer *, integer *); integer indiwo, indwkn; extern real clansy_(char *, char *, integer *, complex *, integer *, real *); - extern /* Subroutine */ int cstemr_(char *, char *, integer *, real *, + extern /* Subroutine */ void cstemr_(char *, char *, integer *, real *, real *, real *, real *, integer *, integer *, integer *, real *, complex *, integer *, integer *, integer *, logical *, real *, integer *, integer *, integer *, integer *); integer indrwk, liwmin; logical tryrac; - extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *); + extern /* Subroutine */ void ssterf_(integer *, real *, real *, integer *); integer lrwmin, llwrkn, llwork, nsplit; real smlnum; - extern /* Subroutine */ int cunmtr_(char *, char *, char *, integer *, + extern /* Subroutine */ void cunmtr_(char *, char *, char *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *), sstebz_( char *, char *, integer *, real *, real *, integer *, integer *, @@ -1046,9 +1046,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHEEVR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -1056,7 +1056,7 @@ f"> */ *m = 0; if (*n == 0) { work[1].r = 1.f, work[1].i = 0.f; - return 0; + 
return; } if (*n == 1) { @@ -1080,7 +1080,7 @@ f"> */ isuppz[1] = 1; isuppz[2] = 1; } - return 0; + return; } /* Get machine constants. */ @@ -1300,7 +1300,7 @@ f"> */ rwork[1] = (real) lrwmin; iwork[1] = liwmin; - return 0; + return; /* End of CHEEVR */ diff --git a/lapack-netlib/SRC/cheevr_2stage.c b/lapack-netlib/SRC/cheevr_2stage.c index 9ea9f950e9..73bbe617d8 100644 --- a/lapack-netlib/SRC/cheevr_2stage.c +++ b/lapack-netlib/SRC/cheevr_2stage.c @@ -922,7 +922,7 @@ static integer c_n1 = -1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int cheevr_2stage_(char *jobz, char *range, char *uplo, +/* Subroutine */ void cheevr_2stage_(char *jobz, char *range, char *uplo, integer *n, complex *a, integer *lda, real *vl, real *vu, integer *il, integer *iu, real *abstol, integer *m, real *w, complex *z__, integer *ldz, integer *isuppz, complex *work, integer *lwork, real * @@ -941,22 +941,22 @@ static integer c_n1 = -1; real rmin, rmax; logical test; integer itmp1, i__, j; - extern /* Subroutine */ int chetrd_2stage_(char *, char *, integer *, + extern /* Subroutine */ void chetrd_2stage_(char *, char *, integer *, complex *, integer *, real *, real *, complex *, complex *, integer *, complex *, integer *, integer *); integer indrd, indre; real sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); char order[1]; integer indwk, lhtrd; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer lwmin; logical lower; integer lwtrd; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); logical wantz; integer ib, kd, jj; @@ -964,7 
+964,7 @@ static integer c_n1 = -1; integer iscale, ieeeok, indibl, indrdd, indifl, indree; logical valeig; extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *); real safmin; extern integer ilaenv_(integer *, char *, char *, integer *, integer *, @@ -972,22 +972,22 @@ static integer c_n1 = -1; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real abstll, bignum; integer indtau, indisp; - extern /* Subroutine */ int cstein_(integer *, real *, real *, integer *, + extern /* Subroutine */ void cstein_(integer *, real *, real *, integer *, real *, integer *, integer *, complex *, integer *, real *, integer *, integer *, integer *); integer indiwo, indwkn; extern real clansy_(char *, char *, integer *, complex *, integer *, real *); - extern /* Subroutine */ int cstemr_(char *, char *, integer *, real *, + extern /* Subroutine */ void cstemr_(char *, char *, integer *, real *, real *, real *, real *, integer *, integer *, integer *, real *, complex *, integer *, integer *, integer *, logical *, real *, integer *, integer *, integer *, integer *); integer indrwk, liwmin; logical tryrac; - extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *); + extern /* Subroutine */ void ssterf_(integer *, real *, real *, integer *); integer lrwmin, llwrkn, llwork, nsplit; real smlnum; - extern /* Subroutine */ int sstebz_(char *, char *, integer *, real *, + extern /* Subroutine */ void sstebz_(char *, char *, integer *, real *, real *, integer *, integer *, real *, real *, real *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *), cunmtr_(char *, char *, char *, @@ -1095,9 +1095,9 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("CHEEVR_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ 
-1105,7 +1105,7 @@ static integer c_n1 = -1; *m = 0; if (*n == 0) { work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } if (*n == 1) { @@ -1129,7 +1129,7 @@ static integer c_n1 = -1; isuppz[1] = 1; isuppz[2] = 1; } - return 0; + return; } /* Get machine constants. */ @@ -1351,7 +1351,7 @@ static integer c_n1 = -1; rwork[1] = (real) lrwmin; iwork[1] = liwmin; - return 0; + return; /* End of CHEEVR_2STAGE */ diff --git a/lapack-netlib/SRC/cheevx.c b/lapack-netlib/SRC/cheevx.c index 2474c6cd77..c68954121f 100644 --- a/lapack-netlib/SRC/cheevx.c +++ b/lapack-netlib/SRC/cheevx.c @@ -772,7 +772,7 @@ f"> */ /* > \ingroup complexHEeigen */ /* ===================================================================== */ -/* Subroutine */ int cheevx_(char *jobz, char *range, char *uplo, integer *n, +/* Subroutine */ void cheevx_(char *jobz, char *range, char *uplo, integer *n, complex *a, integer *lda, real *vl, real *vu, integer *il, integer * iu, real *abstol, integer *m, real *w, complex *z__, integer *ldz, complex *work, integer *lwork, real *rwork, integer *iwork, integer * @@ -792,12 +792,12 @@ f"> */ real sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); char order[1]; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); logical lower; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); logical wantz; integer nb, jj; @@ -807,7 +807,7 @@ f"> */ integer iscale, indibl; logical valeig; extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *), chetrd_(char *, integer 
*, complex *, integer *, real *, real *, complex *, complex *, integer *, integer *), clacpy_( char *, integer *, integer *, complex *, integer *, complex *, @@ -818,11 +818,11 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real abstll, bignum; integer indiwk, indisp, indtau; - extern /* Subroutine */ int cstein_(integer *, real *, real *, integer *, + extern /* Subroutine */ void cstein_(integer *, real *, real *, integer *, real *, integer *, integer *, complex *, integer *, real *, integer *, integer *, integer *); integer indrwk, indwrk, lwkmin; - extern /* Subroutine */ int csteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void csteqr_(char *, integer *, real *, real *, complex *, integer *, real *, integer *), cungtr_(char *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), ssterf_(integer *, real *, real *, integer *), @@ -831,7 +831,7 @@ f"> */ integer *); integer nsplit, llwork; real smlnum; - extern /* Subroutine */ int sstebz_(char *, char *, integer *, real *, + extern /* Subroutine */ void sstebz_(char *, char *, integer *, real *, real *, integer *, integer *, real *, real *, real *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); @@ -928,16 +928,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHEEVX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -958,7 +958,7 @@ f"> */ i__1 = z_dim1 + 1; z__[i__1].r = 1.f, z__[i__1].i = 0.f; } - return 0; + return; } /* Get machine constants. 
*/ @@ -1141,7 +1141,7 @@ f"> */ work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CHEEVX */ diff --git a/lapack-netlib/SRC/cheevx_2stage.c b/lapack-netlib/SRC/cheevx_2stage.c index beec82da78..3bd68fd658 100644 --- a/lapack-netlib/SRC/cheevx_2stage.c +++ b/lapack-netlib/SRC/cheevx_2stage.c @@ -822,7 +822,7 @@ static integer c__4 = 4; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int cheevx_2stage_(char *jobz, char *range, char *uplo, +/* Subroutine */ void cheevx_2stage_(char *jobz, char *range, char *uplo, integer *n, complex *a, integer *lda, real *vl, real *vu, integer *il, integer *iu, real *abstol, integer *m, real *w, complex *z__, integer *ldz, complex *work, integer *lwork, real *rwork, integer * @@ -841,22 +841,22 @@ static integer c__4 = 4; real rmin, rmax; logical test; integer itmp1, i__, j; - extern /* Subroutine */ int chetrd_2stage_(char *, char *, integer *, + extern /* Subroutine */ void chetrd_2stage_(char *, char *, integer *, complex *, integer *, real *, real *, complex *, complex *, integer *, complex *, integer *, integer *); integer indee; real sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); char order[1]; integer lhtrd; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer lwmin; logical lower; integer lwtrd; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); logical wantz; integer ib, kd, jj; @@ -866,18 +866,18 @@ static integer c__4 = 4; integer iscale, indibl; logical valeig; extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, 
real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *), clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *); real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real abstll, bignum; integer indiwk, indisp, indtau; - extern /* Subroutine */ int cstein_(integer *, real *, real *, integer *, + extern /* Subroutine */ void cstein_(integer *, real *, real *, integer *, real *, integer *, integer *, complex *, integer *, real *, integer *, integer *, integer *); integer indrwk, indwrk; - extern /* Subroutine */ int csteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void csteqr_(char *, integer *, real *, real *, complex *, integer *, real *, integer *), cungtr_(char *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), ssterf_(integer *, real *, real *, integer *), @@ -886,7 +886,7 @@ static integer c__4 = 4; integer *); integer nsplit, llwork; real smlnum; - extern /* Subroutine */ int sstebz_(char *, char *, integer *, real *, + extern /* Subroutine */ void sstebz_(char *, char *, integer *, real *, real *, integer *, integer *, real *, real *, real *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); @@ -984,16 +984,16 @@ static integer c__4 = 4; if (*info != 0) { i__1 = -(*info); xerbla_("CHEEVX_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -1014,7 +1014,7 @@ static integer c__4 = 4; i__1 = z_dim1 + 1; z__[i__1].r = 1.f, z__[i__1].i = 0.f; } - return 0; + return; } /* Get machine constants. 
*/ @@ -1200,7 +1200,7 @@ static integer c__4 = 4; work[1].r = (real) lwmin, work[1].i = 0.f; - return 0; + return; /* End of CHEEVX_2STAGE */ diff --git a/lapack-netlib/SRC/chegs2.c b/lapack-netlib/SRC/chegs2.c index a74f77a13e..14762a5cfd 100644 --- a/lapack-netlib/SRC/chegs2.c +++ b/lapack-netlib/SRC/chegs2.c @@ -643,7 +643,7 @@ f"> */ /* > \ingroup complexHEcomputational */ /* ===================================================================== */ -/* Subroutine */ int chegs2_(integer *itype, char *uplo, integer *n, complex * +/* Subroutine */ void chegs2_(integer *itype, char *uplo, integer *n, complex * a, integer *lda, complex *b, integer *ldb, integer *info) { /* System generated locals */ @@ -652,20 +652,20 @@ f"> */ complex q__1; /* Local variables */ - extern /* Subroutine */ int cher2_(char *, integer *, complex *, complex * + extern /* Subroutine */ void cher2_(char *, integer *, complex *, complex * , integer *, complex *, integer *, complex *, integer *); integer k; extern logical lsame_(char *, char *); - extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, + extern /* Subroutine */ void caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); logical upper; - extern /* Subroutine */ int ctrmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctrmv_(char *, char *, char *, integer *, complex *, integer *, complex *, integer *), ctrsv_(char *, char *, char *, integer *, complex *, integer *, complex *, integer *); complex ct; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *), - csscal_(integer *, real *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *), + csscal_(integer *, real *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real akk, bkk; @@ -705,7 +705,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHEGS2", &i__1, (ftnlen)6); - return 0; + return; } if (*itype 
== 1) { @@ -883,7 +883,7 @@ f"> */ } } } - return 0; + return; /* End of CHEGS2 */ diff --git a/lapack-netlib/SRC/chegst.c b/lapack-netlib/SRC/chegst.c index 7285356a9e..5140e846d2 100644 --- a/lapack-netlib/SRC/chegst.c +++ b/lapack-netlib/SRC/chegst.c @@ -645,7 +645,7 @@ f"> */ /* > \ingroup complexHEcomputational */ /* ===================================================================== */ -/* Subroutine */ int chegst_(integer *itype, char *uplo, integer *n, complex * +/* Subroutine */ void chegst_(integer *itype, char *uplo, integer *n, complex * a, integer *lda, complex *b, integer *ldb, integer *info) { /* System generated locals */ @@ -654,17 +654,17 @@ f"> */ /* Local variables */ integer k; - extern /* Subroutine */ int chemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void chemm_(char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrmm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *), ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); logical upper; - extern /* Subroutine */ int chegs2_(integer *, char *, integer *, complex + extern /* Subroutine */ void chegs2_(integer *, char *, integer *, complex *, integer *, complex *, integer *, integer *), cher2k_( char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, real *, complex *, integer *); @@ -710,13 +710,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHEGST", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine the block size for this environment. 
*/ @@ -900,7 +900,7 @@ f"> */ } } } - return 0; + return; /* End of CHEGST */ diff --git a/lapack-netlib/SRC/chegv.c b/lapack-netlib/SRC/chegv.c index eee1f048b1..599f47ffb9 100644 --- a/lapack-netlib/SRC/chegv.c +++ b/lapack-netlib/SRC/chegv.c @@ -695,7 +695,7 @@ static integer c_n1 = -1; /* > \ingroup complexHEeigen */ /* ===================================================================== */ -/* Subroutine */ int chegv_(integer *itype, char *jobz, char *uplo, integer * +/* Subroutine */ void chegv_(integer *itype, char *jobz, char *uplo, integer * n, complex *a, integer *lda, complex *b, integer *ldb, real *w, complex *work, integer *lwork, real *rwork, integer *info) { @@ -704,23 +704,24 @@ static integer c_n1 = -1; /* Local variables */ integer neig; - extern /* Subroutine */ int cheev_(char *, char *, integer *, complex *, + extern /* Subroutine */ void cheev_(char *, char *, integer *, complex *, integer *, real *, complex *, integer *, real *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrmm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); char trans[1]; - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); logical upper, wantz; integer nb; - extern /* Subroutine */ int chegst_(integer *, char *, integer *, complex + extern /* Subroutine */ void chegst_(integer *, char *, integer *, complex *, integer *, complex *, integer *, integer *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), cpotrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int cpotrf_( char *, integer *, 
complex *, integer *, integer *); integer lwkopt; logical lquery; @@ -786,15 +787,15 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("CHEGV ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. */ @@ -802,7 +803,7 @@ static integer c_n1 = -1; cpotrf_(uplo, n, &b[b_offset], ldb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. */ @@ -851,7 +852,7 @@ static integer c_n1 = -1; work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CHEGV */ diff --git a/lapack-netlib/SRC/chegv_2stage.c b/lapack-netlib/SRC/chegv_2stage.c index 7106af1f7c..340820c9bb 100644 --- a/lapack-netlib/SRC/chegv_2stage.c +++ b/lapack-netlib/SRC/chegv_2stage.c @@ -749,7 +749,7 @@ stage.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int chegv_2stage_(integer *itype, char *jobz, char *uplo, +/* Subroutine */ void chegv_2stage_(integer *itype, char *jobz, char *uplo, integer *n, complex *a, integer *lda, complex *b, integer *ldb, real * w, complex *work, integer *lwork, real *rwork, integer *info) { @@ -762,24 +762,26 @@ stage.f"> */ integer *, integer *, integer *); extern logical lsame_(char *, char *); integer lhtrd; - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrmm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); integer lwmin; char trans[1]; - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); logical upper; integer lwtrd; logical wantz; integer ib, kd; - extern /* Subroutine */ 
int xerbla_(char *, integer *, ftnlen), chegst_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void chegst_( integer *, char *, integer *, complex *, integer *, complex *, - integer *, integer *), cpotrf_(char *, integer *, complex + integer *, integer *); + extern int cpotrf_(char *, integer *, complex *, integer *, integer *); logical lquery; - extern /* Subroutine */ int cheev_2stage_(char *, char *, integer *, + extern /* Subroutine */ void cheev_2stage_(char *, char *, integer *, complex *, integer *, real *, complex *, integer *, real *, integer *); @@ -846,15 +848,15 @@ stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHEGV_2STAGE ", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. */ @@ -862,7 +864,7 @@ stage.f"> */ cpotrf_(uplo, n, &b[b_offset], ldb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. */ @@ -911,7 +913,7 @@ stage.f"> */ work[1].r = (real) lwmin, work[1].i = 0.f; - return 0; + return; /* End of CHEGV_2STAGE */ diff --git a/lapack-netlib/SRC/chegvd.c b/lapack-netlib/SRC/chegvd.c index 855dc1e262..1259b737fa 100644 --- a/lapack-netlib/SRC/chegvd.c +++ b/lapack-netlib/SRC/chegvd.c @@ -761,7 +761,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. 
of Kentucky, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int chegvd_(integer *itype, char *jobz, char *uplo, integer * +/* Subroutine */ void chegvd_(integer *itype, char *jobz, char *uplo, integer * n, complex *a, integer *lda, complex *b, integer *ldb, real *w, complex *work, integer *lwork, real *rwork, integer *lrwork, integer * iwork, integer *liwork, integer *info) @@ -773,23 +773,25 @@ f"> */ /* Local variables */ integer lopt; extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrmm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); integer lwmin; char trans[1]; integer liopt; - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); logical upper; integer lropt; logical wantz; - extern /* Subroutine */ int cheevd_(char *, char *, integer *, complex *, + extern /* Subroutine */ void cheevd_(char *, char *, integer *, complex *, integer *, real *, complex *, integer *, real *, integer *, integer *, integer *, integer *), chegst_(integer *, char *, integer *, complex *, integer *, complex *, integer *, - integer *), xerbla_(char *, integer *, ftnlen), cpotrf_( + integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern int cpotrf_( char *, integer *, complex *, integer *, integer *); integer liwmin, lrwmin; logical lquery; @@ -871,15 +873,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHEGVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. 
*/ @@ -887,7 +889,7 @@ f"> */ cpotrf_(uplo, n, &b[b_offset], ldb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. */ @@ -943,7 +945,7 @@ f"> */ rwork[1] = (real) lropt; iwork[1] = liopt; - return 0; + return; /* End of CHEGVD */ diff --git a/lapack-netlib/SRC/chegvd.f b/lapack-netlib/SRC/chegvd.f index 0c708190ce..4b7f43d52a 100644 --- a/lapack-netlib/SRC/chegvd.f +++ b/lapack-netlib/SRC/chegvd.f @@ -360,9 +360,9 @@ SUBROUTINE CHEGVD( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W, WORK, CALL CHEGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) CALL CHEEVD( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, RWORK, LRWORK, $ IWORK, LIWORK, INFO ) - LOPT = MAX( REAL( LOPT ), REAL( WORK( 1 ) ) ) - LROPT = MAX( REAL( LROPT ), REAL( RWORK( 1 ) ) ) - LIOPT = MAX( REAL( LIOPT ), REAL( IWORK( 1 ) ) ) + LOPT = INT( MAX( REAL( LOPT ), REAL( WORK( 1 ) ) ) ) + LROPT = INT( MAX( REAL( LROPT ), REAL( RWORK( 1 ) ) ) ) + LIOPT = INT( MAX( REAL( LIOPT ), REAL( IWORK( 1 ) ) ) ) * IF( WANTZ .AND. INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/chegvx.c b/lapack-netlib/SRC/chegvx.c index 24df5668b8..198c480d5e 100644 --- a/lapack-netlib/SRC/chegvx.c +++ b/lapack-netlib/SRC/chegvx.c @@ -820,7 +820,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. 
of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int chegvx_(integer *itype, char *jobz, char *range, char * +/* Subroutine */ void chegvx_(integer *itype, char *jobz, char *range, char * uplo, integer *n, complex *a, integer *lda, complex *b, integer *ldb, real *vl, real *vu, integer *il, integer *iu, real *abstol, integer * m, real *w, complex *z__, integer *ldz, complex *work, integer *lwork, @@ -831,25 +831,27 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrmm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); char trans[1]; - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); logical upper, wantz; integer nb; logical alleig, indeig, valeig; - extern /* Subroutine */ int chegst_(integer *, char *, integer *, complex + extern /* Subroutine */ void chegst_(integer *, char *, integer *, complex *, integer *, complex *, integer *, integer *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), cheevx_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void cheevx_( char *, char *, char *, integer *, complex *, integer *, real *, real *, integer *, integer *, real *, integer *, real *, complex * , integer *, complex *, integer *, real *, integer *, integer *, - integer *), cpotrf_(char *, integer *, + integer *); + extern int cpotrf_(char *, integer *, complex *, integer *, integer *); integer lwkopt; logical lquery; @@ -942,16 +944,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHEGVX", &i__1, 
(ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. */ @@ -959,7 +961,7 @@ f"> */ cpotrf_(uplo, n, &b[b_offset], ldb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. */ @@ -1010,7 +1012,7 @@ f"> */ work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CHEGVX */ diff --git a/lapack-netlib/SRC/cherfs.c b/lapack-netlib/SRC/cherfs.c index 9cb5b2f488..f10c671f89 100644 --- a/lapack-netlib/SRC/cherfs.c +++ b/lapack-netlib/SRC/cherfs.c @@ -705,7 +705,7 @@ f"> */ /* > \ingroup complexHEcomputational */ /* ===================================================================== */ -/* Subroutine */ int cherfs_(char *uplo, integer *n, integer *nrhs, complex * +/* Subroutine */ void cherfs_(char *uplo, integer *n, integer *nrhs, complex * a, integer *lda, complex *af, integer *ldaf, integer *ipiv, complex * b, integer *ldb, complex *x, integer *ldx, real *ferr, real *berr, complex *work, real *rwork, integer *info) @@ -722,22 +722,23 @@ f"> */ integer i__, j, k; real s; extern logical lsame_(char *, char *); - extern /* Subroutine */ int chemv_(char *, integer *, complex *, complex * + extern /* Subroutine */ void chemv_(char *, integer *, complex *, complex * , integer *, complex *, integer *, complex *, complex *, integer * ); integer isave[3]; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); integer count; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); real xk; extern real slamch_(char *); integer nz; 
real safmin; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), chetrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void chetrs_( char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); real lstres, eps; @@ -794,7 +795,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHERFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -806,7 +807,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -1040,7 +1041,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of CHERFS */ diff --git a/lapack-netlib/SRC/chesv.c b/lapack-netlib/SRC/chesv.c index 5d6dfee6ab..8c438db95b 100644 --- a/lapack-netlib/SRC/chesv.c +++ b/lapack-netlib/SRC/chesv.c @@ -684,7 +684,7 @@ static integer c_n1 = -1; /* > \ingroup complexHEsolve */ /* ===================================================================== */ -/* Subroutine */ int chesv_(char *uplo, integer *n, integer *nrhs, complex *a, +/* Subroutine */ void chesv_(char *uplo, integer *n, integer *nrhs, complex *a, integer *lda, integer *ipiv, complex *b, integer *ldb, complex *work, integer *lwork, integer *info) { @@ -694,16 +694,16 @@ static integer c_n1 = -1; /* Local variables */ extern logical lsame_(char *, char *); integer nb; - extern /* Subroutine */ int chetrf_(char *, integer *, complex *, integer - *, integer *, complex *, integer *, integer *), xerbla_( - char *, integer *, ftnlen); + extern /* Subroutine */ void chetrf_(char *, integer *, complex *, integer + *, integer *, complex *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int chetrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void chetrs_(char *, integer *, integer *, complex *, 
integer *, integer *, complex *, integer *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int chetrs2_(char *, integer *, integer *, + extern /* Subroutine */ void chetrs2_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, complex *, integer *); @@ -760,9 +760,9 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("CHESV ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = U*D*U**H or A = L*D*L**H. */ @@ -792,7 +792,7 @@ static integer c_n1 = -1; work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CHESV */ diff --git a/lapack-netlib/SRC/chesv_aa.c b/lapack-netlib/SRC/chesv_aa.c index 76e3b19013..9401ecb619 100644 --- a/lapack-netlib/SRC/chesv_aa.c +++ b/lapack-netlib/SRC/chesv_aa.c @@ -674,7 +674,7 @@ a.f"> */ /* > \ingroup complexHEsolve */ /* ===================================================================== */ -/* Subroutine */ int chesv_aa_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void chesv_aa_(char *uplo, integer *n, integer *nrhs, complex *a, integer *lda, integer *ipiv, complex *b, integer *ldb, complex *work, integer *lwork, integer *info) { @@ -683,12 +683,13 @@ a.f"> */ /* Local variables */ integer lwkopt_hetrf__, lwkopt_hetrs__; - extern /* Subroutine */ int chetrf_aa_(char *, integer *, complex *, + extern /* Subroutine */ void chetrf_aa_(char *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int chetrs_aa_(char *, integer *, integer *, + extern /* Subroutine */ void chetrs_aa_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, complex *, - integer *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer lwkopt; logical lquery; @@ -749,9 +750,9 @@ a.f"> */ if 
(*info != 0) { i__1 = -(*info); xerbla_("CHESV_AA ", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = U**H*T*U or A = L*T*L**H. */ @@ -768,7 +769,7 @@ a.f"> */ work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CHESV_AA */ diff --git a/lapack-netlib/SRC/chesv_aa_2stage.c b/lapack-netlib/SRC/chesv_aa_2stage.c index f2849bb28d..31c4127be3 100644 --- a/lapack-netlib/SRC/chesv_aa_2stage.c +++ b/lapack-netlib/SRC/chesv_aa_2stage.c @@ -698,7 +698,7 @@ a_2stage.f"> */ /* > \ingroup complexSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int chesv_aa_2stage_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void chesv_aa_2stage_(char *uplo, integer *n, integer *nrhs, complex *a, integer *lda, complex *tb, integer *ltb, integer *ipiv, integer *ipiv2, complex *b, integer *ldb, complex *work, integer * lwork, integer *info) @@ -707,7 +707,7 @@ a_2stage.f"> */ integer a_dim1, a_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int chetrs_aa_2stage_(char *, integer *, integer + extern /* Subroutine */ void chetrs_aa_2stage_(char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *, integer *, complex *, integer *, integer *); extern logical lsame_(char *, char *); @@ -715,7 +715,7 @@ a_2stage.f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer lwkopt; logical tquery, wquery; - extern /* Subroutine */ int chetrf_aa_2stage_(char *, integer *, complex + extern /* Subroutine */ void chetrf_aa_2stage_(char *, integer *, complex *, integer *, complex *, integer *, integer *, integer *, complex *, integer *, integer *); @@ -774,9 +774,9 @@ a_2stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHESV_AA_2STAGE", &i__1, (ftnlen)15); - return 0; + return; } else if (wquery || tquery) { - return 0; + return; } @@ -795,7 +795,7 @@ 
a_2stage.f"> */ work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CHESV_AA_2STAGE */ diff --git a/lapack-netlib/SRC/chesv_rk.c b/lapack-netlib/SRC/chesv_rk.c index b3219dfb29..f1acea5167 100644 --- a/lapack-netlib/SRC/chesv_rk.c +++ b/lapack-netlib/SRC/chesv_rk.c @@ -740,7 +740,7 @@ k.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int chesv_rk_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void chesv_rk_(char *uplo, integer *n, integer *nrhs, complex *a, integer *lda, complex *e, integer *ipiv, complex *b, integer *ldb, complex *work, integer *lwork, integer *info) { @@ -749,11 +749,12 @@ k.f"> */ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int chetrf_rk_(char *, integer *, complex *, - integer *, complex *, integer *, complex *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void chetrf_rk_(char *, integer *, complex *, + integer *, complex *, integer *, complex *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer lwkopt; logical lquery; - extern /* Subroutine */ int chetrs_3_(char *, integer *, integer *, + extern /* Subroutine */ void chetrs_3_(char *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); @@ -811,9 +812,9 @@ k.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHESV_RK ", &i__1,(ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = U*D*U**T or A = L*D*L**T. 
*/ @@ -832,7 +833,7 @@ k.f"> */ work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CHESV_RK */ diff --git a/lapack-netlib/SRC/chesv_rk.f b/lapack-netlib/SRC/chesv_rk.f index a659c8e795..e123fa2990 100644 --- a/lapack-netlib/SRC/chesv_rk.f +++ b/lapack-netlib/SRC/chesv_rk.f @@ -280,7 +280,7 @@ SUBROUTINE CHESV_RK( UPLO, N, NRHS, A, LDA, E, IPIV, B, LDB, WORK, LWKOPT = 1 ELSE CALL CHETRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = REAL( WORK(1) ) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/chesv_rook.c b/lapack-netlib/SRC/chesv_rook.c index 5819d265fe..b3311798c8 100644 --- a/lapack-netlib/SRC/chesv_rook.c +++ b/lapack-netlib/SRC/chesv_rook.c @@ -719,7 +719,7 @@ ook.f"> */ /* ===================================================================== */ -/* Subroutine */ int chesv_rook_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void chesv_rook_(char *uplo, integer *n, integer *nrhs, complex *a, integer *lda, integer *ipiv, complex *b, integer *ldb, complex *work, integer *lwork, integer *info) { @@ -734,7 +734,7 @@ ook.f"> */ integer *, integer *, ftnlen, ftnlen); integer lwkopt; logical lquery; - extern /* Subroutine */ int chetrf_rook_(char *, integer *, complex *, + extern /* Subroutine */ void chetrf_rook_(char *, integer *, complex *, integer *, integer *, complex *, integer *, integer *), chetrs_rook_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); @@ -792,9 +792,9 @@ ook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHESV_ROOK ", &i__1, (ftnlen)11); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = U*D*U**H or A = L*D*L**H. 
*/ @@ -813,7 +813,7 @@ ook.f"> */ work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CHESV_ROOK */ diff --git a/lapack-netlib/SRC/chesvx.c b/lapack-netlib/SRC/chesvx.c index 0bed0d3e3c..18a299a0d3 100644 --- a/lapack-netlib/SRC/chesvx.c +++ b/lapack-netlib/SRC/chesvx.c @@ -797,7 +797,7 @@ f"> */ /* > \ingroup complexHEsolve */ /* ===================================================================== */ -/* Subroutine */ int chesvx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void chesvx_(char *fact, char *uplo, integer *n, integer * nrhs, complex *a, integer *lda, complex *af, integer *ldaf, integer * ipiv, complex *b, integer *ldb, complex *x, integer *ldx, real *rcond, real *ferr, real *berr, complex *work, integer *lwork, real *rwork, @@ -813,11 +813,11 @@ f"> */ integer nb; extern real clanhe_(char *, char *, integer *, complex *, integer *, real *); - extern /* Subroutine */ int checon_(char *, integer *, complex *, integer + extern /* Subroutine */ void checon_(char *, integer *, complex *, integer *, integer *, real *, real *, complex *, integer *); extern real slamch_(char *); logical nofact; - extern /* Subroutine */ int cherfs_(char *, integer *, integer *, complex + extern /* Subroutine */ void cherfs_(char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *, complex *, integer *, complex *, integer *, real *, real *, complex *, real *, integer *), chetrf_(char *, integer *, complex *, integer @@ -826,7 +826,8 @@ f"> */ integer *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), chetrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void chetrs_( char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); integer lwkopt; @@ -909,9 +910,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); 
xerbla_("CHESVX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } if (nofact) { @@ -926,7 +927,7 @@ f"> */ if (*info > 0) { *rcond = 0.f; - return 0; + return; } } @@ -960,7 +961,7 @@ f"> */ work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CHESVX */ diff --git a/lapack-netlib/SRC/chesvxx.c b/lapack-netlib/SRC/chesvxx.c index 3f321c3d26..ce97591667 100644 --- a/lapack-netlib/SRC/chesvxx.c +++ b/lapack-netlib/SRC/chesvxx.c @@ -1015,7 +1015,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexHEsolve */ /* ===================================================================== */ -/* Subroutine */ int chesvxx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void chesvxx_(char *fact, char *uplo, integer *n, integer * nrhs, complex *a, integer *lda, complex *af, integer *ldaf, integer * ipiv, char *equed, real *s, complex *b, integer *ldb, complex *x, integer *ldx, real *rcond, real *rpvgrw, real *berr, integer * @@ -1036,20 +1036,21 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern logical lsame_(char *, char *); real scond; logical equil, rcequ; - extern /* Subroutine */ int claqhe_(char *, integer *, complex *, integer + extern /* Subroutine */ void claqhe_(char *, integer *, complex *, integer *, real *, real *, real *, char *); extern real slamch_(char *); logical nofact; - extern /* Subroutine */ int chetrf_(char *, integer *, complex *, integer + extern /* Subroutine */ void chetrf_(char *, integer *, complex *, integer *, integer *, complex *, integer *, integer *), clacpy_( char *, integer *, integer *, complex *, integer *, complex *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); real bignum; integer infequ; - extern /* Subroutine */ int chetrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void 
chetrs_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); real smlnum; - extern /* Subroutine */ int clascl2_(integer *, integer *, real *, + extern /* Subroutine */ void clascl2_(integer *, integer *, real *, complex *, integer *), cheequb_(char *, integer *, complex *, integer *, real *, real *, real *, complex *, integer *), cherfsx_(char *, char *, integer *, integer *, complex *, integer @@ -1164,7 +1165,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CHESVXX", &i__1, (ftnlen)7); - return 0; + return; } if (equil) { @@ -1209,7 +1210,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rpvgrw = cla_herpvgrw_(uplo, n, info, &a[a_offset], lda, & af[af_offset], ldaf, &ipiv[1], &rwork[1]); } - return 0; + return; } } @@ -1241,7 +1242,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ clascl2_(n, nrhs, &s[1], &x[x_offset], ldx); } - return 0; + return; /* End of CHESVXX */ diff --git a/lapack-netlib/SRC/cheswapr.c b/lapack-netlib/SRC/cheswapr.c index 2c53be0e3d..9c475e78dc 100644 --- a/lapack-netlib/SRC/cheswapr.c +++ b/lapack-netlib/SRC/cheswapr.c @@ -616,7 +616,7 @@ r.f"> */ /* > \ingroup complexHEauxiliary */ /* ===================================================================== */ -/* Subroutine */ int cheswapr_(char *uplo, integer *n, complex *a, integer * +/* Subroutine */ void cheswapr_(char *uplo, integer *n, complex *a, integer * lda, integer *i1, integer *i2) { /* System generated locals */ @@ -626,7 +626,7 @@ r.f"> */ /* Local variables */ integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); logical upper; complex tmp; @@ -749,6 +749,6 @@ r.f"> */ } } - return 0; + return; } /* cheswapr_ */ diff 
--git a/lapack-netlib/SRC/chetd2.c b/lapack-netlib/SRC/chetd2.c index cdd845fcf0..b52f5e8d7e 100644 --- a/lapack-netlib/SRC/chetd2.c +++ b/lapack-netlib/SRC/chetd2.c @@ -690,7 +690,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int chetd2_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void chetd2_(char *uplo, integer *n, complex *a, integer *lda, real *d__, real *e, complex *tau, integer *info) { /* System generated locals */ @@ -700,20 +700,21 @@ f"> */ /* Local variables */ complex taui; - extern /* Subroutine */ int cher2_(char *, integer *, complex *, complex * + extern /* Subroutine */ void cher2_(char *, integer *, complex *, complex * , integer *, complex *, integer *, complex *, integer *); integer i__; complex alpha; extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int chemv_(char *, integer *, complex *, complex * + extern /* Subroutine */ void chemv_(char *, integer *, complex *, complex * , integer *, complex *, integer *, complex *, complex *, integer * ), caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); logical upper; - extern /* Subroutine */ int clarfg_(integer *, complex *, complex *, - integer *, complex *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clarfg_(integer *, complex *, complex *, + integer *, complex *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -748,13 +749,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHETD2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } if (upper) { @@ -909,7 +910,7 @@ f"> */ d__[i__1] = a[i__2].r; } - return 0; + return; /* End of CHETD2 */ diff --git a/lapack-netlib/SRC/chetf2.c 
b/lapack-netlib/SRC/chetf2.c index c03d84651b..6c186962c4 100644 --- a/lapack-netlib/SRC/chetf2.c +++ b/lapack-netlib/SRC/chetf2.c @@ -700,7 +700,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int chetf2_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void chetf2_(char *uplo, integer *n, complex *a, integer *lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -709,7 +709,7 @@ f"> */ complex q__1, q__2, q__3, q__4, q__5, q__6; /* Local variables */ - extern /* Subroutine */ int cher_(char *, integer *, real *, complex *, + extern /* Subroutine */ void cher_(char *, integer *, real *, complex *, integer *, complex *, integer *); integer imax, jmax; real d__; @@ -717,7 +717,7 @@ f"> */ complex t; real alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer kstep; logical upper; @@ -731,8 +731,9 @@ f"> */ complex wk; extern integer icamax_(integer *, complex *, integer *); real tt; - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer - *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + extern int xerbla_(char *, integer *, ftnlen); real colmax; extern logical sisnan_(real *); real rowmax; @@ -769,7 +770,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHETF2", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. 
*/ @@ -1352,7 +1353,7 @@ f"> */ } L90: - return 0; + return; /* End of CHETF2 */ diff --git a/lapack-netlib/SRC/chetf2_rk.c b/lapack-netlib/SRC/chetf2_rk.c index 07c75f79e6..2d1754531f 100644 --- a/lapack-netlib/SRC/chetf2_rk.c +++ b/lapack-netlib/SRC/chetf2_rk.c @@ -755,7 +755,7 @@ rk.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int chetf2_rk_(char *uplo, integer *n, complex *a, integer * +/* Subroutine */ void chetf2_rk_(char *uplo, integer *n, complex *a, integer * lda, complex *e, integer *ipiv, integer *info) { /* System generated locals */ @@ -764,7 +764,7 @@ rk.f"> */ complex q__1, q__2, q__3, q__4, q__5, q__6, q__7, q__8; /* Local variables */ - extern /* Subroutine */ int cher_(char *, integer *, real *, complex *, + extern /* Subroutine */ void cher_(char *, integer *, real *, complex *, integer *, complex *, integer *); logical done; integer imax, jmax; @@ -774,7 +774,7 @@ rk.f"> */ real alpha; extern logical lsame_(char *, char *); real sfmin; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer itemp, kstep; real stemp; @@ -790,8 +790,9 @@ rk.f"> */ extern integer icamax_(integer *, complex *, integer *); extern real slamch_(char *); real tt; - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer - *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + extern int xerbla_(char *, integer *, ftnlen); real colmax, rowmax; complex wkm1, wkp1; @@ -828,7 +829,7 @@ rk.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHETF2_RK", &i__1, (ftnlen)9); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. 
*/ @@ -1837,7 +1838,7 @@ rk.f"> */ ; } - return 0; + return; /* End of CHETF2_RK */ diff --git a/lapack-netlib/SRC/chetf2_rook.c b/lapack-netlib/SRC/chetf2_rook.c index 6be8797452..d623cc5ab7 100644 --- a/lapack-netlib/SRC/chetf2_rook.c +++ b/lapack-netlib/SRC/chetf2_rook.c @@ -708,7 +708,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int chetf2_rook_(char *uplo, integer *n, complex *a, +/* Subroutine */ void chetf2_rook_(char *uplo, integer *n, complex *a, integer *lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -717,7 +717,7 @@ rook.f"> */ complex q__1, q__2, q__3, q__4, q__5, q__6, q__7, q__8; /* Local variables */ - extern /* Subroutine */ int cher_(char *, integer *, real *, complex *, + extern /* Subroutine */ void cher_(char *, integer *, real *, complex *, integer *, complex *, integer *); logical done; integer imax, jmax; @@ -727,7 +727,7 @@ rook.f"> */ real alpha; extern logical lsame_(char *, char *); real sfmin; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer itemp, kstep; real stemp; @@ -743,8 +743,9 @@ rook.f"> */ extern integer icamax_(integer *, complex *, integer *); extern real slamch_(char *); real tt; - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer - *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + extern int xerbla_(char *, integer *, ftnlen); real colmax, rowmax; complex wkm1, wkp1; @@ -780,7 +781,7 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHETF2_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. 
*/ @@ -1683,7 +1684,7 @@ rook.f"> */ L70: - return 0; + return; /* End of CHETF2_ROOK */ diff --git a/lapack-netlib/SRC/chetrd.c b/lapack-netlib/SRC/chetrd.c index 1aa8c1162a..67ff572d23 100644 --- a/lapack-netlib/SRC/chetrd.c +++ b/lapack-netlib/SRC/chetrd.c @@ -709,7 +709,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int chetrd_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void chetrd_(char *uplo, integer *n, complex *a, integer *lda, real *d__, real *e, complex *tau, complex *work, integer *lwork, integer *info) { @@ -722,14 +722,14 @@ f"> */ extern logical lsame_(char *, char *); integer nbmin, iinfo; logical upper; - extern /* Subroutine */ int chetd2_(char *, integer *, complex *, integer + extern /* Subroutine */ void chetd2_(char *, integer *, complex *, integer *, real *, real *, complex *, integer *), cher2k_(char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, real *, complex *, integer *); integer nb, kk, nx; - extern /* Subroutine */ int clatrd_(char *, integer *, integer *, complex - *, integer *, real *, complex *, complex *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clatrd_(char *, integer *, integer *, complex + *, integer *, real *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; @@ -784,16 +784,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHETRD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } nx = *n; @@ -930,7 +930,7 @@ f"> */ } work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CHETRD */ diff --git 
a/lapack-netlib/SRC/chetrd_2stage.c b/lapack-netlib/SRC/chetrd_2stage.c index 2bdb761fbb..d61672b83a 100644 --- a/lapack-netlib/SRC/chetrd_2stage.c +++ b/lapack-netlib/SRC/chetrd_2stage.c @@ -740,7 +740,7 @@ static integer c__4 = 4; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int chetrd_2stage_(char *vect, char *uplo, integer *n, +/* Subroutine */ void chetrd_2stage_(char *vect, char *uplo, integer *n, complex *a, integer *lda, real *d__, real *e, complex *tau, complex * hous2, integer *lhous2, complex *work, integer *lwork, integer *info) { @@ -749,7 +749,7 @@ static integer c__4 = 4; /* Local variables */ integer ldab; - extern /* Subroutine */ int chetrd_hb2st_(char *, char *, char *, + extern /* Subroutine */ void chetrd_hb2st_(char *, char *, char *, integer *, integer *, complex *, integer *, real *, real *, complex *, integer *, complex *, integer *, integer *); extern integer ilaenv2stage_(integer *, char *, char *, integer *, @@ -761,7 +761,7 @@ static integer c__4 = 4; integer ib, kd; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical lquery; - extern /* Subroutine */ int chetrd_he2hb_(char *, integer *, integer *, + extern /* Subroutine */ void chetrd_he2hb_(char *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); @@ -824,16 +824,16 @@ static integer c__4 = 4; if (*info != 0) { i__1 = -(*info); xerbla_("CHETRD_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } /* Determine pointer position */ @@ -847,20 +847,20 @@ static integer c__4 = 4; if (*info != 0) { i__1 = -(*info); xerbla_("CHETRD_HE2HB", &i__1, (ftnlen)12); - return 0; + return; } chetrd_hb2st_("Y", vect, uplo, n, &kd, &work[abpos], &ldab, &d__[1], &e[ 1], &hous2[1], lhous2, &work[wpos], 
&lwrk, info); if (*info != 0) { i__1 = -(*info); xerbla_("CHETRD_HB2ST", &i__1, (ftnlen)12); - return 0; + return; } hous2[1].r = (real) lhmin, hous2[1].i = 0.f; work[1].r = (real) lwmin, work[1].i = 0.f; - return 0; + return; /* End of CHETRD_2STAGE */ diff --git a/lapack-netlib/SRC/chetrd_hb2st.c b/lapack-netlib/SRC/chetrd_hb2st.c index 29eb70ac6e..cbcebf3dd6 100644 --- a/lapack-netlib/SRC/chetrd_hb2st.c +++ b/lapack-netlib/SRC/chetrd_hb2st.c @@ -746,7 +746,7 @@ hb2st.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int chetrd_hb2st_(char *stage1, char *vect, char *uplo, +/* Subroutine */ void chetrd_hb2st_(char *stage1, char *vect, char *uplo, integer *n, integer *kd, complex *ab, integer *ldab, real *d__, real * e, complex *hous, integer *lhous, complex *work, integer *lwork, integer *info) @@ -766,17 +766,18 @@ hb2st.f"> */ integer lhmin, sicev, sizea, shift, stind, colpt, lwmin, awpos; logical wantq, upper; integer grsiz, ttype, stepercol, ed, ib; - extern /* Subroutine */ int chb2st_kernels_(char *, logical *, integer *, + extern /* Subroutine */ void chb2st_kernels_(char *, logical *, integer *, integer *, integer *, integer *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *); integer st, abdpos; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, - integer *, integer *, complex *, complex *, complex *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, integer *, complex *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer thgrid, thgrnb, indtau; real abstmp; integer ofdpos, blklastind; - extern /* Subroutine */ int mecago_(); + extern /* Subroutine */ void mecago_(); logical lquery, afters1; integer lda, tid, ldv; complex 
tmp; @@ -847,9 +848,9 @@ hb2st.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHETRD_HB2ST", &i__1, (ftnlen)12); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -857,7 +858,7 @@ hb2st.f"> */ if (*n == 0) { hous[1].r = 1.f, hous[1].i = 0.f; work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } /* Determine pointer position */ @@ -911,7 +912,7 @@ hb2st.f"> */ hous[1].r = 1.f, hous[1].i = 0.f; work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } /* Case KD=1: */ @@ -992,7 +993,7 @@ hb2st.f"> */ hous[1].r = 1.f, hous[1].i = 0.f; work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } /* Main code start here. */ @@ -1114,7 +1115,7 @@ hb2st.f"> */ hous[1].r = (real) lhmin, hous[1].i = 0.f; work[1].r = (real) lwmin, work[1].i = 0.f; - return 0; + return; /* End of CHETRD_HB2ST */ diff --git a/lapack-netlib/SRC/chetrd_he2hb.c b/lapack-netlib/SRC/chetrd_he2hb.c index 072bcea430..8461cc7784 100644 --- a/lapack-netlib/SRC/chetrd_he2hb.c +++ b/lapack-netlib/SRC/chetrd_he2hb.c @@ -760,7 +760,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int chetrd_he2hb_(char *uplo, integer *n, integer *kd, +/* Subroutine */ void chetrd_he2hb_(char *uplo, integer *n, integer *kd, complex *a, integer *lda, complex *ab, integer *ldab, complex *tau, complex *work, integer *lwork, integer *info) { @@ -773,29 +773,30 @@ f"> */ extern integer ilaenv2stage_(integer *, char *, char *, integer *, integer *, integer *, integer *); integer tpos, wpos, s1pos, s2pos, i__, j; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *), chemm_(char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); 
extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); integer lwmin; logical upper; - extern /* Subroutine */ int cher2k_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cher2k_(char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, real *, complex *, integer *); integer lk, pk, pn, lt; - extern /* Subroutine */ int cgelqf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgelqf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); integer lw; - extern /* Subroutine */ int cgeqrf_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeqrf_(integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), clarft_( char *, char *, integer *, integer *, complex *, integer *, complex *, complex *, integer *), claset_(char *, - integer *, integer *, complex *, complex *, complex *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, integer *, complex *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer ls1; logical lquery; integer ls2, ldt, ldw, lds1, lds2; @@ -850,10 +851,10 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHETRD_HE2HB", &i__1, (ftnlen)12); - return 0; + return; } else if (lquery) { work[1].r = (real) lwmin, work[1].i = 0.f; - return 0; + return; } /* Quick return if possible */ @@ -882,7 +883,7 @@ f"> */ } } work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } /* Determine the pointer position for the workspace */ @@ -1078,7 +1079,7 @@ f"> */ } work[1].r = (real) lwmin, work[1].i = 0.f; - return 0; + return; /* End of CHETRD_HE2HB */ diff --git a/lapack-netlib/SRC/chetrf.c b/lapack-netlib/SRC/chetrf.c index c9f2e1c516..0b1a12cf07 100644 --- a/lapack-netlib/SRC/chetrf.c +++ 
b/lapack-netlib/SRC/chetrf.c @@ -692,7 +692,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int chetrf_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void chetrf_(char *uplo, integer *n, complex *a, integer *lda, integer *ipiv, complex *work, integer *lwork, integer *info) { /* System generated locals */ @@ -703,12 +703,13 @@ f"> */ extern logical lsame_(char *, char *); integer nbmin, iinfo; logical upper; - extern /* Subroutine */ int chetf2_(char *, integer *, complex *, integer + extern /* Subroutine */ void chetf2_(char *, integer *, complex *, integer *, integer *, integer *); integer kb, nb; - extern /* Subroutine */ int clahef_(char *, integer *, integer *, integer + extern /* Subroutine */ void clahef_(char *, integer *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer - *), xerbla_(char *, integer *, ftnlen); + *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; @@ -761,9 +762,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHETRF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } nbmin = 2; @@ -890,7 +891,7 @@ f"> */ L40: work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CHETRF */ diff --git a/lapack-netlib/SRC/chetrf_aa.c b/lapack-netlib/SRC/chetrf_aa.c index ff02b316fc..707043942e 100644 --- a/lapack-netlib/SRC/chetrf_aa.c +++ b/lapack-netlib/SRC/chetrf_aa.c @@ -647,7 +647,7 @@ aa.f"> */ /* > \ingroup complexHEcomputational */ /* ===================================================================== */ -/* Subroutine */ int chetrf_aa_(char *uplo, integer *n, complex *a, integer * +/* Subroutine */ void chetrf_aa_(char *uplo, integer *n, complex *a, integer * lda, integer *ipiv, complex *work, integer *lwork, 
integer *info) { /* System generated locals */ @@ -656,17 +656,17 @@ aa.f"> */ complex q__1; /* Local variables */ - extern /* Subroutine */ int clahef_aa_(char *, integer *, integer *, + extern /* Subroutine */ void clahef_aa_(char *, integer *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, complex *); integer j; complex alpha; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), cgemm_(char *, char *, integer *, integer *, integer * , complex *, complex *, integer *, complex *, integer *, complex * , complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), cswap_(integer *, complex *, integer *, complex *, integer *); logical upper; @@ -724,15 +724,15 @@ aa.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHETRF_AA", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return */ if (*n == 0) { - return 0; + return; } ipiv[1] = 1; if (*n == 1) { @@ -740,7 +740,7 @@ aa.f"> */ i__2 = a_dim1 + 1; r__1 = a[i__2].r; a[i__1].r = r__1, a[i__1].i = 0.f; - return 0; + return; } /* Adjust block size based on the workspace size */ @@ -1041,7 +1041,7 @@ aa.f"> */ } L20: - return 0; + return; /* End of CHETRF_AA */ diff --git a/lapack-netlib/SRC/chetrf_aa_2stage.c b/lapack-netlib/SRC/chetrf_aa_2stage.c index e7ab409f9c..47de92e10c 100644 --- a/lapack-netlib/SRC/chetrf_aa_2stage.c +++ b/lapack-netlib/SRC/chetrf_aa_2stage.c @@ -675,7 +675,7 @@ aa_2stage.f"> */ /* > \ingroup complexSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int chetrf_aa_2stage_(char *uplo, integer *n, complex *a, +/* Subroutine */ void chetrf_aa_2stage_(char *uplo, integer *n, complex *a, integer *lda, 
complex *tb, integer *ltb, integer *ipiv, integer * ipiv2, complex *work, integer *lwork, integer *info) { @@ -686,12 +686,12 @@ aa_2stage.f"> */ /* Local variables */ integer ldtb, i__, j, k; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), cswap_(integer *, complex *, integer *, complex *, integer *), ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, @@ -699,18 +699,19 @@ aa_2stage.f"> */ integer i1; logical upper; integer i2, jb, kb, nb, td; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *); integer nt; - extern /* Subroutine */ int cgbtrf_(integer *, integer *, integer *, - integer *, complex *, integer *, integer *, integer *), cgetrf_( - integer *, integer *, complex *, integer *, integer *, integer *), - clacpy_(char *, integer *, integer *, complex *, integer *, + extern /* Subroutine */ void cgbtrf_(integer *, integer *, integer *, + integer *, complex *, integer *, integer *, integer *); + extern int cgetrf_( + integer *, integer *, complex *, integer *, integer *, integer *); + extern void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, integer *, integer - *, complex *, complex *, complex *, integer *), xerbla_( - char *, integer *, ftnlen); + *, complex *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, 
ftnlen); - extern /* Subroutine */ int chegst_(integer *, char *, integer *, complex + extern /* Subroutine */ void chegst_(integer *, char *, integer *, complex *, integer *, complex *, integer *, integer *); logical tquery, wquery; complex piv; @@ -757,7 +758,7 @@ aa_2stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHETRF_AA_2STAGE", &i__1, (ftnlen)16); - return 0; + return; } /* Answer the query */ @@ -775,13 +776,13 @@ aa_2stage.f"> */ } } if (tquery || wquery) { - return 0; + return; } /* Quick return */ if (*n == 0) { - return 0; + return; } /* Determine the number of the block size */ @@ -1282,7 +1283,7 @@ aa_2stage.f"> */ /* Factor the band matrix */ cgbtrf_(n, n, &nb, &nb, &tb[1], &ldtb, &ipiv2[1], info); - return 0; + return; /* End of CHETRF_AA_2STAGE */ diff --git a/lapack-netlib/SRC/chetrf_rk.c b/lapack-netlib/SRC/chetrf_rk.c index 614b1e93d1..953c497f2e 100644 --- a/lapack-netlib/SRC/chetrf_rk.c +++ b/lapack-netlib/SRC/chetrf_rk.c @@ -774,7 +774,7 @@ rk.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int chetrf_rk_(char *uplo, integer *n, complex *a, integer * +/* Subroutine */ void chetrf_rk_(char *uplo, integer *n, complex *a, integer * lda, complex *e, integer *ipiv, complex *work, integer *lwork, integer *info) { @@ -782,15 +782,15 @@ rk.f"> */ integer a_dim1, a_offset, i__1, i__2; /* Local variables */ - extern /* Subroutine */ int chetf2_rk_(char *, integer *, complex *, + extern /* Subroutine */ void chetf2_rk_(char *, integer *, complex *, integer *, complex *, integer *, integer *); integer i__, k; - extern /* Subroutine */ int clahef_rk_(char *, integer *, integer *, + extern /* Subroutine */ void clahef_rk_(char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); extern logical lsame_(char *, char *); integer nbmin, iinfo; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + 
extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); logical upper; integer kb, nb, ip; @@ -848,9 +848,9 @@ rk.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHETRF_RK", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } nbmin = 2; @@ -1036,7 +1036,7 @@ rk.f"> */ } work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CHETRF_RK */ diff --git a/lapack-netlib/SRC/chetrf_rook.c b/lapack-netlib/SRC/chetrf_rook.c index 4850a0aff8..5025d7d275 100644 --- a/lapack-netlib/SRC/chetrf_rook.c +++ b/lapack-netlib/SRC/chetrf_rook.c @@ -728,7 +728,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int chetrf_rook_(char *uplo, integer *n, complex *a, +/* Subroutine */ void chetrf_rook_(char *uplo, integer *n, complex *a, integer *lda, integer *ipiv, complex *work, integer *lwork, integer * info) { @@ -746,10 +746,10 @@ rook.f"> */ integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; logical lquery; - extern /* Subroutine */ int chetf2_rook_(char *, integer *, complex *, + extern /* Subroutine */ void chetf2_rook_(char *, integer *, complex *, integer *, integer *, integer *); integer iws; - extern /* Subroutine */ int clahef_rook_(char *, integer *, integer *, + extern /* Subroutine */ void clahef_rook_(char *, integer *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); @@ -801,9 +801,9 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHETRF_ROOK", &i__1, (ftnlen)11); - return 0; + return; } else if (lquery) { - return 0; + return; } nbmin = 2; @@ -933,7 +933,7 @@ rook.f"> */ L40: work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CHETRF_ROOK */ diff --git a/lapack-netlib/SRC/chetri.c b/lapack-netlib/SRC/chetri.c index bd3bf7a6d5..01a8b298e1 100644 --- a/lapack-netlib/SRC/chetri.c +++ b/lapack-netlib/SRC/chetri.c @@ 
-628,7 +628,7 @@ f"> */ /* > \ingroup complexHEcomputational */ /* ===================================================================== */ -/* Subroutine */ int chetri_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void chetri_(char *uplo, integer *n, complex *a, integer *lda, integer *ipiv, complex *work, integer *info) { /* System generated locals */ @@ -644,7 +644,7 @@ f"> */ extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int chemv_(char *, integer *, complex *, complex * + extern /* Subroutine */ void chemv_(char *, integer *, complex *, complex * , integer *, complex *, integer *, complex *, complex *, integer * ), ccopy_(integer *, complex *, integer *, complex *, integer *), cswap_(integer *, complex *, integer *, complex *, @@ -688,13 +688,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHETRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. 
*/ @@ -706,7 +706,7 @@ f"> */ for (*info = *n; *info >= 1; --(*info)) { i__1 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__1].r == 0.f && a[i__1].i == 0.f)) { - return 0; + return; } /* L10: */ } @@ -718,7 +718,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__2].r == 0.f && a[i__2].i == 0.f)) { - return 0; + return; } /* L20: */ } @@ -1051,7 +1051,7 @@ f"> */ ; } - return 0; + return; /* End of CHETRI */ diff --git a/lapack-netlib/SRC/chetri2.c b/lapack-netlib/SRC/chetri2.c index dc26121d97..529a24fc71 100644 --- a/lapack-netlib/SRC/chetri2.c +++ b/lapack-netlib/SRC/chetri2.c @@ -641,14 +641,14 @@ static integer c_n1 = -1; /* > \ingroup complexHEcomputational */ /* ===================================================================== */ -/* Subroutine */ int chetri2_(char *uplo, integer *n, complex *a, integer * +/* Subroutine */ void chetri2_(char *uplo, integer *n, complex *a, integer * lda, integer *ipiv, complex *work, integer *lwork, integer *info) { /* System generated locals */ integer a_dim1, a_offset, i__1; /* Local variables */ - extern /* Subroutine */ int chetri2x_(char *, integer *, complex *, + extern /* Subroutine */ void chetri2x_(char *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); extern logical lsame_(char *, char *); integer nbmax; @@ -656,7 +656,7 @@ static integer c_n1 = -1; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int chetri_(char *, integer *, complex *, integer + extern /* Subroutine */ void chetri_(char *, integer *, complex *, integer *, integer *, complex *, integer *); logical lquery; integer minsize; @@ -709,13 +709,13 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("CHETRI2", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { work[1].r 
= (real) minsize, work[1].i = 0.f; - return 0; + return; } if (*n == 0) { - return 0; + return; } if (nbmax >= *n) { chetri_(uplo, n, &a[a_offset], lda, &ipiv[1], &work[1], info); @@ -723,7 +723,7 @@ static integer c_n1 = -1; chetri2x_(uplo, n, &a[a_offset], lda, &ipiv[1], &work[1], &nbmax, info); } - return 0; + return; /* End of CHETRI2 */ diff --git a/lapack-netlib/SRC/chetri2x.c b/lapack-netlib/SRC/chetri2x.c index 9daebcf4e2..00869feade 100644 --- a/lapack-netlib/SRC/chetri2x.c +++ b/lapack-netlib/SRC/chetri2x.c @@ -634,7 +634,7 @@ x.f"> */ /* > \ingroup complexHEcomputational */ /* ===================================================================== */ -/* Subroutine */ int chetri2x_(char *uplo, integer *n, complex *a, integer * +/* Subroutine */ void chetri2x_(char *uplo, integer *n, complex *a, integer * lda, integer *ipiv, complex *work, integer *nb, integer *info) { /* System generated locals */ @@ -645,17 +645,17 @@ x.f"> */ /* Local variables */ integer invd; - extern /* Subroutine */ int cheswapr_(char *, integer *, complex *, + extern /* Subroutine */ void cheswapr_(char *, integer *, complex *, integer *, integer *, integer *); complex akkp1, d__; integer i__, j, k; complex t; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrmm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); integer count; @@ -664,11 +664,12 @@ x.f"> */ integer u11; complex u11_i_j__; integer ip; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), ctrtri_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int ctrtri_( char *, char 
*, integer *, complex *, integer *, integer *); integer nnb, cut; complex akp1; - extern /* Subroutine */ int csyconv_(char *, char *, integer *, complex *, + extern /* Subroutine */ void csyconv_(char *, char *, integer *, complex *, integer *, integer *, complex *, integer *); complex u01_ip1_j__, u11_ip1_j__; @@ -710,10 +711,10 @@ x.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHETRI2X", &i__1, (ftnlen)8); - return 0; + return; } if (*n == 0) { - return 0; + return; } /* Convert A */ @@ -731,7 +732,7 @@ x.f"> */ for (*info = *n; *info >= 1; --(*info)) { i__1 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__1].r == 0.f && a[i__1].i == 0.f)) { - return 0; + return; } } } else { @@ -742,7 +743,7 @@ x.f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__2].r == 0.f && a[i__2].i == 0.f)) { - return 0; + return; } } } @@ -1385,7 +1386,7 @@ x.f"> */ } } - return 0; + return; /* End of CHETRI2X */ diff --git a/lapack-netlib/SRC/chetri_3.c b/lapack-netlib/SRC/chetri_3.c index 4b63cc99f5..f174298e46 100644 --- a/lapack-netlib/SRC/chetri_3.c +++ b/lapack-netlib/SRC/chetri_3.c @@ -683,7 +683,7 @@ static integer c_n1 = -1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int chetri_3_(char *uplo, integer *n, complex *a, integer * +/* Subroutine */ void chetri_3_(char *uplo, integer *n, complex *a, integer * lda, complex *e, integer *ipiv, complex *work, integer *lwork, integer *info) { @@ -691,7 +691,7 @@ static integer c_n1 = -1; integer a_dim1, a_offset, i__1, i__2; /* Local variables */ - extern /* Subroutine */ int chetri_3x_(char *, integer *, complex *, + extern /* Subroutine */ void chetri_3x_(char *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); extern logical lsame_(char *, char *); logical upper; @@ -748,16 +748,16 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); 
xerbla_("CHETRI_3", &i__1, (ftnlen)8); - return 0; + return; } else if (lquery) { work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } chetri_3x_(uplo, n, &a[a_offset], lda, &e[1], &ipiv[1], &work[1], &nb, @@ -765,7 +765,7 @@ static integer c_n1 = -1; work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CHETRI_3 */ diff --git a/lapack-netlib/SRC/chetri_3x.c b/lapack-netlib/SRC/chetri_3x.c index 4f061ba10a..145dcccef6 100644 --- a/lapack-netlib/SRC/chetri_3x.c +++ b/lapack-netlib/SRC/chetri_3x.c @@ -673,7 +673,7 @@ static complex c_b2 = {0.f,0.f}; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int chetri_3x_(char *uplo, integer *n, complex *a, integer * +/* Subroutine */ void chetri_3x_(char *uplo, integer *n, complex *a, integer * lda, complex *e, integer *ipiv, complex *work, integer *nb, integer * info) { @@ -685,16 +685,16 @@ static complex c_b2 = {0.f,0.f}; /* Local variables */ integer invd; - extern /* Subroutine */ int cheswapr_(char *, integer *, complex *, + extern /* Subroutine */ void cheswapr_(char *, integer *, complex *, integer *, integer *, integer *); complex akkp1, d__; integer i__, j, k; real t; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrmm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); logical upper; @@ -749,10 +749,10 @@ static complex c_b2 = {0.f,0.f}; if (*info != 0) { i__1 = -(*info); xerbla_("CHETRI_3X", &i__1, (ftnlen)9); - return 0; + return; } if (*n == 0) { 
- return 0; + return; } /* Workspace got Non-diag elements of D */ @@ -773,7 +773,7 @@ static complex c_b2 = {0.f,0.f}; for (*info = *n; *info >= 1; --(*info)) { i__1 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__1].r == 0.f && a[i__1].i == 0.f)) { - return 0; + return; } } } else { @@ -784,7 +784,7 @@ static complex c_b2 = {0.f,0.f}; for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__2].r == 0.f && a[i__2].i == 0.f)) { - return 0; + return; } } } @@ -1426,7 +1426,7 @@ static complex c_b2 = {0.f,0.f}; } - return 0; + return; /* End of CHETRI_3X */ diff --git a/lapack-netlib/SRC/chetri_rook.c b/lapack-netlib/SRC/chetri_rook.c index fd209056a5..ae64d562a4 100644 --- a/lapack-netlib/SRC/chetri_rook.c +++ b/lapack-netlib/SRC/chetri_rook.c @@ -643,7 +643,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int chetri_rook_(char *uplo, integer *n, complex *a, +/* Subroutine */ void chetri_rook_(char *uplo, integer *n, complex *a, integer *lda, integer *ipiv, complex *work, integer *info) { /* System generated locals */ @@ -659,7 +659,7 @@ rook.f"> */ extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int chemv_(char *, integer *, complex *, complex * + extern /* Subroutine */ void chemv_(char *, integer *, complex *, complex * , integer *, complex *, integer *, complex *, complex *, integer * ), ccopy_(integer *, complex *, integer *, complex *, integer *), cswap_(integer *, complex *, integer *, complex *, @@ -703,13 +703,13 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHETRI_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. 
*/ @@ -721,7 +721,7 @@ rook.f"> */ for (*info = *n; *info >= 1; --(*info)) { i__1 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__1].r == 0.f && a[i__1].i == 0.f)) { - return 0; + return; } /* L10: */ } @@ -733,7 +733,7 @@ rook.f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__2].r == 0.f && a[i__2].i == 0.f)) { - return 0; + return; } /* L20: */ } @@ -1232,7 +1232,7 @@ rook.f"> */ ; } - return 0; + return; /* End of CHETRI_ROOK */ diff --git a/lapack-netlib/SRC/chetrs.c b/lapack-netlib/SRC/chetrs.c index 1e17a5e746..e8f259cce3 100644 --- a/lapack-netlib/SRC/chetrs.c +++ b/lapack-netlib/SRC/chetrs.c @@ -634,7 +634,7 @@ f"> */ /* > \ingroup complexHEcomputational */ /* ===================================================================== */ -/* Subroutine */ int chetrs_(char *uplo, integer *n, integer *nrhs, complex * +/* Subroutine */ void chetrs_(char *uplo, integer *n, integer *nrhs, complex * a, integer *lda, integer *ipiv, complex *b, integer *ldb, integer * info) { @@ -648,7 +648,7 @@ f"> */ real s; extern logical lsame_(char *, char *); complex denom; - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *), cgeru_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *), @@ -656,9 +656,9 @@ f"> */ logical upper; complex ak, bk; integer kp; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *), - csscal_(integer *, real *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *), + csscal_(integer *, real *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); complex akm1, bkm1; @@ -697,13 +697,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHETRS", &i__1, 
(ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -1076,7 +1076,7 @@ f"> */ ; } - return 0; + return; /* End of CHETRS */ diff --git a/lapack-netlib/SRC/chetrs2.c b/lapack-netlib/SRC/chetrs2.c index b2be27b6b5..854cfd69c1 100644 --- a/lapack-netlib/SRC/chetrs2.c +++ b/lapack-netlib/SRC/chetrs2.c @@ -639,7 +639,7 @@ static complex c_b1 = {1.f,0.f}; /* > \ingroup complexHEcomputational */ /* ===================================================================== */ -/* Subroutine */ int chetrs2_(char *uplo, integer *n, integer *nrhs, complex * +/* Subroutine */ void chetrs2_(char *uplo, integer *n, integer *nrhs, complex * a, integer *lda, integer *ipiv, complex *b, integer *ldb, complex * work, integer *info) { @@ -654,17 +654,18 @@ static complex c_b1 = {1.f,0.f}; extern logical lsame_(char *, char *); complex denom; integer iinfo; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *), ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); logical upper; complex ak, bk; integer kp; - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer - *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + extern int xerbla_(char *, integer *, ftnlen); complex akm1, bkm1; - extern /* Subroutine */ int csyconv_(char *, char *, integer *, complex *, + extern /* Subroutine */ void csyconv_(char *, char *, integer *, complex *, integer *, integer *, complex *, integer *); @@ -704,13 +705,13 @@ static complex c_b1 = {1.f,0.f}; if (*info != 0) { i__1 = -(*info); xerbla_("CHETRS2", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* Convert A */ @@ -937,7 +938,7 @@ 
static complex c_b1 = {1.f,0.f}; csyconv_(uplo, "R", n, &a[a_offset], lda, &ipiv[1], &work[1], &iinfo); - return 0; + return; /* End of CHETRS2 */ diff --git a/lapack-netlib/SRC/chetrs_3.c b/lapack-netlib/SRC/chetrs_3.c index 7a994c0414..9e46d445cd 100644 --- a/lapack-netlib/SRC/chetrs_3.c +++ b/lapack-netlib/SRC/chetrs_3.c @@ -676,7 +676,7 @@ static complex c_b1 = {1.f,0.f}; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int chetrs_3_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void chetrs_3_(char *uplo, integer *n, integer *nrhs, complex *a, integer *lda, complex *e, integer *ipiv, complex *b, integer *ldb, integer *info) { @@ -690,15 +690,16 @@ static complex c_b1 = {1.f,0.f}; real s; extern logical lsame_(char *, char *); complex denom; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *), ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); logical upper; complex ak, bk; integer kp; - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer - *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + extern int xerbla_(char *, integer *, ftnlen); complex akm1, bkm1; @@ -738,13 +739,13 @@ static complex c_b1 = {1.f,0.f}; if (*info != 0) { i__1 = -(*info); xerbla_("CHETRS_3", &i__1, (ftnlen)8); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -938,7 +939,7 @@ static complex c_b1 = {1.f,0.f}; } - return 0; + return; /* End of CHETRS_3 */ diff --git a/lapack-netlib/SRC/chetrs_aa.c b/lapack-netlib/SRC/chetrs_aa.c index 69819ccdf5..464bc96672 100644 --- a/lapack-netlib/SRC/chetrs_aa.c +++ b/lapack-netlib/SRC/chetrs_aa.c @@ -644,7 +644,7 @@ aa.f"> */ /* > 
\ingroup complexHEcomputational */ /* ===================================================================== */ -/* Subroutine */ int chetrs_aa_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void chetrs_aa_(char *uplo, integer *n, integer *nrhs, complex *a, integer *lda, integer *ipiv, complex *b, integer *ldb, complex *work, integer *lwork, integer *info) { @@ -654,16 +654,17 @@ aa.f"> */ /* Local variables */ integer k; extern logical lsame_(char *, char *); - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *), cgtsv_(integer *, integer *, complex *, complex *, complex *, complex *, integer *, integer *), ctrsm_( char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); logical upper; integer kp; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *), + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *), clacpy_(char *, integer *, integer *, complex *, integer *, - complex *, integer *), xerbla_(char *, integer *, ftnlen); + complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer lwkopt; logical lquery; @@ -712,17 +713,17 @@ aa.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHETRS_AA", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { lwkopt = *n * 3 - 2; work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -864,7 +865,7 @@ aa.f"> */ } - return 0; + return; /* End of CHETRS_AA */ diff --git a/lapack-netlib/SRC/chetrs_aa_2stage.c b/lapack-netlib/SRC/chetrs_aa_2stage.c index ff2693e5f8..d4cca77e5f 100644 --- a/lapack-netlib/SRC/chetrs_aa_2stage.c +++ b/lapack-netlib/SRC/chetrs_aa_2stage.c @@ -655,7 +655,7 @@ aa_2stage.f"> */ /* > \ingroup complexSYcomputational */ /* 
===================================================================== */ -/* Subroutine */ int chetrs_aa_2stage_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void chetrs_aa_2stage_(char *uplo, integer *n, integer *nrhs, complex *a, integer *lda, complex *tb, integer *ltb, integer *ipiv, integer *ipiv2, complex *b, integer *ldb, integer *info) { @@ -665,15 +665,16 @@ aa_2stage.f"> */ /* Local variables */ integer ldtb; extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); logical upper; integer nb; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), cgbtrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void cgbtrs_( char *, integer *, integer *, integer *, integer *, complex *, - integer *, integer *, complex *, integer *, integer *), - claswp_(integer *, complex *, integer *, integer *, integer *, + integer *, integer *, complex *, integer *, integer *); + extern int claswp_(integer *, complex *, integer *, integer *, integer *, integer *, integer *); @@ -717,13 +718,13 @@ aa_2stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHETRS_AA_2STAGE", &i__1, (ftnlen)16); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* Read NB and compute LDTB */ @@ -808,7 +809,7 @@ aa_2stage.f"> */ } } - return 0; + return; /* End of CHETRS_AA_2STAGE */ diff --git a/lapack-netlib/SRC/chetrs_rook.c b/lapack-netlib/SRC/chetrs_rook.c index 936ee3c636..f2c4c30c0e 100644 --- a/lapack-netlib/SRC/chetrs_rook.c +++ b/lapack-netlib/SRC/chetrs_rook.c @@ -650,7 +650,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int chetrs_rook_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void 
chetrs_rook_(char *uplo, integer *n, integer *nrhs, complex *a, integer *lda, integer *ipiv, complex *b, integer *ldb, integer *info) { @@ -664,7 +664,7 @@ rook.f"> */ real s; extern logical lsame_(char *, char *); complex denom; - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *), cgeru_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *), @@ -672,9 +672,9 @@ rook.f"> */ logical upper; complex ak, bk; integer kp; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *), - csscal_(integer *, real *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *), + csscal_(integer *, real *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); complex akm1, bkm1; @@ -713,13 +713,13 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHETRS_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -1114,7 +1114,7 @@ rook.f"> */ ; } - return 0; + return; /* End of CHETRS_ROOK */ diff --git a/lapack-netlib/SRC/chfrk.c b/lapack-netlib/SRC/chfrk.c index 5e0eca2825..3604d0566c 100644 --- a/lapack-netlib/SRC/chfrk.c +++ b/lapack-netlib/SRC/chfrk.c @@ -676,7 +676,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int chfrk_(char *transr, char *uplo, char *trans, integer *n, +/* Subroutine */ void chfrk_(char *transr, char *uplo, char *trans, integer *n, integer *k, real *alpha, complex *a, integer *lda, real *beta, complex *c__) { @@ -688,7 +688,7 @@ static inline void 
zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ integer info, j; complex cbeta; logical normaltransr; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *), cherk_(char *, char *, integer *, integer *, real *, complex *, integer *, real * @@ -749,7 +749,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (info != 0) { i__1 = -info; xerbla_("CHFRK ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ @@ -758,7 +758,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* done (it is in CHERK for example) and left in the general case. */ if (*n == 0 || (*alpha == 0.f || *k == 0) && *beta == 1.f) { - return 0; + return; } if (*alpha == 0.f && *beta == 0.f) { @@ -767,7 +767,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ i__2 = j; c__[i__2].r = 0.f, c__[i__2].i = 0.f; } - return 0; + return; } q__1.r = *alpha, q__1.i = 0.f; @@ -1080,7 +1080,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of CHFRK */ diff --git a/lapack-netlib/SRC/chgeqz.c b/lapack-netlib/SRC/chgeqz.c index cb738aa412..2781ce6302 100644 --- a/lapack-netlib/SRC/chgeqz.c +++ b/lapack-netlib/SRC/chgeqz.c @@ -796,7 +796,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int chgeqz_(char *job, char *compq, char *compz, integer *n, +/* Subroutine */ void chgeqz_(char *job, char *compq, char *compz, integer *n, integer *ilo, integer *ihi, complex *h__, integer *ldh, complex *t, integer *ldt, complex *alpha, complex *beta, complex *q, integer *ldq, complex *z__, integer *ldz, complex *work, integer *lwork, real * @@ -810,12 
+810,12 @@ f"> */ /* Local variables */ real absb, atol, btol, temp; - extern /* Subroutine */ int crot_(integer *, complex *, integer *, + extern /* Subroutine */ void crot_(integer *, complex *, integer *, complex *, integer *, real *, complex *); real temp2, c__; integer j; complex s; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); complex x, y; extern logical lsame_(char *, char *); @@ -835,7 +835,7 @@ f"> */ complex signbc; extern real slamch_(char *), clanhs_(char *, integer *, complex *, integer *, real *); - extern /* Subroutine */ int claset_(char *, integer *, integer *, complex + extern /* Subroutine */ void claset_(char *, integer *, integer *, complex *, complex *, complex *, integer *), clartg_(complex *, complex *, real *, complex *, complex *); real safmin; @@ -954,9 +954,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHGEQZ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -964,7 +964,7 @@ f"> */ /* WORK( 1 ) = CMPLX( 1 ) */ if (*n <= 0) { work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } /* Initialize Q and Z */ @@ -1742,7 +1742,7 @@ f"> */ L210: q__1.r = (real) (*n), q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; /* End of CHGEQZ */ diff --git a/lapack-netlib/SRC/chgeqz.f b/lapack-netlib/SRC/chgeqz.f index 8c1d62a875..50c6827ff9 100644 --- a/lapack-netlib/SRC/chgeqz.f +++ b/lapack-netlib/SRC/chgeqz.f @@ -523,9 +523,7 @@ SUBROUTINE CHGEQZ( JOB, COMPQ, COMPZ, N, ILO, IHI, H, LDH, T, LDT, END IF END IF * - IF( ABS( T( ILAST, ILAST ) ).LE.MAX( SAFMIN, ULP*( - $ ABS( T( ILAST - 1, ILAST ) ) + ABS( T( ILAST-1, ILAST-1 ) - $ ) ) ) ) THEN + IF( ABS( T( ILAST, ILAST ) ).LE.BTOL ) THEN T( ILAST, ILAST ) = CZERO GO TO 50 END IF @@ -551,10 +549,7 @@ SUBROUTINE CHGEQZ( JOB, COMPQ, COMPZ, N, ILO, IHI, H, LDH, T, LDT, * * Test 2: for T(j,j)=0 
* - TEMP = ABS ( T( J, J + 1 ) ) - IF ( J .GT. ILO ) - $ TEMP = TEMP + ABS ( T( J - 1, J ) ) - IF( ABS( T( J, J ) ).LT.MAX( SAFMIN,ULP*TEMP ) ) THEN + IF( ABS( T( J, J ) ).LT.BTOL ) THEN T( J, J ) = CZERO * * Test 1a: Check for 2 consecutive small subdiagonals in A diff --git a/lapack-netlib/SRC/chpcon.c b/lapack-netlib/SRC/chpcon.c index 13aee7d57c..54f4a8159b 100644 --- a/lapack-netlib/SRC/chpcon.c +++ b/lapack-netlib/SRC/chpcon.c @@ -631,7 +631,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int chpcon_(char *uplo, integer *n, complex *ap, integer * +/* Subroutine */ void chpcon_(char *uplo, integer *n, complex *ap, integer * ipiv, real *anorm, real *rcond, complex *work, integer *info) { /* System generated locals */ @@ -642,12 +642,12 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); integer ip; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int chptrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void chptrs_(char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); @@ -680,7 +680,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHPCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -688,9 +688,9 @@ f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm <= 0.f) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. 
*/ @@ -703,7 +703,7 @@ f"> */ for (i__ = *n; i__ >= 1; --i__) { i__1 = ip; if (ipiv[i__] > 0 && (ap[i__1].r == 0.f && ap[i__1].i == 0.f)) { - return 0; + return; } ip -= i__; /* L10: */ @@ -717,7 +717,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { i__2 = ip; if (ipiv[i__] > 0 && (ap[i__2].r == 0.f && ap[i__2].i == 0.f)) { - return 0; + return; } ip = ip + *n - i__ + 1; /* L20: */ @@ -743,7 +743,7 @@ f"> */ *rcond = 1.f / ainvnm / *anorm; } - return 0; + return; /* End of CHPCON */ diff --git a/lapack-netlib/SRC/chpev.c b/lapack-netlib/SRC/chpev.c index b812d29a4a..2a704ec064 100644 --- a/lapack-netlib/SRC/chpev.c +++ b/lapack-netlib/SRC/chpev.c @@ -651,7 +651,7 @@ atrices */ /* > \ingroup complexOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int chpev_(char *jobz, char *uplo, integer *n, complex *ap, +/* Subroutine */ void chpev_(char *jobz, char *uplo, integer *n, complex *ap, real *w, complex *z__, integer *ldz, complex *work, real *rwork, integer *info) { @@ -666,20 +666,20 @@ atrices */ real rmin, rmax, sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); logical wantz; integer iscale; extern real clanhp_(char *, char *, integer *, complex *, real *), slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *); real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; integer indtau; - extern /* Subroutine */ int chptrd_(char *, integer *, complex *, real *, + extern /* Subroutine */ void chptrd_(char *, integer *, complex *, real *, real *, complex *, integer *); integer indrwk, indwrk; - extern /* Subroutine */ int csteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void 
csteqr_(char *, integer *, real *, real *, complex *, integer *, real *, integer *), cupgtr_(char *, integer *, complex *, complex *, complex *, integer *, complex *, integer *), ssterf_(integer *, real *, real *, integer *); @@ -724,13 +724,13 @@ atrices */ if (*info != 0) { i__1 = -(*info); xerbla_("CHPEV ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -740,7 +740,7 @@ atrices */ i__1 = z_dim1 + 1; z__[i__1].r = 1.f, z__[i__1].i = 0.f; } - return 0; + return; } /* Get machine constants. */ @@ -800,7 +800,7 @@ atrices */ sscal_(&imax, &r__1, &w[1], &c__1); } - return 0; + return; /* End of CHPEV */ diff --git a/lapack-netlib/SRC/chpevd.c b/lapack-netlib/SRC/chpevd.c index 3e3ffc5835..9e5224da07 100644 --- a/lapack-netlib/SRC/chpevd.c +++ b/lapack-netlib/SRC/chpevd.c @@ -713,7 +713,7 @@ f"> */ /* > \ingroup complexOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int chpevd_(char *jobz, char *uplo, integer *n, complex *ap, +/* Subroutine */ void chpevd_(char *jobz, char *uplo, integer *n, complex *ap, real *w, complex *z__, integer *ldz, complex *work, integer *lwork, real *rwork, integer *lrwork, integer *iwork, integer *liwork, integer *info) @@ -729,27 +729,27 @@ f"> */ real rmin, rmax, sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer lwmin, llrwk, llwrk; logical wantz; integer iscale; extern real clanhp_(char *, char *, integer *, complex *, real *); - extern /* Subroutine */ int cstedc_(char *, integer *, real *, real *, + extern /* Subroutine */ void cstedc_(char *, integer *, real *, real *, complex *, integer *, complex *, integer *, real *, integer *, integer *, integer *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int 
csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *); real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; integer indtau; - extern /* Subroutine */ int chptrd_(char *, integer *, complex *, real *, + extern /* Subroutine */ void chptrd_(char *, integer *, complex *, real *, real *, complex *, integer *); integer indrwk, indwrk, liwmin; - extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *); + extern /* Subroutine */ void ssterf_(integer *, real *, real *, integer *); integer lrwmin; - extern /* Subroutine */ int cupmtr_(char *, char *, char *, integer *, + extern /* Subroutine */ void cupmtr_(char *, char *, char *, integer *, integer *, complex *, complex *, complex *, integer *, complex *, integer *); real smlnum; @@ -828,15 +828,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHPEVD", &i__1, (ftnlen)5); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -845,7 +845,7 @@ f"> */ i__1 = z_dim1 + 1; z__[i__1].r = 1.f, z__[i__1].i = 0.f; } - return 0; + return; } /* Get machine constants. 
*/ @@ -911,7 +911,7 @@ f"> */ work[1].r = (real) lwmin, work[1].i = 0.f; rwork[1] = (real) lrwmin; iwork[1] = liwmin; - return 0; + return; /* End of CHPEVD */ diff --git a/lapack-netlib/SRC/chpevx.c b/lapack-netlib/SRC/chpevx.c index a7f92e1df8..8435471914 100644 --- a/lapack-netlib/SRC/chpevx.c +++ b/lapack-netlib/SRC/chpevx.c @@ -752,7 +752,7 @@ f"> */ /* > \ingroup complexOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int chpevx_(char *jobz, char *range, char *uplo, integer *n, +/* Subroutine */ void chpevx_(char *jobz, char *range, char *uplo, integer *n, complex *ap, real *vl, real *vu, integer *il, integer *iu, real * abstol, integer *m, real *w, complex *z__, integer *ldz, complex * work, real *rwork, integer *iwork, integer *ifail, integer *info) @@ -771,9 +771,9 @@ f"> */ real sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); char order[1]; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *), scopy_(integer *, real *, integer *, real * , integer *); logical wantz; @@ -783,27 +783,27 @@ f"> */ extern real clanhp_(char *, char *, integer *, complex *, real *); logical valeig; extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *); real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real abstll, bignum; integer indiwk, indisp, indtau; - extern /* Subroutine */ int chptrd_(char *, integer *, complex *, real *, + extern /* Subroutine */ void chptrd_(char *, integer *, complex *, real *, real *, complex *, integer *), cstein_(integer *, real *, real *, integer *, real *, integer 
*, integer *, complex *, integer *, real *, integer *, integer *, integer *); integer indrwk, indwrk; - extern /* Subroutine */ int csteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void csteqr_(char *, integer *, real *, real *, complex *, integer *, real *, integer *), cupgtr_(char *, integer *, complex *, complex *, complex *, integer *, complex *, integer *), ssterf_(integer *, real *, real *, integer *); integer nsplit; - extern /* Subroutine */ int cupmtr_(char *, char *, char *, integer *, + extern /* Subroutine */ void cupmtr_(char *, char *, char *, integer *, integer *, complex *, complex *, complex *, integer *, complex *, integer *); real smlnum; - extern /* Subroutine */ int sstebz_(char *, char *, integer *, real *, + extern /* Subroutine */ void sstebz_(char *, char *, integer *, real *, real *, integer *, integer *, real *, real *, real *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); @@ -870,14 +870,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHPEVX", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -894,7 +894,7 @@ f"> */ i__1 = z_dim1 + 1; z__[i__1].r = 1.f, z__[i__1].i = 0.f; } - return 0; + return; } /* Get machine constants. */ @@ -1063,7 +1063,7 @@ f"> */ } } - return 0; + return; /* End of CHPEVX */ diff --git a/lapack-netlib/SRC/chpevx.f b/lapack-netlib/SRC/chpevx.f index a5af973a73..1f602701a0 100644 --- a/lapack-netlib/SRC/chpevx.f +++ b/lapack-netlib/SRC/chpevx.f @@ -264,7 +264,7 @@ SUBROUTINE CHPEVX( JOBZ, RANGE, UPLO, N, AP, VL, VU, IL, IU, * .. Local Scalars .. 
LOGICAL ALLEIG, INDEIG, TEST, VALEIG, WANTZ CHARACTER ORDER - INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL, + INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, $ INDISP, INDIWK, INDRWK, INDTAU, INDWRK, ISCALE, $ ITMP1, J, JJ, NSPLIT REAL ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, @@ -434,17 +434,16 @@ SUBROUTINE CHPEVX( JOBZ, RANGE, UPLO, N, AP, VL, VU, IL, IU, ELSE ORDER = 'E' END IF - INDIBL = 1 - INDISP = INDIBL + N + INDISP = 1 + N INDIWK = INDISP + N CALL SSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL, $ RWORK( INDD ), RWORK( INDE ), M, NSPLIT, W, - $ IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ), + $ IWORK( 1 ), IWORK( INDISP ), RWORK( INDRWK ), $ IWORK( INDIWK ), INFO ) * IF( WANTZ ) THEN CALL CSTEIN( N, RWORK( INDD ), RWORK( INDE ), M, W, - $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ, + $ IWORK( 1 ), IWORK( INDISP ), Z, LDZ, $ RWORK( INDRWK ), IWORK( INDIWK ), IFAIL, INFO ) * * Apply unitary matrix used in reduction to tridiagonal @@ -482,11 +481,11 @@ SUBROUTINE CHPEVX( JOBZ, RANGE, UPLO, N, AP, VL, VU, IL, IU, 30 CONTINUE * IF( I.NE.0 ) THEN - ITMP1 = IWORK( INDIBL+I-1 ) + ITMP1 = IWORK( 1 + I-1 ) W( I ) = W( J ) - IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 ) + IWORK( 1 + I-1 ) = IWORK( 1 + J-1 ) W( J ) = TMP1 - IWORK( INDIBL+J-1 ) = ITMP1 + IWORK( 1 + J-1 ) = ITMP1 CALL CSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) IF( INFO.NE.0 ) THEN ITMP1 = IFAIL( I ) diff --git a/lapack-netlib/SRC/chpgst.c b/lapack-netlib/SRC/chpgst.c index 923b0b10b7..30898fc849 100644 --- a/lapack-netlib/SRC/chpgst.c +++ b/lapack-netlib/SRC/chpgst.c @@ -627,7 +627,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int chpgst_(integer *itype, char *uplo, integer *n, complex * +/* Subroutine */ void chpgst_(integer *itype, char *uplo, integer *n, complex * ap, complex *bp, integer *info) { /* System generated locals */ @@ -636,25 +636,26 @@ f"> */ complex q__1, q__2, q__3; /* Local 
variables */ - extern /* Subroutine */ int chpr2_(char *, integer *, complex *, complex * + extern /* Subroutine */ void chpr2_(char *, integer *, complex *, complex * , integer *, complex *, integer *, complex *); integer j, k; extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int chpmv_(char *, integer *, complex *, complex * + extern /* Subroutine */ void chpmv_(char *, integer *, complex *, complex * , complex *, integer *, complex *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer *, complex *, integer *), ctpmv_(char *, char *, char *, integer *, complex *, complex *, integer *); logical upper; integer j1, k1; - extern /* Subroutine */ int ctpsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctpsv_(char *, char *, char *, integer *, complex *, complex *, integer *); integer jj, kk; complex ct; - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer - *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + extern int xerbla_(char *, integer *, ftnlen); real ajj; integer j1j1; real akk; @@ -690,7 +691,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHPGST", &i__1, (ftnlen)6); - return 0; + return; } if (*itype == 1) { @@ -854,7 +855,7 @@ f"> */ } } } - return 0; + return; /* End of CHPGST */ diff --git a/lapack-netlib/SRC/chpgv.c b/lapack-netlib/SRC/chpgv.c index a44d81e506..9028fa5d67 100644 --- a/lapack-netlib/SRC/chpgv.c +++ b/lapack-netlib/SRC/chpgv.c @@ -677,7 +677,7 @@ static integer c__1 = 1; /* > \ingroup complexOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int chpgv_(integer *itype, char *jobz, char *uplo, integer * +/* Subroutine */ void chpgv_(integer *itype, char *jobz, char *uplo, integer * n, complex *ap, complex *bp, real *w, complex 
*z__, integer *ldz, complex *work, real *rwork, integer *info) { @@ -687,16 +687,17 @@ static integer c__1 = 1; /* Local variables */ integer neig, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int chpev_(char *, char *, integer *, complex *, + extern /* Subroutine */ void chpev_(char *, char *, integer *, complex *, real *, complex *, integer *, complex *, real *, integer *); char trans[1]; - extern /* Subroutine */ int ctpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctpmv_(char *, char *, char *, integer *, complex *, complex *, integer *); logical upper; - extern /* Subroutine */ int ctpsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctpsv_(char *, char *, char *, integer *, complex *, complex *, integer *); logical wantz; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), chpgst_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void chpgst_( integer *, char *, integer *, complex *, complex *, integer *), cpptrf_(char *, integer *, complex *, integer *); @@ -740,13 +741,13 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("CHPGV ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. */ @@ -754,7 +755,7 @@ static integer c__1 = 1; cpptrf_(uplo, n, &bp[1], info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. */ @@ -808,7 +809,7 @@ static integer c__1 = 1; } } } - return 0; + return; /* End of CHPGV */ diff --git a/lapack-netlib/SRC/chpgvd.c b/lapack-netlib/SRC/chpgvd.c index c60eabcc32..18e1100dfd 100644 --- a/lapack-netlib/SRC/chpgvd.c +++ b/lapack-netlib/SRC/chpgvd.c @@ -743,7 +743,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. 
of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int chpgvd_(integer *itype, char *jobz, char *uplo, integer * +/* Subroutine */ void chpgvd_(integer *itype, char *jobz, char *uplo, integer * n, complex *ap, complex *bp, real *w, complex *z__, integer *ldz, complex *work, integer *lwork, real *rwork, integer *lrwork, integer * iwork, integer *liwork, integer *info) @@ -757,16 +757,17 @@ f"> */ extern logical lsame_(char *, char *); integer lwmin; char trans[1]; - extern /* Subroutine */ int ctpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctpmv_(char *, char *, char *, integer *, complex *, complex *, integer *); logical upper; - extern /* Subroutine */ int ctpsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctpsv_(char *, char *, char *, integer *, complex *, complex *, integer *); logical wantz; - extern /* Subroutine */ int chpevd_(char *, char *, integer *, complex *, + extern /* Subroutine */ void chpevd_(char *, char *, integer *, complex *, real *, complex *, integer *, complex *, integer *, real *, - integer *, integer *, integer *, integer *), - xerbla_(char *, integer *, ftnlen), chpgst_(integer *, char *, + integer *, integer *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void chpgst_(integer *, char *, integer *, complex *, complex *, integer *), cpptrf_(char *, integer *, complex *, integer *); integer liwmin, lrwmin; @@ -847,15 +848,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHPGVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. */ @@ -863,7 +864,7 @@ f"> */ cpptrf_(uplo, n, &bp[1], info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. 
*/ @@ -930,7 +931,7 @@ f"> */ work[1].r = (real) lwmin, work[1].i = 0.f; rwork[1] = (real) lrwmin; iwork[1] = liwmin; - return 0; + return; /* End of CHPGVD */ diff --git a/lapack-netlib/SRC/chpgvd.f b/lapack-netlib/SRC/chpgvd.f index 754be31ed3..65d08b7832 100644 --- a/lapack-netlib/SRC/chpgvd.f +++ b/lapack-netlib/SRC/chpgvd.f @@ -335,9 +335,9 @@ SUBROUTINE CHPGVD( ITYPE, JOBZ, UPLO, N, AP, BP, W, Z, LDZ, WORK, CALL CHPGST( ITYPE, UPLO, N, AP, BP, INFO ) CALL CHPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK, RWORK, $ LRWORK, IWORK, LIWORK, INFO ) - LWMIN = MAX( REAL( LWMIN ), REAL( WORK( 1 ) ) ) - LRWMIN = MAX( REAL( LRWMIN ), REAL( RWORK( 1 ) ) ) - LIWMIN = MAX( REAL( LIWMIN ), REAL( IWORK( 1 ) ) ) + LWMIN = INT( MAX( REAL( LWMIN ), REAL( WORK( 1 ) ) ) ) + LRWMIN = INT( MAX( REAL( LRWMIN ), REAL( RWORK( 1 ) ) ) ) + LIWMIN = INT( MAX( REAL( LIWMIN ), REAL( IWORK( 1 ) ) ) ) * IF( WANTZ ) THEN * diff --git a/lapack-netlib/SRC/chpgvx.c b/lapack-netlib/SRC/chpgvx.c index be34f9af6a..da649dd47f 100644 --- a/lapack-netlib/SRC/chpgvx.c +++ b/lapack-netlib/SRC/chpgvx.c @@ -788,7 +788,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. 
of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int chpgvx_(integer *itype, char *jobz, char *range, char * +/* Subroutine */ void chpgvx_(integer *itype, char *jobz, char *range, char * uplo, integer *n, complex *ap, complex *bp, real *vl, real *vu, integer *il, integer *iu, real *abstol, integer *m, real *w, complex * z__, integer *ldz, complex *work, real *rwork, integer *iwork, @@ -801,13 +801,14 @@ f"> */ integer j; extern logical lsame_(char *, char *); char trans[1]; - extern /* Subroutine */ int ctpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctpmv_(char *, char *, char *, integer *, complex *, complex *, integer *); logical upper; - extern /* Subroutine */ int ctpsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctpsv_(char *, char *, char *, integer *, complex *, complex *, integer *); logical wantz, alleig, indeig, valeig; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), chpgst_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void chpgst_( integer *, char *, integer *, complex *, complex *, integer *), chpevx_(char *, char *, char *, integer *, complex *, real *, real *, integer *, integer *, real *, integer *, real *, complex *, integer *, complex *, real *, integer *, integer *, @@ -878,13 +879,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHPGVX", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. */ @@ -892,7 +893,7 @@ f"> */ cpptrf_(uplo, n, &bp[1], info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. 
*/ @@ -947,7 +948,7 @@ f"> */ } } - return 0; + return; /* End of CHPGVX */ diff --git a/lapack-netlib/SRC/chprfs.c b/lapack-netlib/SRC/chprfs.c index d127189ee5..612976b281 100644 --- a/lapack-netlib/SRC/chprfs.c +++ b/lapack-netlib/SRC/chprfs.c @@ -693,7 +693,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int chprfs_(char *uplo, integer *n, integer *nrhs, complex * +/* Subroutine */ void chprfs_(char *uplo, integer *n, integer *nrhs, complex * ap, complex *afp, integer *ipiv, complex *b, integer *ldb, complex *x, integer *ldx, real *ferr, real *berr, complex *work, real *rwork, integer *info) @@ -710,20 +710,21 @@ f"> */ real s; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), chpmv_(char *, integer *, complex *, complex *, complex *, integer *, complex *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); integer count; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); integer ik, kk; real xk; extern real slamch_(char *); integer nz; real safmin; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), chptrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void chptrs_( char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); real lstres, eps; @@ -772,7 +773,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHPRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -784,7 +785,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ 
@@ -1022,7 +1023,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of CHPRFS */ diff --git a/lapack-netlib/SRC/chpsv.c b/lapack-netlib/SRC/chpsv.c index a9cadb4990..ee137647c7 100644 --- a/lapack-netlib/SRC/chpsv.c +++ b/lapack-netlib/SRC/chpsv.c @@ -671,7 +671,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int chpsv_(char *uplo, integer *n, integer *nrhs, complex * +/* Subroutine */ void chpsv_(char *uplo, integer *n, integer *nrhs, complex * ap, integer *ipiv, complex *b, integer *ldb, integer *info) { /* System generated locals */ @@ -679,7 +679,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), chptrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void chptrf_( char *, integer *, complex *, integer *, integer *), chptrs_(char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); @@ -717,7 +718,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CHPSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the factorization A = U*D*U**H or A = L*D*L**H. 
*/ @@ -730,7 +731,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ chptrs_(uplo, n, nrhs, &ap[1], &ipiv[1], &b[b_offset], ldb, info); } - return 0; + return; /* End of CHPSV */ diff --git a/lapack-netlib/SRC/chpsvx.c b/lapack-netlib/SRC/chpsvx.c index 5ac0ef5011..b6377469ba 100644 --- a/lapack-netlib/SRC/chpsvx.c +++ b/lapack-netlib/SRC/chpsvx.c @@ -789,7 +789,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int chpsvx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void chpsvx_(char *fact, char *uplo, integer *n, integer * nrhs, complex *ap, complex *afp, integer *ipiv, complex *b, integer * ldb, complex *x, integer *ldx, real *rcond, real *ferr, real *berr, complex *work, real *rwork, integer *info) @@ -800,13 +800,15 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); real anorm; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); extern real clanhp_(char *, char *, integer *, complex *, real *), slamch_(char *); logical nofact; - extern /* Subroutine */ int chpcon_(char *, integer *, complex *, integer + extern /* Subroutine */ void chpcon_(char *, integer *, complex *, integer *, real *, real *, complex *, integer *), clacpy_(char *, - integer *, integer *, complex *, integer *, complex *, integer *), xerbla_(char *, integer *, ftnlen), chprfs_(char *, + integer *, integer *, complex *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void chprfs_(char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *, real *, real *, complex *, real * , integer *), chptrf_(char *, integer *, complex *, @@ -860,7 +862,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHPSVX", &i__1, (ftnlen)6); - return 0; + 
return; } if (nofact) { @@ -875,7 +877,7 @@ f"> */ if (*info > 0) { *rcond = 0.f; - return 0; + return; } } @@ -904,7 +906,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of CHPSVX */ diff --git a/lapack-netlib/SRC/chptrd.c b/lapack-netlib/SRC/chptrd.c index 1796bdd879..8a42cd4d1f 100644 --- a/lapack-netlib/SRC/chptrd.c +++ b/lapack-netlib/SRC/chptrd.c @@ -665,7 +665,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int chptrd_(char *uplo, integer *n, complex *ap, real *d__, +/* Subroutine */ void chptrd_(char *uplo, integer *n, complex *ap, real *d__, real *e, complex *tau, integer *info) { /* System generated locals */ @@ -675,22 +675,23 @@ f"> */ /* Local variables */ complex taui; - extern /* Subroutine */ int chpr2_(char *, integer *, complex *, complex * + extern /* Subroutine */ void chpr2_(char *, integer *, complex *, complex * , integer *, complex *, integer *, complex *); integer i__; complex alpha; extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int chpmv_(char *, integer *, complex *, complex * + extern /* Subroutine */ void chpmv_(char *, integer *, complex *, complex * , complex *, integer *, complex *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); integer i1; logical upper; integer ii; - extern /* Subroutine */ int clarfg_(integer *, complex *, complex *, - integer *, complex *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clarfg_(integer *, complex *, complex *, + integer *, complex *); + extern int xerbla_(char *, integer *, ftnlen); integer i1i1; @@ -722,13 +723,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHPTRD", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } if (upper) { @@ -868,7 
+869,7 @@ f"> */ d__[i__1] = ap[i__2].r; } - return 0; + return; /* End of CHPTRD */ diff --git a/lapack-netlib/SRC/chptrf.c b/lapack-netlib/SRC/chptrf.c index 431722909a..8e0c5f1662 100644 --- a/lapack-netlib/SRC/chptrf.c +++ b/lapack-netlib/SRC/chptrf.c @@ -672,7 +672,7 @@ f"> */ /* > J. Lewis, Boeing Computer Services Company */ /* > */ /* ===================================================================== */ -/* Subroutine */ int chptrf_(char *uplo, integer *n, complex *ap, integer * +/* Subroutine */ void chptrf_(char *uplo, integer *n, complex *ap, integer * ipiv, integer *info) { /* System generated locals */ @@ -681,7 +681,7 @@ f"> */ complex q__1, q__2, q__3, q__4, q__5, q__6; /* Local variables */ - extern /* Subroutine */ int chpr_(char *, integer *, real *, complex *, + extern /* Subroutine */ void chpr_(char *, integer *, real *, complex *, integer *, complex *); integer imax, jmax; real d__; @@ -689,7 +689,7 @@ f"> */ complex t; real alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer kstep; logical upper; @@ -704,8 +704,9 @@ f"> */ integer kx; extern integer icamax_(integer *, complex *, integer *); real tt; - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer - *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + extern int xerbla_(char *, integer *, ftnlen); real colmax, rowmax; integer knc, kpc, npp; complex wkm1, wkp1; @@ -737,7 +738,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHPTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. 
*/ @@ -1355,7 +1356,7 @@ f"> */ } L110: - return 0; + return; /* End of CHPTRF */ diff --git a/lapack-netlib/SRC/chptri.c b/lapack-netlib/SRC/chptri.c index b83affabcd..334da92b1c 100644 --- a/lapack-netlib/SRC/chptri.c +++ b/lapack-netlib/SRC/chptri.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int chptri_(char *uplo, integer *n, complex *ap, integer * +/* Subroutine */ void chptri_(char *uplo, integer *n, complex *ap, integer * ipiv, complex *work, integer *info) { /* System generated locals */ @@ -639,7 +639,7 @@ f"> */ extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), chpmv_(char *, integer *, complex *, complex *, complex *, integer *, complex *, complex *, integer *), cswap_(integer *, complex *, integer *, complex *, integer *); @@ -679,13 +679,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHPTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. 
*/ @@ -698,7 +698,7 @@ f"> */ for (*info = *n; *info >= 1; --(*info)) { i__1 = kp; if (ipiv[*info] > 0 && (ap[i__1].r == 0.f && ap[i__1].i == 0.f)) { - return 0; + return; } kp -= *info; /* L10: */ @@ -712,7 +712,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = kp; if (ipiv[*info] > 0 && (ap[i__2].r == 0.f && ap[i__2].i == 0.f)) { - return 0; + return; } kp = kp + *n - *info + 1; /* L20: */ @@ -1050,7 +1050,7 @@ f"> */ ; } - return 0; + return; /* End of CHPTRI */ diff --git a/lapack-netlib/SRC/chptrs.c b/lapack-netlib/SRC/chptrs.c index 89a977fdee..941362df1c 100644 --- a/lapack-netlib/SRC/chptrs.c +++ b/lapack-netlib/SRC/chptrs.c @@ -629,7 +629,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int chptrs_(char *uplo, integer *n, integer *nrhs, complex * +/* Subroutine */ void chptrs_(char *uplo, integer *n, integer *nrhs, complex * ap, integer *ipiv, complex *b, integer *ldb, integer *info) { /* System generated locals */ @@ -642,7 +642,7 @@ f"> */ real s; extern logical lsame_(char *, char *); complex denom; - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *), cgeru_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *), @@ -650,9 +650,9 @@ f"> */ logical upper; complex ak, bk; integer kc, kp; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *), - csscal_(integer *, real *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *), + csscal_(integer *, real *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); complex akm1, bkm1; @@ -687,13 +687,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHPTRS", 
&i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -1075,7 +1075,7 @@ f"> */ ; } - return 0; + return; /* End of CHPTRS */ diff --git a/lapack-netlib/SRC/chsein.c b/lapack-netlib/SRC/chsein.c index 3fddc94cd1..eeaaffb0cd 100644 --- a/lapack-netlib/SRC/chsein.c +++ b/lapack-netlib/SRC/chsein.c @@ -757,7 +757,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int chsein_(char *side, char *eigsrc, char *initv, logical * +/* Subroutine */ void chsein_(char *side, char *eigsrc, char *initv, logical * select, integer *n, complex *h__, integer *ldh, complex *w, complex * vl, integer *ldvl, complex *vr, integer *ldvr, integer *mm, integer * m, complex *work, real *rwork, integer *ifaill, integer *ifailr, @@ -777,7 +777,7 @@ f"> */ logical leftv, bothv; real hnorm; integer kl; - extern /* Subroutine */ int claein_(logical *, logical *, integer *, + extern /* Subroutine */ void claein_(logical *, logical *, integer *, complex *, integer *, complex *, complex *, complex *, integer *, real *, real *, real *, integer *); integer kr, ks; @@ -864,13 +864,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CHSEIN", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ if (*n == 0) { - return 0; + return; } /* Set machine-dependent constants. */ @@ -944,7 +944,7 @@ f"> */ rwork[1]); if (sisnan_(&hnorm)) { *info = -6; - return 0; + return; } else if (hnorm > 0.f) { eps3 = hnorm * ulp; } else { @@ -1021,7 +1021,7 @@ f"> */ /* L100: */ } - return 0; + return; /* End of CHSEIN */ diff --git a/lapack-netlib/SRC/chseqr.c b/lapack-netlib/SRC/chseqr.c index bb8001be43..002313ba2a 100644 --- a/lapack-netlib/SRC/chseqr.c +++ b/lapack-netlib/SRC/chseqr.c @@ -816,7 +816,7 @@ f"> */ /* > of Matrix Analysis, volume 23, pages 948--973, 2002. 
*/ /* ===================================================================== */ -/* Subroutine */ int chseqr_(char *job, char *compz, integer *n, integer *ilo, +/* Subroutine */ void chseqr_(char *job, char *compz, integer *n, integer *ilo, integer *ihi, complex *h__, integer *ldh, complex *w, complex *z__, integer *ldz, complex *work, integer *lwork, integer *info) { @@ -830,20 +830,21 @@ f"> */ /* Local variables */ integer kbot, nmin; extern logical lsame_(char *, char *); - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); logical initz; complex workl[49]; logical wantt, wantz; - extern /* Subroutine */ int claqr0_(logical *, logical *, integer *, + extern /* Subroutine */ void claqr0_(logical *, logical *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); complex hl[2401] /* was [49][49] */; - extern /* Subroutine */ int clahqr_(logical *, logical *, integer *, + extern /* Subroutine */ void clahqr_(logical *, logical *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, integer *, complex *, integer *, integer *), clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, integer *, integer *, complex *, complex - *, complex *, integer *), xerbla_(char *, integer *, ftnlen); + *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); logical lquery; @@ -915,13 +916,13 @@ f"> */ i__1 = -(*info); xerbla_("CHSEQR", &i__1, (ftnlen)6); - return 0; + return; } else if (*n == 0) { /* ==== Quick return in case N = 0; nothing to do. 
==== */ - return 0; + return; } else if (lquery) { @@ -936,7 +937,7 @@ f"> */ r__1 = f2cmax(r__2,r__3); q__1.r = r__1, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; } else { @@ -966,7 +967,7 @@ f"> */ i__1 = *ilo; i__2 = *ilo + *ilo * h_dim1; w[i__1].r = h__[i__2].r, w[i__1].i = h__[i__2].i; - return 0; + return; } /* ==== CLAHQR/CLAQR0 crossover point ==== */ @@ -1050,6 +1051,6 @@ f"> */ /* ==== End of CHSEQR ==== */ - return 0; + return; } /* chseqr_ */ diff --git a/lapack-netlib/SRC/cla_gbamv.c b/lapack-netlib/SRC/cla_gbamv.c index e8a49ab261..a5978c82d4 100644 --- a/lapack-netlib/SRC/cla_gbamv.c +++ b/lapack-netlib/SRC/cla_gbamv.c @@ -694,7 +694,7 @@ mv.f"> */ /* > \ingroup complexGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int cla_gbamv_(integer *trans, integer *m, integer *n, +/* Subroutine */ void cla_gbamv_(integer *trans, integer *m, integer *n, integer *kl, integer *ku, real *alpha, complex *ab, integer *ldab, complex *x, integer *incx, real *beta, real *y, integer *incy) { @@ -754,13 +754,13 @@ mv.f"> */ } if (info != 0) { xerbla_("CLA_GBAMV ", &info, (ftnlen)10); - return 0; + return; } /* Quick return if possible. 
*/ if (*m == 0 || *n == 0 || *alpha == 0.f && *beta == 1.f) { - return 0; + return; } /* Set LENX and LENY, the lengths of the vectors x and y, and set */ @@ -953,7 +953,7 @@ mv.f"> */ } } - return 0; + return; /* End of CLA_GBAMV */ diff --git a/lapack-netlib/SRC/cla_gbrcond_c.c b/lapack-netlib/SRC/cla_gbrcond_c.c index ce023746bd..e6ade54240 100644 --- a/lapack-netlib/SRC/cla_gbrcond_c.c +++ b/lapack-netlib/SRC/cla_gbrcond_c.c @@ -688,10 +688,11 @@ real cla_gbrcond_c_(char *trans, integer *n, integer *kl, integer *ku, extern logical lsame_(char *, char *); integer isave[3]; real anorm; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); integer kd, ke; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), cgbtrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void cgbtrs_( char *, integer *, integer *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); real ainvnm, tmp; diff --git a/lapack-netlib/SRC/cla_gbrcond_x.c b/lapack-netlib/SRC/cla_gbrcond_x.c index 3fc00713fd..2c00c8f761 100644 --- a/lapack-netlib/SRC/cla_gbrcond_x.c +++ b/lapack-netlib/SRC/cla_gbrcond_x.c @@ -680,10 +680,11 @@ real cla_gbrcond_x_(char *trans, integer *n, integer *kl, integer *ku, extern logical lsame_(char *, char *); integer isave[3]; real anorm; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); integer kd, ke; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), cgbtrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void cgbtrs_( char *, integer *, integer *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); real ainvnm, tmp; diff --git 
a/lapack-netlib/SRC/cla_gbrfsx_extended.c b/lapack-netlib/SRC/cla_gbrfsx_extended.c index e193bcce92..f793251d42 100644 --- a/lapack-netlib/SRC/cla_gbrfsx_extended.c +++ b/lapack-netlib/SRC/cla_gbrfsx_extended.c @@ -922,7 +922,7 @@ fsx_extended.f"> */ /* > \ingroup complexGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int cla_gbrfsx_extended_(integer *prec_type__, integer * +/* Subroutine */ void cla_gbrfsx_extended_(integer *prec_type__, integer * trans_type__, integer *n, integer *kl, integer *ku, integer *nrhs, complex *ab, integer *ldab, complex *afb, integer *ldafb, integer * ipiv, logical *colequ, real *c__, complex *b, integer *ldb, complex * @@ -940,20 +940,20 @@ fsx_extended.f"> */ /* Local variables */ real dx_x__, dz_z__; - extern /* Subroutine */ int cla_lin_berr_(integer *, integer *, integer * + extern /* Subroutine */ void cla_lin_berr_(integer *, integer *, integer * , complex *, real *, real *); real ymin; - extern /* Subroutine */ int blas_cgbmv_x_(integer *, integer *, integer * + extern /* Subroutine */ void blas_cgbmv_x_(integer *, integer *, integer * , integer *, integer *, complex *, complex *, integer *, complex * , integer *, complex *, complex *, integer *, integer *); real dxratmax, dzratmax; integer y_prec_state__; - extern /* Subroutine */ int blas_cgbmv2_x_(integer *, integer *, integer + extern /* Subroutine */ void blas_cgbmv2_x_(integer *, integer *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, complex *, integer *, complex *, complex *, integer *, integer *); integer i__, j, m; - extern /* Subroutine */ int cla_gbamv_(integer *, integer *, integer *, + extern /* Subroutine */ void cla_gbamv_(integer *, integer *, integer *, integer *, integer *, real *, complex *, integer *, complex *, integer *, real *, real *, integer *), cgbmv_(char *, integer *, integer *, integer *, integer *, complex *, complex *, integer *, @@ -962,16 +962,16 
@@ fsx_extended.f"> */ real dxrat; logical incr_prec__; real dzrat; - extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, + extern /* Subroutine */ void caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); char trans[1]; real normx, normy, myhugeval, prev_dz_z__, yk; extern real slamch_(char *); - extern /* Subroutine */ int cgbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void cgbtrs_(char *, integer *, integer *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); real final_dx_x__; - extern /* Subroutine */ int cla_wwaddw_(integer *, complex *, complex *, + extern /* Subroutine */ void cla_wwaddw_(integer *, complex *, complex *, complex *); real final_dz_z__, normdx; extern /* Character */ VOID chla_transtype_(char *, integer *); @@ -1020,7 +1020,7 @@ fsx_extended.f"> */ /* Function Body */ if (*info != 0) { - return 0; + return; } chla_transtype_(ch__1, trans_type__); *(unsigned char *)trans = *(unsigned char *)&ch__1[0]; @@ -1266,6 +1266,6 @@ fsx_extended.f"> */ } - return 0; + return; } /* cla_gbrfsx_extended__ */ diff --git a/lapack-netlib/SRC/cla_geamv.c b/lapack-netlib/SRC/cla_geamv.c index 856a6168cb..4b936d3fb4 100644 --- a/lapack-netlib/SRC/cla_geamv.c +++ b/lapack-netlib/SRC/cla_geamv.c @@ -683,7 +683,7 @@ mv.f"> */ /* > \ingroup complexGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int cla_geamv_(integer *trans, integer *m, integer *n, real +/* Subroutine */ void cla_geamv_(integer *trans, integer *m, integer *n, real *alpha, complex *a, integer *lda, complex *x, integer *incx, real * beta, real *y, integer *incy) { @@ -739,13 +739,13 @@ mv.f"> */ } if (info != 0) { xerbla_("CLA_GEAMV ", &info, (ftnlen)10); - return 0; + return; } /* Quick return if possible. 
*/ if (*m == 0 || *n == 0 || *alpha == 0.f && *beta == 1.f) { - return 0; + return; } /* Set LENX and LENY, the lengths of the vectors x and y, and set */ @@ -916,7 +916,7 @@ mv.f"> */ } } - return 0; + return; /* End of CLA_GEAMV */ diff --git a/lapack-netlib/SRC/cla_gercond_c.c b/lapack-netlib/SRC/cla_gercond_c.c index 153a06ca51..d94e84f9ea 100644 --- a/lapack-netlib/SRC/cla_gercond_c.c +++ b/lapack-netlib/SRC/cla_gercond_c.c @@ -669,9 +669,10 @@ real cla_gercond_c_(char *trans, integer *n, complex *a, integer *lda, extern logical lsame_(char *, char *); integer isave[3]; real anorm; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real - *, integer *, integer *), xerbla_(char *, integer *, ftnlen), - cgetrs_(char *, integer *, integer *, complex *, integer *, + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real + *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void cgetrs_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); real ainvnm, tmp; logical notrans; diff --git a/lapack-netlib/SRC/cla_gercond_x.c b/lapack-netlib/SRC/cla_gercond_x.c index f12d87d6e4..ac5eb18679 100644 --- a/lapack-netlib/SRC/cla_gercond_x.c +++ b/lapack-netlib/SRC/cla_gercond_x.c @@ -662,9 +662,10 @@ real cla_gercond_x_(char *trans, integer *n, complex *a, integer *lda, extern logical lsame_(char *, char *); integer isave[3]; real anorm; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real - *, integer *, integer *), xerbla_(char *, integer *, ftnlen), - cgetrs_(char *, integer *, integer *, complex *, integer *, + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real + *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void cgetrs_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); real ainvnm, tmp; logical notrans; diff --git 
a/lapack-netlib/SRC/cla_gerfsx_extended.c b/lapack-netlib/SRC/cla_gerfsx_extended.c index 72915a1940..1adc015bb9 100644 --- a/lapack-netlib/SRC/cla_gerfsx_extended.c +++ b/lapack-netlib/SRC/cla_gerfsx_extended.c @@ -908,7 +908,7 @@ fsx_extended.f"> */ /* > \ingroup complexGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int cla_gerfsx_extended_(integer *prec_type__, integer * +/* Subroutine */ void cla_gerfsx_extended_(integer *prec_type__, integer * trans_type__, integer *n, integer *nrhs, complex *a, integer *lda, complex *af, integer *ldaf, integer *ipiv, logical *colequ, real *c__, complex *b, integer *ldb, complex *y, integer *ldy, real *berr_out__, @@ -926,15 +926,15 @@ fsx_extended.f"> */ /* Local variables */ real dx_x__, dz_z__; - extern /* Subroutine */ int cla_lin_berr_(integer *, integer *, integer * + extern /* Subroutine */ void cla_lin_berr_(integer *, integer *, integer * , complex *, real *, real *); real ymin; - extern /* Subroutine */ int blas_cgemv_x_(integer *, integer *, integer * + extern /* Subroutine */ void blas_cgemv_x_(integer *, integer *, integer * , complex *, complex *, integer *, complex *, integer *, complex * , complex *, integer *, integer *); real dxratmax, dzratmax; integer y_prec_state__, i__, j; - extern /* Subroutine */ int blas_cgemv2_x_(integer *, integer *, integer + extern /* Subroutine */ void blas_cgemv2_x_(integer *, integer *, integer *, complex *, complex *, integer *, complex *, complex *, integer *, complex *, complex *, integer *, integer *), cla_geamv_( integer *, integer *, integer *, real *, complex *, integer *, @@ -945,15 +945,15 @@ fsx_extended.f"> */ real dxrat; logical incr_prec__; real dzrat; - extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, + extern /* Subroutine */ void caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); char trans[1]; real normx, normy, myhugeval, prev_dz_z__, yk; extern real 
slamch_(char *); - extern /* Subroutine */ int cgetrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void cgetrs_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); real final_dx_x__; - extern /* Subroutine */ int cla_wwaddw_(integer *, complex *, complex *, + extern /* Subroutine */ void cla_wwaddw_(integer *, complex *, complex *, complex *); real final_dz_z__, normdx; extern /* Character */ VOID chla_transtype_(char *, integer *); @@ -1002,7 +1002,7 @@ fsx_extended.f"> */ /* Function Body */ if (*info != 0) { - return 0; + return; } chla_transtype_(ch__1, trans_type__); *(unsigned char *)trans = *(unsigned char *)&ch__1[0]; @@ -1247,6 +1247,6 @@ fsx_extended.f"> */ } - return 0; + return; } /* cla_gerfsx_extended__ */ diff --git a/lapack-netlib/SRC/cla_heamv.c b/lapack-netlib/SRC/cla_heamv.c index a7603183e0..cb7a356243 100644 --- a/lapack-netlib/SRC/cla_heamv.c +++ b/lapack-netlib/SRC/cla_heamv.c @@ -687,7 +687,7 @@ mv.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cla_heamv_(integer *uplo, integer *n, real *alpha, +/* Subroutine */ void cla_heamv_(integer *uplo, integer *n, real *alpha, complex *a, integer *lda, complex *x, integer *incx, real *beta, real *y, integer *incy) { @@ -740,13 +740,13 @@ mv.f"> */ } if (info != 0) { xerbla_("CHEMV ", &info, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ if (*n == 0 || *alpha == 0.f && *beta == 1.f) { - return 0; + return; } /* Set up the start points in X and Y. 
*/ @@ -959,7 +959,7 @@ mv.f"> */ } } - return 0; + return; /* End of CLA_HEAMV */ diff --git a/lapack-netlib/SRC/cla_hercond_c.c b/lapack-netlib/SRC/cla_hercond_c.c index ed575863d1..56d922f1ca 100644 --- a/lapack-netlib/SRC/cla_hercond_c.c +++ b/lapack-netlib/SRC/cla_hercond_c.c @@ -666,12 +666,12 @@ real cla_hercond_c_(char *uplo, integer *n, complex *a, integer *lda, integer isave[3]; real anorm; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); logical up; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int chetrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void chetrs_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); real tmp; diff --git a/lapack-netlib/SRC/cla_hercond_x.c b/lapack-netlib/SRC/cla_hercond_x.c index 962b323a07..271eab4ba6 100644 --- a/lapack-netlib/SRC/cla_hercond_x.c +++ b/lapack-netlib/SRC/cla_hercond_x.c @@ -659,12 +659,12 @@ real cla_hercond_x_(char *uplo, integer *n, complex *a, integer *lda, integer isave[3]; real anorm; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); logical up; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int chetrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void chetrs_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); real tmp; diff --git a/lapack-netlib/SRC/cla_herfsx_extended.c b/lapack-netlib/SRC/cla_herfsx_extended.c index c4a42e805c..f721e573f3 100644 --- a/lapack-netlib/SRC/cla_herfsx_extended.c +++ b/lapack-netlib/SRC/cla_herfsx_extended.c @@ -907,7 +907,7 @@ fsx_extended.f"> */ /* 
> \ingroup complexHEcomputational */ /* ===================================================================== */ -/* Subroutine */ int cla_herfsx_extended_(integer *prec_type__, char *uplo, +/* Subroutine */ void cla_herfsx_extended_(integer *prec_type__, char *uplo, integer *n, integer *nrhs, complex *a, integer *lda, complex *af, integer *ldaf, integer *ipiv, logical *colequ, real *c__, complex *b, integer *ldb, complex *y, integer *ldy, real *berr_out__, integer * @@ -924,37 +924,38 @@ fsx_extended.f"> */ /* Local variables */ real dx_x__, dz_z__; - extern /* Subroutine */ int cla_lin_berr_(integer *, integer *, integer * + extern /* Subroutine */ void cla_lin_berr_(integer *, integer *, integer * , complex *, real *, real *); real ymin; - extern /* Subroutine */ int blas_chemv_x_(integer *, integer *, complex * + extern /* Subroutine */ void blas_chemv_x_(integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *, integer *); real dxratmax, dzratmax; integer y_prec_state__, uplo2, i__, j; - extern /* Subroutine */ int blas_chemv2_x_(integer *, integer *, complex + extern /* Subroutine */ void blas_chemv2_x_(integer *, integer *, complex *, complex *, integer *, complex *, complex *, integer *, complex *, complex *, integer *, integer *), cla_heamv_(integer *, integer *, real *, complex *, integer *, complex *, integer *, real *, real *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int chemv_(char *, integer *, complex *, complex * + extern /* Subroutine */ void chemv_(char *, integer *, complex *, complex * , integer *, complex *, integer *, complex *, complex *, integer * ), ccopy_(integer *, complex *, integer *, complex *, integer *); real dxrat; logical incr_prec__; real dzrat; - extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, + extern /* Subroutine */ void caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); logical upper; real 
normx, normy, myhugeval, prev_dz_z__, yk; extern real slamch_(char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), chetrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void chetrs_( char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); real final_dx_x__; - extern /* Subroutine */ int cla_wwaddw_(integer *, complex *, complex *, + extern /* Subroutine */ void cla_wwaddw_(integer *, complex *, complex *, complex *); real final_dz_z__, normdx, prevnormdx; integer cnt; @@ -1021,7 +1022,7 @@ fsx_extended.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CLA_HERFSX_EXTENDED", &i__1, (ftnlen)19); - return 0; + return; } eps = slamch_("Epsilon"); myhugeval = slamch_("Overflow"); @@ -1255,6 +1256,6 @@ fsx_extended.f"> */ } - return 0; + return; } /* cla_herfsx_extended__ */ diff --git a/lapack-netlib/SRC/cla_lin_berr.c b/lapack-netlib/SRC/cla_lin_berr.c index 73aa2f4d9e..e4a356f01f 100644 --- a/lapack-netlib/SRC/cla_lin_berr.c +++ b/lapack-netlib/SRC/cla_lin_berr.c @@ -610,7 +610,7 @@ _berr.f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cla_lin_berr_(integer *n, integer *nz, integer *nrhs, +/* Subroutine */ void cla_lin_berr_(integer *n, integer *nz, integer *nrhs, complex *res, real *ayb, real *berr) { /* System generated locals */ @@ -675,6 +675,6 @@ _berr.f"> */ } } - return 0; + return; } /* cla_lin_berr__ */ diff --git a/lapack-netlib/SRC/cla_porcond_c.c b/lapack-netlib/SRC/cla_porcond_c.c index a3ef77bbf1..c8c18f1022 100644 --- a/lapack-netlib/SRC/cla_porcond_c.c +++ b/lapack-netlib/SRC/cla_porcond_c.c @@ -658,12 +658,12 @@ real cla_porcond_c_(char *uplo, integer *n, complex *a, integer *lda, integer isave[3]; real anorm; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, 
complex *, real *, integer *, integer *); logical up; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int cpotrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void cpotrs_(char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); real tmp; diff --git a/lapack-netlib/SRC/cla_porcond_x.c b/lapack-netlib/SRC/cla_porcond_x.c index d06cfc2e84..8491049238 100644 --- a/lapack-netlib/SRC/cla_porcond_x.c +++ b/lapack-netlib/SRC/cla_porcond_x.c @@ -651,12 +651,12 @@ real cla_porcond_x_(char *uplo, integer *n, complex *a, integer *lda, integer isave[3]; real anorm; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); logical up; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int cpotrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void cpotrs_(char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); real tmp; diff --git a/lapack-netlib/SRC/cla_porfsx_extended.c b/lapack-netlib/SRC/cla_porfsx_extended.c index a27c20ef3e..4346519f50 100644 --- a/lapack-netlib/SRC/cla_porfsx_extended.c +++ b/lapack-netlib/SRC/cla_porfsx_extended.c @@ -900,7 +900,7 @@ fsx_extended.f"> */ /* > \ingroup complexPOcomputational */ /* ===================================================================== */ -/* Subroutine */ int cla_porfsx_extended_(integer *prec_type__, char *uplo, +/* Subroutine */ void cla_porfsx_extended_(integer *prec_type__, char *uplo, integer *n, integer *nrhs, complex *a, integer *lda, complex *af, integer *ldaf, logical *colequ, real *c__, complex *b, integer *ldb, complex *y, integer *ldy, real *berr_out__, integer *n_norms__, real * @@ -916,36 +916,36 @@ fsx_extended.f"> */ /* Local variables */ real dx_x__, dz_z__; - extern /* 
Subroutine */ int cla_lin_berr_(integer *, integer *, integer * + extern /* Subroutine */ void cla_lin_berr_(integer *, integer *, integer * , complex *, real *, real *); real ymin; - extern /* Subroutine */ int blas_chemv_x_(integer *, integer *, complex * + extern /* Subroutine */ void blas_chemv_x_(integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *, integer *); real dxratmax, dzratmax; integer y_prec_state__, uplo2, i__, j; - extern /* Subroutine */ int blas_chemv2_x_(integer *, integer *, complex + extern /* Subroutine */ void blas_chemv2_x_(integer *, integer *, complex *, complex *, integer *, complex *, complex *, integer *, complex *, complex *, integer *, integer *), cla_heamv_(integer *, integer *, real *, complex *, integer *, complex *, integer *, real *, real *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int chemv_(char *, integer *, complex *, complex * + extern /* Subroutine */ void chemv_(char *, integer *, complex *, complex * , integer *, complex *, integer *, complex *, complex *, integer * ), ccopy_(integer *, complex *, integer *, complex *, integer *); real dxrat; logical incr_prec__; real dzrat; - extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, + extern /* Subroutine */ void caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); real normx, normy, myhugeval, prev_dz_z__, yk; extern real slamch_(char *); real final_dx_x__; - extern /* Subroutine */ int cla_wwaddw_(integer *, complex *, complex *, + extern /* Subroutine */ void cla_wwaddw_(integer *, complex *, complex *, complex *); real final_dz_z__, normdx; - extern /* Subroutine */ int cpotrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void cpotrs_(char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); real prevnormdx; integer cnt; @@ -992,7 +992,7 @@ fsx_extended.f"> */ /* Function Body */ if (*info != 0) { 
- return 0; + return; } eps = slamch_("Epsilon"); myhugeval = slamch_("Overflow"); @@ -1225,6 +1225,6 @@ fsx_extended.f"> */ } - return 0; + return; } /* cla_porfsx_extended__ */ diff --git a/lapack-netlib/SRC/cla_syamv.c b/lapack-netlib/SRC/cla_syamv.c index a86ea19071..bfd3a03a31 100644 --- a/lapack-netlib/SRC/cla_syamv.c +++ b/lapack-netlib/SRC/cla_syamv.c @@ -688,7 +688,7 @@ mv.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cla_syamv_(integer *uplo, integer *n, real *alpha, +/* Subroutine */ void cla_syamv_(integer *uplo, integer *n, real *alpha, complex *a, integer *lda, complex *x, integer *incx, real *beta, real *y, integer *incy) { @@ -741,13 +741,13 @@ mv.f"> */ } if (info != 0) { xerbla_("CLA_SYAMV", &info, (ftnlen)9); - return 0; + return; } /* Quick return if possible. */ if (*n == 0 || *alpha == 0.f && *beta == 1.f) { - return 0; + return; } /* Set up the start points in X and Y. */ @@ -960,7 +960,7 @@ mv.f"> */ } } - return 0; + return; /* End of CLA_SYAMV */ diff --git a/lapack-netlib/SRC/cla_syrcond_c.c b/lapack-netlib/SRC/cla_syrcond_c.c index 3e5e36b32d..21b09af462 100644 --- a/lapack-netlib/SRC/cla_syrcond_c.c +++ b/lapack-netlib/SRC/cla_syrcond_c.c @@ -666,12 +666,12 @@ real cla_syrcond_c_(char *uplo, integer *n, complex *a, integer *lda, integer isave[3]; real anorm; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); logical up; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int csytrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void csytrs_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); real tmp; diff --git a/lapack-netlib/SRC/cla_syrcond_x.c b/lapack-netlib/SRC/cla_syrcond_x.c index 
cbcee5ee2c..f14c0925a1 100644 --- a/lapack-netlib/SRC/cla_syrcond_x.c +++ b/lapack-netlib/SRC/cla_syrcond_x.c @@ -659,12 +659,12 @@ real cla_syrcond_x_(char *uplo, integer *n, complex *a, integer *lda, integer isave[3]; real anorm; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); logical up; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int csytrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void csytrs_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); real tmp; diff --git a/lapack-netlib/SRC/cla_syrfsx_extended.c b/lapack-netlib/SRC/cla_syrfsx_extended.c index 3d2e57fbe4..f76de38553 100644 --- a/lapack-netlib/SRC/cla_syrfsx_extended.c +++ b/lapack-netlib/SRC/cla_syrfsx_extended.c @@ -907,7 +907,7 @@ fsx_extended.f"> */ /* > \ingroup complexSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int cla_syrfsx_extended_(integer *prec_type__, char *uplo, +/* Subroutine */ void cla_syrfsx_extended_(integer *prec_type__, char *uplo, integer *n, integer *nrhs, complex *a, integer *lda, complex *af, integer *ldaf, integer *ipiv, logical *colequ, real *c__, complex *b, integer *ldb, complex *y, integer *ldy, real *berr_out__, integer * @@ -924,26 +924,26 @@ fsx_extended.f"> */ /* Local variables */ real dx_x__, dz_z__; - extern /* Subroutine */ int cla_lin_berr_(integer *, integer *, integer * + extern /* Subroutine */ void cla_lin_berr_(integer *, integer *, integer * , complex *, real *, real *); real ymin, dxratmax, dzratmax; integer y_prec_state__; - extern /* Subroutine */ int blas_csymv_x_(integer *, integer *, complex * + extern /* Subroutine */ void blas_csymv_x_(integer *, integer *, complex * , complex *, integer *, complex *, integer *, 
complex *, complex * , integer *, integer *); integer uplo2, i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int blas_csymv2_x_(integer *, integer *, complex + extern /* Subroutine */ void blas_csymv2_x_(integer *, integer *, complex *, complex *, integer *, complex *, complex *, integer *, complex *, complex *, integer *, integer *), ccopy_(integer *, complex *, integer *, complex *, integer *); real dxrat; logical incr_prec__; real dzrat; - extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, + extern /* Subroutine */ void caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); logical upper; - extern /* Subroutine */ int cla_syamv_(integer *, integer *, real *, + extern /* Subroutine */ void cla_syamv_(integer *, integer *, real *, complex *, integer *, complex *, integer *, real *, real *, integer *), csymv_(char *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); @@ -951,10 +951,10 @@ fsx_extended.f"> */ extern real slamch_(char *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real final_dx_x__; - extern /* Subroutine */ int cla_wwaddw_(integer *, complex *, complex *, + extern /* Subroutine */ void cla_wwaddw_(integer *, complex *, complex *, complex *); real final_dz_z__, normdx; - extern /* Subroutine */ int csytrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void csytrs_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); real prevnormdx; integer cnt; @@ -1021,7 +1021,7 @@ fsx_extended.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CLA_SYRFSX_EXTENDED", &i__1, (ftnlen)19); - return 0; + return; } eps = slamch_("Epsilon"); myhugeval = slamch_("Overflow"); @@ -1255,6 +1255,6 @@ fsx_extended.f"> */ } - return 0; + return; } /* cla_syrfsx_extended__ */ diff --git a/lapack-netlib/SRC/cla_wwaddw.c b/lapack-netlib/SRC/cla_wwaddw.c index 4fa11ff39a..34f2f6a78c 100644 --- 
a/lapack-netlib/SRC/cla_wwaddw.c +++ b/lapack-netlib/SRC/cla_wwaddw.c @@ -590,7 +590,7 @@ ddw.f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cla_wwaddw_(integer *n, complex *x, complex *y, complex +/* Subroutine */ void cla_wwaddw_(integer *n, complex *x, complex *y, complex *w) { /* System generated locals */ @@ -638,6 +638,6 @@ ddw.f"> */ x[i__2].r = s.r, x[i__2].i = s.i; /* L10: */ } - return 0; + return; } /* cla_wwaddw__ */ diff --git a/lapack-netlib/SRC/clabrd.c b/lapack-netlib/SRC/clabrd.c index 65bd6678f1..08cbe07dc5 100644 --- a/lapack-netlib/SRC/clabrd.c +++ b/lapack-netlib/SRC/clabrd.c @@ -726,7 +726,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clabrd_(integer *m, integer *n, integer *nb, complex *a, +/* Subroutine */ void clabrd_(integer *m, integer *n, integer *nb, complex *a, integer *lda, real *d__, real *e, complex *tauq, complex *taup, complex *x, integer *ldx, complex *y, integer *ldy) { @@ -738,7 +738,7 @@ f"> */ /* Local variables */ integer i__; complex alpha; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), cgemv_(char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *), clarfg_(integer *, complex *, complex *, @@ -773,7 +773,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0) { - return 0; + return; } if (*m >= *n) { @@ -1068,7 +1068,7 @@ f"> */ /* L20: */ } } - return 0; + return; /* End of CLABRD */ diff --git a/lapack-netlib/SRC/clacgv.c b/lapack-netlib/SRC/clacgv.c index db4ca1ad64..1b8411e5e7 100644 --- a/lapack-netlib/SRC/clacgv.c +++ b/lapack-netlib/SRC/clacgv.c @@ -583,7 +583,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* 
===================================================================== */ -/* Subroutine */ int clacgv_(integer *n, complex *x, integer *incx) +/* Subroutine */ void clacgv_(integer *n, complex *x, integer *incx) { /* System generated locals */ integer i__1, i__2; @@ -628,7 +628,7 @@ f"> */ /* L20: */ } } - return 0; + return; /* End of CLACGV */ diff --git a/lapack-netlib/SRC/clacn2.c b/lapack-netlib/SRC/clacn2.c index f8f454a1e6..e8ba4b0c70 100644 --- a/lapack-netlib/SRC/clacn2.c +++ b/lapack-netlib/SRC/clacn2.c @@ -647,7 +647,7 @@ f"> */ /* > ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clacn2_(integer *n, complex *v, complex *x, real *est, +/* Subroutine */ void clacn2_(integer *n, complex *v, complex *x, real *est, integer *kase, integer *isave) { /* System generated locals */ @@ -660,7 +660,7 @@ f"> */ integer i__; real absxi; integer jlast; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); extern integer icmax1_(integer *, complex *, integer *); extern real scsum1_(integer *, complex *, integer *), slamch_(char *); @@ -694,7 +694,7 @@ f"> */ } *kase = 1; isave[1] = 1; - return 0; + return; } switch (isave[1]) { @@ -735,7 +735,7 @@ f"> */ } *kase = 2; isave[1] = 2; - return 0; + return; /* ................ ENTRY (ISAVE( 1 ) = 2) */ /* FIRST ITERATION. X HAS BEEN OVERWRITTEN BY CTRANS(A)*X. */ @@ -757,7 +757,7 @@ f"> */ x[i__1].r = 1.f, x[i__1].i = 0.f; *kase = 1; isave[1] = 3; - return 0; + return; /* ................ ENTRY (ISAVE( 1 ) = 3) */ /* X HAS BEEN OVERWRITTEN BY A*X. */ @@ -790,7 +790,7 @@ f"> */ } *kase = 2; isave[1] = 4; - return 0; + return; /* ................ ENTRY (ISAVE( 1 ) = 4) */ /* X HAS BEEN OVERWRITTEN BY CTRANS(A)*X. 
*/ @@ -818,7 +818,7 @@ f"> */ } *kase = 1; isave[1] = 5; - return 0; + return; /* ................ ENTRY (ISAVE( 1 ) = 5) */ /* X HAS BEEN OVERWRITTEN BY A*X. */ @@ -832,7 +832,7 @@ f"> */ L130: *kase = 0; - return 0; + return; /* End of CLACN2 */ diff --git a/lapack-netlib/SRC/clacon.c b/lapack-netlib/SRC/clacon.c index 505656461f..243ecd11be 100644 --- a/lapack-netlib/SRC/clacon.c +++ b/lapack-netlib/SRC/clacon.c @@ -628,7 +628,7 @@ f"> */ /* > ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clacon_(integer *n, complex *v, complex *x, real *est, +/* Subroutine */ void clacon_(integer *n, complex *v, complex *x, real *est, integer *kase) { /* System generated locals */ @@ -642,7 +642,7 @@ f"> */ static integer jump, i__, j; static real absxi; static integer jlast; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); extern integer icmax1_(integer *, complex *, integer *); extern real scsum1_(integer *, complex *, integer *), slamch_(char *); @@ -675,7 +675,7 @@ f"> */ } *kase = 1; jump = 1; - return 0; + return; } switch (jump) { @@ -716,7 +716,7 @@ f"> */ } *kase = 2; jump = 2; - return 0; + return; /* ................ ENTRY (JUMP = 2) */ /* FIRST ITERATION. X HAS BEEN OVERWRITTEN BY CTRANS(A)*X. */ @@ -738,7 +738,7 @@ f"> */ x[i__1].r = 1.f, x[i__1].i = 0.f; *kase = 1; jump = 3; - return 0; + return; /* ................ ENTRY (JUMP = 3) */ /* X HAS BEEN OVERWRITTEN BY A*X. */ @@ -771,7 +771,7 @@ f"> */ } *kase = 2; jump = 4; - return 0; + return; /* ................ ENTRY (JUMP = 4) */ /* X HAS BEEN OVERWRITTEN BY CTRANS(A)*X. */ @@ -799,7 +799,7 @@ f"> */ } *kase = 1; jump = 5; - return 0; + return; /* ................ ENTRY (JUMP = 5) */ /* X HAS BEEN OVERWRITTEN BY A*X. 
*/ @@ -813,7 +813,7 @@ f"> */ L130: *kase = 0; - return 0; + return; /* End of CLACON */ diff --git a/lapack-netlib/SRC/clacp2.c b/lapack-netlib/SRC/clacp2.c index fb53cd0141..f41ab98183 100644 --- a/lapack-netlib/SRC/clacp2.c +++ b/lapack-netlib/SRC/clacp2.c @@ -613,7 +613,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int clacp2_(char *uplo, integer *m, integer *n, real *a, +/* Subroutine */ void clacp2_(char *uplo, integer *m, integer *n, real *a, integer *lda, complex *b, integer *ldb) { /* System generated locals */ @@ -682,7 +682,7 @@ f"> */ } } - return 0; + return; /* End of CLACP2 */ diff --git a/lapack-netlib/SRC/clacpy.c b/lapack-netlib/SRC/clacpy.c index 301460b6ea..e2b988a937 100644 --- a/lapack-netlib/SRC/clacpy.c +++ b/lapack-netlib/SRC/clacpy.c @@ -612,7 +612,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int clacpy_(char *uplo, integer *m, integer *n, complex *a, +/* Subroutine */ void clacpy_(char *uplo, integer *m, integer *n, complex *a, integer *lda, complex *b, integer *ldb) { /* System generated locals */ @@ -681,7 +681,7 @@ f"> */ } } - return 0; + return; /* End of CLACPY */ diff --git a/lapack-netlib/SRC/clacrm.c b/lapack-netlib/SRC/clacrm.c index d4d6b0f97c..f4633af64d 100644 --- a/lapack-netlib/SRC/clacrm.c +++ b/lapack-netlib/SRC/clacrm.c @@ -628,7 +628,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int clacrm_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void clacrm_(integer *m, integer *n, complex *a, integer *lda, real *b, integer *ldb, complex *c__, integer *ldc, real *rwork) { /* System generated locals */ @@ -639,7 +639,7 @@ f"> */ /* Local variables */ integer i__, j, l; - extern /* Subroutine */ int sgemm_(char *, 
char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); @@ -669,7 +669,7 @@ f"> */ /* Function Body */ if (*m == 0 || *n == 0) { - return 0; + return; } i__1 = *n; @@ -724,7 +724,7 @@ f"> */ /* L80: */ } - return 0; + return; /* End of CLACRM */ diff --git a/lapack-netlib/SRC/clacrt.c b/lapack-netlib/SRC/clacrt.c index cef37e6eb9..9f89ba906b 100644 --- a/lapack-netlib/SRC/clacrt.c +++ b/lapack-netlib/SRC/clacrt.c @@ -614,7 +614,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int clacrt_(integer *n, complex *cx, integer *incx, complex * +/* Subroutine */ void clacrt_(integer *n, complex *cx, integer *incx, complex * cy, integer *incy, complex *c__, complex *s) { /* System generated locals */ @@ -642,7 +642,7 @@ f"> */ /* Function Body */ if (*n <= 0) { - return 0; + return; } if (*incx == 1 && *incy == 1) { goto L20; @@ -683,7 +683,7 @@ f"> */ iy += *incy; /* L10: */ } - return 0; + return; /* Code for both increments equal to 1 */ @@ -711,6 +711,6 @@ f"> */ cx[i__2].r = ctemp.r, cx[i__2].i = ctemp.i; /* L30: */ } - return 0; + return; } /* clacrt_ */ diff --git a/lapack-netlib/SRC/cladiv.c b/lapack-netlib/SRC/cladiv.c index adee25b4b7..fb8c84aff4 100644 --- a/lapack-netlib/SRC/cladiv.c +++ b/lapack-netlib/SRC/cladiv.c @@ -583,7 +583,7 @@ f"> */ /* Local variables */ real zi, zr; - extern /* Subroutine */ int sladiv_(real *, real *, real *, real *, real * + extern /* Subroutine */ void sladiv_(real *, real *, real *, real *, real * , real *); diff --git a/lapack-netlib/SRC/claed0.c b/lapack-netlib/SRC/claed0.c index 21e4083977..185f648444 100644 --- a/lapack-netlib/SRC/claed0.c +++ b/lapack-netlib/SRC/claed0.c @@ -661,7 +661,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* 
===================================================================== */ -/* Subroutine */ int claed0_(integer *qsiz, integer *n, real *d__, real *e, +/* Subroutine */ void claed0_(integer *qsiz, integer *n, real *d__, real *e, complex *q, integer *ldq, complex *qstore, integer *ldqs, real *rwork, integer *iwork, integer *info) { @@ -672,19 +672,19 @@ f"> */ /* Local variables */ real temp; integer curr, i__, j, k, iperm; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); integer indxq, iwrem; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); integer iqptr; - extern /* Subroutine */ int claed7_(integer *, integer *, integer *, + extern /* Subroutine */ void claed7_(integer *, integer *, integer *, integer *, integer *, integer *, real *, complex *, integer *, real *, integer *, real *, integer *, integer *, integer *, integer *, integer *, real *, complex *, real *, integer *, integer *); integer tlvls, ll, iq; - extern /* Subroutine */ int clacrm_(integer *, integer *, complex *, + extern /* Subroutine */ void clacrm_(integer *, integer *, complex *, integer *, real *, integer *, complex *, integer *, real *); integer igivcl; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -692,7 +692,7 @@ f"> */ integer *, integer *, ftnlen, ftnlen); integer igivnm, submat, curprb, subpbs, igivpt, curlvl, matsiz, iprmpt, smlsiz; - extern /* Subroutine */ int ssteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssteqr_(char *, integer *, real *, real *, real *, integer *, real *, integer *); integer lgn, msd2, smm1, spm1, spm2; @@ -741,13 +741,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CLAED0", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } 
smlsiz = ilaenv_(&c__9, "CLAED0", " ", &c__0, &c__0, &c__0, &c__0, ( @@ -796,10 +796,10 @@ f"> */ temp = log((real) (*n)) / log(2.f); lgn = (integer) temp; - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } iprmpt = indxq + *n + 1; @@ -847,7 +847,7 @@ f"> */ ++curr; if (*info > 0) { *info = submat * (*n + 1) + submat + matsiz - 1; - return 0; + return; } k = 1; i__2 = iwork[i__ + 1]; @@ -897,7 +897,7 @@ f"> */ q_dim1 + 1], &rwork[iwrem], &iwork[subpbs + 1], info); if (*info > 0) { *info = submat * (*n + 1) + submat + matsiz - 1; - return 0; + return; } iwork[i__ / 2 + 1] = iwork[i__ + 2]; /* L90: */ @@ -922,7 +922,7 @@ f"> */ } scopy_(n, &rwork[1], &c__1, &d__[1], &c__1); - return 0; + return; /* End of CLAED0 */ diff --git a/lapack-netlib/SRC/claed7.c b/lapack-netlib/SRC/claed7.c index 49fc9ed4b0..2c9aef5a97 100644 --- a/lapack-netlib/SRC/claed7.c +++ b/lapack-netlib/SRC/claed7.c @@ -762,7 +762,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int claed7_(integer *n, integer *cutpnt, integer *qsiz, +/* Subroutine */ void claed7_(integer *n, integer *cutpnt, integer *qsiz, integer *tlvls, integer *curlvl, integer *curpbm, real *d__, complex * q, integer *ldq, real *rho, integer *indxq, real *qstore, integer * qptr, integer *prmptr, integer *perm, integer *givptr, integer * @@ -774,7 +774,7 @@ f"> */ /* Local variables */ integer indx, curr, i__, k, indxc, indxp, n1, n2; - extern /* Subroutine */ int claed8_(integer *, integer *, integer *, + extern /* Subroutine */ void claed8_(integer *, integer *, integer *, complex *, integer *, real *, real *, integer *, real *, real *, complex *, integer *, real *, integer *, integer *, integer *, integer *, integer *, integer *, real *, integer *), slaed9_( @@ -784,10 +784,11 @@ f"> */ integer *, integer *, integer *, real *, real 
*, integer *, real * , real *, integer *); integer idlmda, iq, iw; - extern /* Subroutine */ int clacrm_(integer *, integer *, complex *, + extern /* Subroutine */ void clacrm_(integer *, integer *, complex *, integer *, real *, integer *, complex *, integer *, real *); integer iz; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slamrg_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slamrg_( integer *, integer *, real *, integer *, integer *, integer *); integer coltyp, ptr; @@ -838,13 +839,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CLAED7", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* The following values are for bookkeeping purposes only. They are */ @@ -864,11 +865,11 @@ f"> */ /* Form the z-vector which consists of the last row of Q_1 and the */ /* first row of Q_2. */ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = *tlvls - i__; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L10: */ } curr = ptr + *curpbm; @@ -906,7 +907,7 @@ f"> */ i__1 = k; qptr[curr + 1] = qptr[curr] + i__1 * i__1; if (*info != 0) { - return 0; + return; } /* Prepare the INDXQ sorting premutation. 
*/ @@ -923,7 +924,7 @@ f"> */ } } - return 0; + return; /* End of CLAED7 */ diff --git a/lapack-netlib/SRC/claed8.c b/lapack-netlib/SRC/claed8.c index 1bd9439cb6..f6c3a3106d 100644 --- a/lapack-netlib/SRC/claed8.c +++ b/lapack-netlib/SRC/claed8.c @@ -741,7 +741,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int claed8_(integer *k, integer *n, integer *qsiz, complex * +/* Subroutine */ void claed8_(integer *k, integer *n, integer *qsiz, complex * q, integer *ldq, real *d__, real *rho, integer *cutpnt, real *z__, real *dlamda, complex *q2, integer *ldq2, real *w, integer *indxp, integer *indx, integer *indxq, integer *perm, integer *givptr, @@ -756,22 +756,22 @@ f"> */ real c__; integer i__, j; real s, t; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), ccopy_(integer *, complex *, integer *, complex *, integer *), csrot_(integer *, complex *, integer *, complex *, integer *, real *, real *); integer k2; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); integer n1, n2; extern real slapy2_(real *, real *); integer jp; extern real slamch_(char *); - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex - *, integer *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex + *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer isamax_(integer *, real *, integer *); - extern /* Subroutine */ int slamrg_(integer *, integer *, real *, integer + extern /* Subroutine */ void slamrg_(integer *, integer *, real *, integer *, integer *, integer *); integer n1p1; real eps, tau, tol; @@ -823,7 +823,7 @@ f"> */ if 
(*info != 0) { i__1 = -(*info); xerbla_("CLAED8", &i__1, (ftnlen)6); - return 0; + return; } /* Need to initialize GIVPTR to O here in case of quick exit */ @@ -836,7 +836,7 @@ f"> */ /* Quick return if possible */ if (*n == 0) { - return 0; + return; } n1 = *cutpnt; @@ -902,7 +902,7 @@ f"> */ /* L50: */ } clacpy_("A", qsiz, n, &q2[q2_dim1 + 1], ldq2, &q[q_dim1 + 1], ldq); - return 0; + return; } /* If there are multiple eigenvalues then the problem deflates. Here */ @@ -1036,7 +1036,7 @@ f"> */ 1) * q_dim1 + 1], ldq); } - return 0; + return; /* End of CLAED8 */ diff --git a/lapack-netlib/SRC/claein.c b/lapack-netlib/SRC/claein.c index 3ecbcb982b..21b2d45fe0 100644 --- a/lapack-netlib/SRC/claein.c +++ b/lapack-netlib/SRC/claein.c @@ -662,7 +662,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int claein_(logical *rightv, logical *noinit, integer *n, +/* Subroutine */ void claein_(logical *rightv, logical *noinit, integer *n, complex *h__, integer *ldh, complex *w, complex *v, complex *b, integer *ldb, real *rwork, real *eps3, real *smlnum, integer *info) { @@ -683,7 +683,7 @@ f"> */ complex ei, ej; extern integer icamax_(integer *, complex *, integer *); extern /* Complex */ VOID cladiv_(complex *, complex *, complex *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *), clatrs_(char *, char *, char *, char *, integer *, complex *, integer *, complex *, real *, real *, integer *); extern real scasum_(integer *, complex *, integer *); @@ -953,7 +953,7 @@ f"> */ abs(r__2))); csscal_(n, &r__3, &v[1], &c__1); - return 0; + return; /* End of CLAEIN */ diff --git a/lapack-netlib/SRC/claesy.c b/lapack-netlib/SRC/claesy.c index 79d4047da5..cd217e71dc 100644 --- a/lapack-netlib/SRC/claesy.c +++ b/lapack-netlib/SRC/claesy.c @@ -629,7 +629,7 @@ f"> */ /* > \ingroup 
complexSYauxiliary */ /* ===================================================================== */ -/* Subroutine */ int claesy_(complex *a, complex *b, complex *c__, complex * +/* Subroutine */ void claesy_(complex *a, complex *b, complex *c__, complex * rt1, complex *rt2, complex *evscal, complex *cs1, complex *sn1) { /* System generated locals */ @@ -750,7 +750,7 @@ f"> */ evscal->r = 0.f, evscal->i = 0.f; } } - return 0; + return; /* End of CLAESY */ diff --git a/lapack-netlib/SRC/claev2.c b/lapack-netlib/SRC/claev2.c index 00fe82cae7..c1aaaae937 100644 --- a/lapack-netlib/SRC/claev2.c +++ b/lapack-netlib/SRC/claev2.c @@ -632,7 +632,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int claev2_(complex *a, complex *b, complex *c__, real *rt1, +/* Subroutine */ void claev2_(complex *a, complex *b, complex *c__, real *rt1, real *rt2, real *cs1, complex *sn1) { /* System generated locals */ @@ -642,7 +642,7 @@ f"> */ /* Local variables */ real t; complex w; - extern /* Subroutine */ int slaev2_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slaev2_(real *, real *, real *, real *, real * , real *, real *); @@ -669,7 +669,7 @@ f"> */ slaev2_(&r__1, &r__2, &r__3, rt1, rt2, cs1, &t); q__1.r = t * w.r, q__1.i = t * w.i; sn1->r = q__1.r, sn1->i = q__1.i; - return 0; + return; /* End of CLAEV2 */ diff --git a/lapack-netlib/SRC/clag2z.c b/lapack-netlib/SRC/clag2z.c index d30f9d5385..98bf82b8df 100644 --- a/lapack-netlib/SRC/clag2z.c +++ b/lapack-netlib/SRC/clag2z.c @@ -612,7 +612,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int clag2z_(integer *m, integer *n, complex *sa, integer * +/* Subroutine */ void clag2z_(integer *m, integer *n, complex *sa, integer * ldsa, doublecomplex *a, integer *lda, integer *info) { /* System generated locals */ @@ -652,7 +652,7 @@ f"> */ 
} /* L20: */ } - return 0; + return; /* End of CLAG2Z */ diff --git a/lapack-netlib/SRC/clags2.c b/lapack-netlib/SRC/clags2.c index 8090c1d84e..2a58b634e4 100644 --- a/lapack-netlib/SRC/clags2.c +++ b/lapack-netlib/SRC/clags2.c @@ -668,7 +668,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int clags2_(logical *upper, real *a1, complex *a2, real *a3, +/* Subroutine */ void clags2_(logical *upper, real *a1, complex *a2, real *a3, real *b1, complex *b2, real *b3, real *csu, complex *snu, real *csv, complex *snv, real *csq, complex *snq) { @@ -683,7 +683,7 @@ f"> */ real d__; complex r__, d1; real s1, s2, fb, fc; - extern /* Subroutine */ int slasv2_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slasv2_(real *, real *, real *, real *, real * , real *, real *, real *, real *), clartg_(complex *, complex *, real *, complex *, complex *); complex ua11, ua12, ua21, ua22, vb11, vb12, vb21, vb22; @@ -1028,7 +1028,7 @@ f"> */ } - return 0; + return; /* End of CLAGS2 */ diff --git a/lapack-netlib/SRC/clagtm.c b/lapack-netlib/SRC/clagtm.c index a4911f2900..d1db98464d 100644 --- a/lapack-netlib/SRC/clagtm.c +++ b/lapack-netlib/SRC/clagtm.c @@ -654,7 +654,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int clagtm_(char *trans, integer *n, integer *nrhs, real * +/* Subroutine */ void clagtm_(char *trans, integer *n, integer *nrhs, real * alpha, complex *dl, complex *d__, complex *du, complex *x, integer * ldx, real *beta, complex *b, integer *ldb) { @@ -690,7 +690,7 @@ f"> */ /* Function Body */ if (*n == 0) { - return 0; + return; } /* Multiply B by BETA if BETA.NE.1. 
*/ @@ -1160,7 +1160,7 @@ f"> */ } } } - return 0; + return; /* End of CLAGTM */ diff --git a/lapack-netlib/SRC/clahef.c b/lapack-netlib/SRC/clahef.c index 57b7546d51..1cf2bdbba9 100644 --- a/lapack-netlib/SRC/clahef.c +++ b/lapack-netlib/SRC/clahef.c @@ -692,7 +692,7 @@ f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int clahef_(char *uplo, integer *n, integer *nb, integer *kb, +/* Subroutine */ void clahef_(char *uplo, integer *n, integer *nb, integer *kb, complex *a, integer *lda, integer *ipiv, complex *w, integer *ldw, integer *info) { @@ -704,11 +704,11 @@ f"> */ /* Local variables */ integer imax, jmax, j, k; real t, alpha; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *), ccopy_(integer *, complex *, integer *, complex *, integer *), cswap_(integer *, complex *, integer *, @@ -718,10 +718,10 @@ f"> */ complex d11, d21, d22; integer jb, jj, kk, jp, kp; real absakk; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *); integer kw; extern integer icamax_(integer *, complex *, integer *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *); real colmax, rowmax; integer kkw; @@ -1730,7 +1730,7 @@ f"> */ *kb = k - 1; } - return 0; + return; /* End of CLAHEF */ diff --git a/lapack-netlib/SRC/clahef_aa.c b/lapack-netlib/SRC/clahef_aa.c 
index 65808ee9ce..535a770e1a 100644 --- a/lapack-netlib/SRC/clahef_aa.c +++ b/lapack-netlib/SRC/clahef_aa.c @@ -658,7 +658,7 @@ aa.f"> */ /* > \ingroup complexSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int clahef_aa_(char *uplo, integer *j1, integer *m, integer +/* Subroutine */ void clahef_aa_(char *uplo, integer *j1, integer *m, integer *nb, complex *a, integer *lda, integer *ipiv, complex *h__, integer * ldh, complex *work) { @@ -670,19 +670,19 @@ aa.f"> */ /* Local variables */ integer j, k; complex alpha; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *), ccopy_(integer *, complex *, integer *, complex *, integer *), cswap_(integer *, complex *, integer *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); integer i1, k1, i2, mj; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *); extern integer icamax_(integer *, complex *, integer *); - extern /* Subroutine */ int claset_(char *, integer *, integer *, complex + extern /* Subroutine */ void claset_(char *, integer *, integer *, complex *, complex *, complex *, integer *); complex piv; @@ -1097,7 +1097,7 @@ aa.f"> */ L40: ; } - return 0; + return; /* End of CLAHEF_AA */ diff --git a/lapack-netlib/SRC/clahef_rk.c b/lapack-netlib/SRC/clahef_rk.c index db17fe19f5..9b3d074537 100644 --- a/lapack-netlib/SRC/clahef_rk.c +++ b/lapack-netlib/SRC/clahef_rk.c @@ -776,7 +776,7 @@ rk.f"> */ /* > \endverbatim */ /* 
===================================================================== */ -/* Subroutine */ int clahef_rk_(char *uplo, integer *n, integer *nb, integer +/* Subroutine */ void clahef_rk_(char *uplo, integer *n, integer *nb, integer *kb, complex *a, integer *lda, complex *e, integer *ipiv, complex *w, integer *ldw, integer *info) { @@ -789,29 +789,29 @@ rk.f"> */ logical done; integer imax, jmax, j, k, p; real t, alpha; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *); real sfmin; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); integer itemp; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer kstep; real stemp, r1; complex d11, d21, d22; integer jb, ii, jj, kk, kp; real absakk; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *); integer kw; extern integer icamax_(integer *, complex *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *); real colmax, rowmax; integer kkw; @@ -2028,7 +2028,7 @@ rk.f"> */ *kb = k - 1; } - return 0; + return; /* End of CLAHEF_RK */ diff --git a/lapack-netlib/SRC/clahef_rook.c 
b/lapack-netlib/SRC/clahef_rook.c index 0b1b8218bb..ed3e91ecfc 100644 --- a/lapack-netlib/SRC/clahef_rook.c +++ b/lapack-netlib/SRC/clahef_rook.c @@ -698,7 +698,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int clahef_rook_(char *uplo, integer *n, integer *nb, +/* Subroutine */ void clahef_rook_(char *uplo, integer *n, integer *nb, integer *kb, complex *a, integer *lda, integer *ipiv, complex *w, integer *ldw, integer *info) { @@ -711,29 +711,29 @@ rook.f"> */ logical done; integer imax, jmax, j, k, p; real t, alpha; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *); real sfmin; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); integer itemp; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer kstep; real stemp, r1; complex d11, d21, d22; integer jb, ii, jj, kk, kp; real absakk; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *); integer kw; extern integer icamax_(integer *, complex *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *); real colmax; 
integer jp1, jp2; @@ -1971,7 +1971,7 @@ rook.f"> */ *kb = k - 1; } - return 0; + return; /* End of CLAHEF_ROOK */ diff --git a/lapack-netlib/SRC/clahqr.c b/lapack-netlib/SRC/clahqr.c index 3ebf092aa9..a862b32bc1 100644 --- a/lapack-netlib/SRC/clahqr.c +++ b/lapack-netlib/SRC/clahqr.c @@ -708,7 +708,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clahqr_(logical *wantt, logical *wantz, integer *n, +/* Subroutine */ void clahqr_(logical *wantt, logical *wantz, integer *n, integer *ilo, integer *ihi, complex *h__, integer *ldh, complex *w, integer *iloz, integer *ihiz, complex *z__, integer *ldz, integer * info) @@ -723,7 +723,7 @@ f"> */ integer i__, j, k, l, m; real s; complex t, u, v[2], x, y; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), ccopy_(integer *, complex *, integer *, complex *, integer *); integer itmax; @@ -737,7 +737,7 @@ f"> */ real h21; complex h22, sc; integer nh; - extern /* Subroutine */ int slabad_(real *, real *), clarfg_(integer *, + extern /* Subroutine */ void slabad_(real *, real *), clarfg_(integer *, complex *, complex *, integer *, complex *); extern /* Complex */ VOID cladiv_(complex *, complex *, complex *); extern real slamch_(char *); @@ -775,13 +775,13 @@ f"> */ /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*ilo == *ihi) { i__1 = *ilo; i__2 = *ilo + *ilo * h_dim1; w[i__1].r = h__[i__2].r, w[i__1].i = h__[i__2].i; - return 0; + return; } /* ==== clear out the trash ==== */ @@ -1296,7 +1296,7 @@ f"> */ /* Failure to converge in remaining number of iterations */ *info = i__; - return 0; + return; L140: @@ -1312,7 +1312,7 @@ f"> */ goto L30; L150: - return 0; + return; /* End of CLAHQR */ diff --git a/lapack-netlib/SRC/clahr2.c b/lapack-netlib/SRC/clahr2.c index 00bd4a8a04..09f3ccc7cc 100644 --- 
a/lapack-netlib/SRC/clahr2.c +++ b/lapack-netlib/SRC/clahr2.c @@ -699,7 +699,7 @@ f"> */ /* > Mathematical Software, 32(2):180-194, June 2006. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clahr2_(integer *n, integer *k, integer *nb, complex *a, +/* Subroutine */ void clahr2_(integer *n, integer *k, integer *nb, complex *a, integer *lda, complex *tau, complex *t, integer *ldt, complex *y, integer *ldy) { @@ -710,7 +710,7 @@ f"> */ /* Local variables */ integer i__; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), cgemm_(char *, char *, integer *, integer *, integer * , complex *, complex *, integer *, complex *, integer *, complex * , complex *, integer *), cgemv_(char *, integer *, @@ -723,7 +723,7 @@ f"> */ integer *), ctrmv_(char *, char *, char *, integer *, complex *, integer *, complex *, integer *); complex ei; - extern /* Subroutine */ int clarfg_(integer *, complex *, complex *, + extern /* Subroutine */ void clarfg_(integer *, complex *, complex *, integer *, complex *), clacgv_(integer *, complex *, integer *), clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *); @@ -754,7 +754,7 @@ f"> */ /* Function Body */ if (*n <= 1) { - return 0; + return; } i__1 = *nb; @@ -897,7 +897,7 @@ f"> */ ctrmm_("RIGHT", "Upper", "NO TRANSPOSE", "NON-UNIT", k, nb, &c_b2, &t[ t_offset], ldt, &y[y_offset], ldy); - return 0; + return; /* End of CLAHR2 */ diff --git a/lapack-netlib/SRC/claic1.c b/lapack-netlib/SRC/claic1.c index 91028074e6..bde27f3761 100644 --- a/lapack-netlib/SRC/claic1.c +++ b/lapack-netlib/SRC/claic1.c @@ -648,7 +648,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int claic1_(integer *job, integer *j, complex *x, real *sest, +/* Subroutine */ void claic1_(integer *job, 
integer *j, complex *x, real *sest, complex *w, complex *gamma, real *sestpr, complex *s, complex *c__) { /* System generated locals */ @@ -721,7 +721,7 @@ f"> */ c__->r = q__1.r, c__->i = q__1.i; *sestpr = s1 * tmp; } - return 0; + return; } else if (absgam <= eps * absest) { s->r = 1.f, s->i = 0.f; c__->r = 0.f, c__->i = 0.f; @@ -729,7 +729,7 @@ f"> */ s1 = absest / tmp; s2 = absalp / tmp; *sestpr = tmp * sqrt(s1 * s1 + s2 * s2); - return 0; + return; } else if (absalp <= eps * absest) { s1 = absgam; s2 = absest; @@ -742,7 +742,7 @@ f"> */ c__->r = 1.f, c__->i = 0.f; *sestpr = s1; } - return 0; + return; } else if (absest <= eps * absalp || absest <= eps * absgam) { s1 = absgam; s2 = absalp; @@ -767,7 +767,7 @@ f"> */ q__1.r = q__2.r / scl, q__1.i = q__2.i / scl; c__->r = q__1.r, c__->i = q__1.i; } - return 0; + return; } else { /* normal case */ @@ -816,7 +816,7 @@ f"> */ q__1.r = cosine.r / tmp, q__1.i = cosine.i / tmp; c__->r = q__1.r, c__->i = q__1.i; *sestpr = sqrt(t + 1.f) * absest; - return 0; + return; } } else if (*job == 2) { @@ -857,12 +857,12 @@ f"> */ s->r = q__1.r, s->i = q__1.i; q__1.r = c__->r / tmp, q__1.i = c__->i / tmp; c__->r = q__1.r, c__->i = q__1.i; - return 0; + return; } else if (absgam <= eps * absest) { s->r = 0.f, s->i = 0.f; c__->r = 1.f, c__->i = 0.f; *sestpr = absgam; - return 0; + return; } else if (absalp <= eps * absest) { s1 = absgam; s2 = absest; @@ -875,7 +875,7 @@ f"> */ c__->r = 0.f, c__->i = 0.f; *sestpr = s2; } - return 0; + return; } else if (absest <= eps * absalp || absest <= eps * absgam) { s1 = absgam; s2 = absalp; @@ -906,7 +906,7 @@ f"> */ q__1.r = q__2.r / scl, q__1.i = q__2.i / scl; c__->r = q__1.r, c__->i = q__1.i; } - return 0; + return; } else { /* normal case */ @@ -989,11 +989,11 @@ f"> */ s->r = q__1.r, s->i = q__1.i; q__1.r = cosine.r / tmp, q__1.i = cosine.i / tmp; c__->r = q__1.r, c__->i = q__1.i; - return 0; + return; } } - return 0; + return; /* End of CLAIC1 */ diff --git a/lapack-netlib/SRC/clals0.c 
b/lapack-netlib/SRC/clals0.c index ee2b4367d9..b2d57e34e9 100644 --- a/lapack-netlib/SRC/clals0.c +++ b/lapack-netlib/SRC/clals0.c @@ -786,7 +786,7 @@ f"> */ /* > Osni Marques, LBNL/NERSC, USA \n */ /* ===================================================================== */ -/* Subroutine */ int clals0_(integer *icompq, integer *nl, integer *nr, +/* Subroutine */ void clals0_(integer *icompq, integer *nl, integer *nr, integer *sqre, integer *nrhs, complex *b, integer *ldb, complex *bx, integer *ldbx, integer *perm, integer *givptr, integer *givcol, integer *ldgcol, real *givnum, integer *ldgnum, real *poles, real * @@ -807,16 +807,17 @@ f"> */ extern real snrm2_(integer *, real *, integer *); integer i__, j, m, n; real diflj, difrj, dsigj; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), sgemv_(char *, integer *, integer *, real * , real *, integer *, real *, integer *, real *, real *, integer *), csrot_(integer *, complex *, integer *, complex *, integer *, real *, real *); extern real slamc3_(real *, real *); real dj; - extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *), csscal_(integer *, real *, complex *, integer *), clacpy_(char *, integer *, integer *, complex *, integer *, - complex *, integer *), xerbla_(char *, integer *, ftnlen); + complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real dsigjp; integer nlp1; @@ -887,7 +888,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CLALS0", &i__1, (ftnlen)6); - return 0; + return; } m = n + *sqre; @@ -1155,7 +1156,7 @@ f"> */ } } - return 0; + return; /* End of CLALS0 */ diff --git a/lapack-netlib/SRC/clalsa.c b/lapack-netlib/SRC/clalsa.c index 4bc3830a9c..711d1a868b 100644 --- a/lapack-netlib/SRC/clalsa.c +++ 
b/lapack-netlib/SRC/clalsa.c @@ -779,7 +779,7 @@ f"> */ /* > Osni Marques, LBNL/NERSC, USA \n */ /* ===================================================================== */ -/* Subroutine */ int clalsa_(integer *icompq, integer *smlsiz, integer *n, +/* Subroutine */ void clalsa_(integer *icompq, integer *smlsiz, integer *n, integer *nrhs, complex *b, integer *ldb, complex *bx, integer *ldbx, real *u, integer *ldu, real *vt, integer *k, real *difl, real *difr, real *z__, real *poles, integer *givptr, integer *givcol, integer * @@ -796,20 +796,21 @@ f"> */ /* Local variables */ integer jcol, nlvl, sqre, jrow, i__, j, jimag, jreal, inode, ndiml; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer ndimr; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); integer i1; - extern /* Subroutine */ int clals0_(integer *, integer *, integer *, + extern /* Subroutine */ void clals0_(integer *, integer *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, integer *, integer *, integer *, integer *, real *, integer *, real *, real *, real *, real *, integer *, real *, real *, real *, integer *); integer ic, lf, nd, ll, nl, nr; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slasdt_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slasdt_( integer *, integer *, integer *, integer *, integer *, integer *, integer *); integer im1, nlf, nrf, lvl, ndb1, nlp1, lvl2, nrp1; @@ -890,7 +891,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CLALSA", &i__1, (ftnlen)6); - return 0; + return; } /* Book-keeping and setting up the computation tree. 
*/ @@ -1051,7 +1052,7 @@ f"> */ /* Finally go through the left singular vector matrices of all */ /* the other subproblems bottom-up on the tree. */ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); sqre = 0; for (lvl = nlvl; lvl >= 1; --lvl) { @@ -1065,7 +1066,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; @@ -1110,7 +1111,7 @@ f"> */ ll = 1; } else { i__2 = lvl - 1; - lf = pow_ii(&c__2, &i__2); + lf = pow_ii(c__2, i__2); ll = (lf << 1) - 1; } i__2 = lf; @@ -1270,7 +1271,7 @@ f"> */ L330: - return 0; + return; /* End of CLALSA */ diff --git a/lapack-netlib/SRC/clalsd.c b/lapack-netlib/SRC/clalsd.c index 2fab22a070..bc6135f63c 100644 --- a/lapack-netlib/SRC/clalsd.c +++ b/lapack-netlib/SRC/clalsd.c @@ -702,7 +702,7 @@ f"> */ /* > Osni Marques, LBNL/NERSC, USA \n */ /* ===================================================================== */ -/* Subroutine */ int clalsd_(char *uplo, integer *smlsiz, integer *n, integer +/* Subroutine */ void clalsd_(char *uplo, integer *smlsiz, integer *n, integer *nrhs, real *d__, real *e, complex *b, integer *ldb, real *rcond, integer *rank, complex *work, real *rwork, integer *iwork, integer * info) @@ -719,41 +719,43 @@ f"> */ k; real r__; integer s, u, jimag, z__, jreal; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer irwib; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); integer poles, sizei, irwrb, nsize; - extern /* Subroutine */ int csrot_(integer *, complex *, integer *, + extern /* Subroutine */ void csrot_(integer *, complex *, integer *, complex *, integer *, real *, real *); integer irwvt, icmpq1, icmpq2; real cs; integer 
bx; - extern /* Subroutine */ int clalsa_(integer *, integer *, integer *, + extern /* Subroutine */ void clalsa_(integer *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, real *, integer *, real *, integer *, real *, real *, real *, real *, integer *, integer *, integer *, integer *, real *, real *, real * , real *, integer *, integer *); real sn; - extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, real *, integer *, integer *, complex *, integer *, integer *); integer st; extern real slamch_(char *); - extern /* Subroutine */ int slasda_(integer *, integer *, integer *, + extern /* Subroutine */ void slasda_(integer *, integer *, integer *, integer *, real *, real *, real *, integer *, real *, integer *, real *, real *, real *, real *, integer *, integer *, integer *, integer *, real *, real *, real *, real *, integer *, integer *); integer vt; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, - integer *, integer *, complex *, complex *, complex *, integer *), xerbla_(char *, integer *, ftnlen), slascl_(char *, + integer *, integer *, complex *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real * , integer *, integer *); extern integer isamax_(integer *, real *, integer *); integer givcol; - extern /* Subroutine */ int slasdq_(char *, integer *, integer *, integer + extern /* Subroutine */ void slasdq_(char *, integer *, integer *, integer *, integer *, integer *, real *, real *, real *, integer *, real * , integer *, real *, integer *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, @@ -762,7 +764,7 @@ f"> */ real 
orgnrm; integer givnum; extern real slanst_(char *, integer *, real *, real *); - extern /* Subroutine */ int slasrt_(char *, integer *, real *, integer *); + extern /* Subroutine */ void slasrt_(char *, integer *, real *, integer *); integer givptr, nm1, nrwork, irwwrk, smlszp, st1; real eps; integer iwk; @@ -803,7 +805,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CLALSD", &i__1, (ftnlen)6); - return 0; + return; } eps = slamch_("Epsilon"); @@ -821,7 +823,7 @@ f"> */ /* Quick return if possible. */ if (*n == 0) { - return 0; + return; } else if (*n == 1) { if (d__[1] == 0.f) { claset_("A", &c__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); @@ -831,7 +833,7 @@ f"> */ b_offset], ldb, info); d__[1] = abs(d__[1]); } - return 0; + return; } /* Rotate the matrix if it is lower bidiagonal. */ @@ -874,7 +876,7 @@ f"> */ orgnrm = slanst_("M", n, &d__[1], &e[1]); if (orgnrm == 0.f) { claset_("A", n, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); - return 0; + return; } slascl_("G", &c__0, &c__0, &orgnrm, &c_b10, n, &c__1, &d__[1], n, info); @@ -896,7 +898,7 @@ f"> */ slasdq_("U", &c__0, n, n, n, &c__0, &d__[1], &e[1], &rwork[irwvt], n, &rwork[irwu], n, &rwork[irwwrk], &c__1, &rwork[irwwrk], info); if (*info != 0) { - return 0; + return; } /* In the real version, B is passed to SLASDQ and multiplied */ @@ -1021,7 +1023,7 @@ f"> */ clascl_("G", &c__0, &c__0, &orgnrm, &c_b10, n, nrhs, &b[b_offset], ldb, info); - return 0; + return; } /* Book-keeping and setting up some constants. 
*/ @@ -1121,7 +1123,7 @@ f"> */ rwork[nrwork], &c__1, &rwork[nrwork], info) ; if (*info != 0) { - return 0; + return; } /* In the real version, B is passed to SLASDQ and multiplied */ @@ -1189,7 +1191,7 @@ f"> */ givnum + st1], &rwork[c__ + st1], &rwork[s + st1], & rwork[nrwork], &iwork[iwk], info); if (*info != 0) { - return 0; + return; } bxst = bx + st1; clalsa_(&icmpq2, smlsiz, &nsize, nrhs, &b[st + b_dim1], ldb, & @@ -1200,7 +1202,7 @@ f"> */ st1], &rwork[givnum + st1], &rwork[c__ + st1], &rwork[ s + st1], &rwork[nrwork], &iwork[iwk], info); if (*info != 0) { - return 0; + return; } } st = i__ + 1; @@ -1306,7 +1308,7 @@ f"> */ givnum + st1], &rwork[c__ + st1], &rwork[s + st1], &rwork[ nrwork], &iwork[iwk], info); if (*info != 0) { - return 0; + return; } } /* L320: */ @@ -1319,7 +1321,7 @@ f"> */ clascl_("G", &c__0, &c__0, &orgnrm, &c_b10, n, nrhs, &b[b_offset], ldb, info); - return 0; + return; /* End of CLALSD */ diff --git a/lapack-netlib/SRC/clamswlq.c b/lapack-netlib/SRC/clamswlq.c index 085587ee14..ed84b30749 100644 --- a/lapack-netlib/SRC/clamswlq.c +++ b/lapack-netlib/SRC/clamswlq.c @@ -713,7 +713,7 @@ static integer c__0 = 0; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clamswlq_(char *side, char *trans, integer *m, integer * +/* Subroutine */ void clamswlq_(char *side, char *trans, integer *m, integer * n, integer *k, integer *mb, integer *nb, complex *a, integer *lda, complex *t, integer *ldt, complex *c__, integer *ldc, complex *work, integer *lwork, integer *info) @@ -731,7 +731,7 @@ static integer c__0 = 0; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical notran, lquery; integer ctr; - extern /* Subroutine */ int cgemlqt_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemlqt_(char *, char *, integer *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer 
*), ctpmlqt_(char *, char *, integer *, integer *, integer *, integer @@ -799,10 +799,10 @@ static integer c__0 = 0; i__1 = -(*info); xerbla_("CLAMSWLQ", &i__1, (ftnlen)8); work[1].r = (real) lw, work[1].i = 0.f; - return 0; + return; } else if (lquery) { work[1].r = (real) lw, work[1].i = 0.f; - return 0; + return; } /* Quick return if possible */ @@ -810,7 +810,7 @@ static integer c__0 = 0; /* Computing MIN */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*k) == 0) { - return 0; + return; } /* Computing MAX */ @@ -818,7 +818,7 @@ static integer c__0 = 0; if (*nb <= *k || *nb >= f2cmax(i__1,*k)) { cgemlqt_(side, trans, m, n, k, mb, &a[a_offset], lda, &t[t_offset], ldt, &c__[c_offset], ldc, &work[1], info); - return 0; + return; } if (left && tran) { @@ -960,7 +960,7 @@ static integer c__0 = 0; } work[1].r = (real) lw, work[1].i = 0.f; - return 0; + return; /* End of CLAMSWLQ */ diff --git a/lapack-netlib/SRC/clamtsqr.c b/lapack-netlib/SRC/clamtsqr.c index 475199719f..553013785b 100644 --- a/lapack-netlib/SRC/clamtsqr.c +++ b/lapack-netlib/SRC/clamtsqr.c @@ -706,7 +706,7 @@ static integer c__0 = 0; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clamtsqr_(char *side, char *trans, integer *m, integer * +/* Subroutine */ void clamtsqr_(char *side, char *trans, integer *m, integer * n, integer *k, integer *mb, integer *nb, complex *a, integer *lda, complex *t, integer *ldt, complex *c__, integer *ldc, complex *work, integer *lwork, integer *info) @@ -724,7 +724,7 @@ static integer c__0 = 0; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical notran, lquery; integer ctr; - extern /* Subroutine */ int cgemqrt_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemqrt_(char *, char *, integer *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *), ctpmqrt_(char *, char *, integer *, integer *, 
integer *, integer @@ -797,9 +797,9 @@ static integer c__0 = 0; if (*info != 0) { i__1 = -(*info); xerbla_("CLAMTSQR", &i__1, (ftnlen)8); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -807,7 +807,7 @@ static integer c__0 = 0; /* Computing MIN */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*k) == 0) { - return 0; + return; } /* Computing MAX */ @@ -815,7 +815,7 @@ static integer c__0 = 0; if (*mb <= *k || *mb >= f2cmax(i__1,*k)) { cgemqrt_(side, trans, m, n, k, nb, &a[a_offset], lda, &t[t_offset], ldt, &c__[c_offset], ldc, &work[1], info); - return 0; + return; } if (left && notran) { @@ -957,7 +957,7 @@ static integer c__0 = 0; } work[1].r = (real) lw, work[1].i = 0.f; - return 0; + return; /* End of CLAMTSQR */ diff --git a/lapack-netlib/SRC/clangb.c b/lapack-netlib/SRC/clangb.c index 1bf23d9cdb..7276060032 100644 --- a/lapack-netlib/SRC/clangb.c +++ b/lapack-netlib/SRC/clangb.c @@ -647,11 +647,11 @@ real clangb_(char *norm, integer *n, integer *kl, integer *ku, complex *ab, /* Local variables */ real temp; - extern /* Subroutine */ int scombssq_(real *, real *); + extern /* Subroutine */ void scombssq_(real *, real *); integer i__, j, k, l; extern logical lsame_(char *, char *); real value; - extern /* Subroutine */ int classq_(integer *, complex *, integer *, real + extern /* Subroutine */ void classq_(integer *, complex *, integer *, real *, real *); extern logical sisnan_(real *); real colssq[2], sum, ssq[2]; diff --git a/lapack-netlib/SRC/clange.c b/lapack-netlib/SRC/clange.c index 419f450c1d..7e4e073894 100644 --- a/lapack-netlib/SRC/clange.c +++ b/lapack-netlib/SRC/clange.c @@ -638,11 +638,11 @@ real clange_(char *norm, integer *m, integer *n, complex *a, integer *lda, /* Local variables */ real temp; - extern /* Subroutine */ int scombssq_(real *, real *); + extern /* Subroutine */ void scombssq_(real *, real *); integer i__, j; extern logical lsame_(char *, char *); real value; - extern /* Subroutine */ int 
classq_(integer *, complex *, integer *, real + extern /* Subroutine */ void classq_(integer *, complex *, integer *, real *, real *); extern logical sisnan_(real *); real colssq[2], sum, ssq[2]; diff --git a/lapack-netlib/SRC/clangt.c b/lapack-netlib/SRC/clangt.c index 6d841cc0e5..d9d55a9cad 100644 --- a/lapack-netlib/SRC/clangt.c +++ b/lapack-netlib/SRC/clangt.c @@ -632,7 +632,7 @@ real clangt_(char *norm, integer *n, complex *dl, complex *d__, complex *du) real scale; extern logical lsame_(char *, char *); real anorm; - extern /* Subroutine */ int classq_(integer *, complex *, integer *, real + extern /* Subroutine */ void classq_(integer *, complex *, integer *, real *, real *); extern logical sisnan_(real *); real sum; diff --git a/lapack-netlib/SRC/clanhb.c b/lapack-netlib/SRC/clanhb.c index d58a0eca57..497c9ca2d7 100644 --- a/lapack-netlib/SRC/clanhb.c +++ b/lapack-netlib/SRC/clanhb.c @@ -654,11 +654,11 @@ real clanhb_(char *norm, char *uplo, integer *n, integer *k, complex *ab, /* Local variables */ real absa; - extern /* Subroutine */ int scombssq_(real *, real *); + extern /* Subroutine */ void scombssq_(real *, real *); integer i__, j, l; extern logical lsame_(char *, char *); real value; - extern /* Subroutine */ int classq_(integer *, complex *, integer *, real + extern /* Subroutine */ void classq_(integer *, complex *, integer *, real *, real *); extern logical sisnan_(real *); real colssq[2], sum, ssq[2]; diff --git a/lapack-netlib/SRC/clanhe.c b/lapack-netlib/SRC/clanhe.c index 362f4a220f..6b9f8e8266 100644 --- a/lapack-netlib/SRC/clanhe.c +++ b/lapack-netlib/SRC/clanhe.c @@ -647,11 +647,11 @@ real clanhe_(char *norm, char *uplo, integer *n, complex *a, integer *lda, /* Local variables */ real absa; - extern /* Subroutine */ int scombssq_(real *, real *); + extern /* Subroutine */ void scombssq_(real *, real *); integer i__, j; extern logical lsame_(char *, char *); real value; - extern /* Subroutine */ int classq_(integer *, complex *, integer *, 
real + extern /* Subroutine */ void classq_(integer *, complex *, integer *, real *, real *); extern logical sisnan_(real *); real colssq[2], sum, ssq[2]; diff --git a/lapack-netlib/SRC/clanhf.c b/lapack-netlib/SRC/clanhf.c index 14f4f73470..4755084e5f 100644 --- a/lapack-netlib/SRC/clanhf.c +++ b/lapack-netlib/SRC/clanhf.c @@ -775,7 +775,7 @@ real clanhf_(char *norm, char *transr, char *uplo, integer *n, complex *a, real value; integer n1; real aa; - extern /* Subroutine */ int classq_(integer *, complex *, integer *, real + extern /* Subroutine */ void classq_(integer *, complex *, integer *, real *, real *); extern logical sisnan_(real *); integer lda, ifm, noe, ilu; diff --git a/lapack-netlib/SRC/clanhp.c b/lapack-netlib/SRC/clanhp.c index b6da41055b..7b21fdd384 100644 --- a/lapack-netlib/SRC/clanhp.c +++ b/lapack-netlib/SRC/clanhp.c @@ -639,11 +639,11 @@ real clanhp_(char *norm, char *uplo, integer *n, complex *ap, real *work) /* Local variables */ real absa; - extern /* Subroutine */ int scombssq_(real *, real *); + extern /* Subroutine */ void scombssq_(real *, real *); integer i__, j, k; extern logical lsame_(char *, char *); real value; - extern /* Subroutine */ int classq_(integer *, complex *, integer *, real + extern /* Subroutine */ void classq_(integer *, complex *, integer *, real *, real *); extern logical sisnan_(real *); real colssq[2], sum, ssq[2]; diff --git a/lapack-netlib/SRC/clanhs.c b/lapack-netlib/SRC/clanhs.c index 8b5075f815..3ba45e7a0a 100644 --- a/lapack-netlib/SRC/clanhs.c +++ b/lapack-netlib/SRC/clanhs.c @@ -630,11 +630,11 @@ real clanhs_(char *norm, integer *n, complex *a, integer *lda, real *work) real ret_val; /* Local variables */ - extern /* Subroutine */ int scombssq_(real *, real *); + extern /* Subroutine */ void scombssq_(real *, real *); integer i__, j; extern logical lsame_(char *, char *); real value; - extern /* Subroutine */ int classq_(integer *, complex *, integer *, real + extern /* Subroutine */ void classq_(integer 
*, complex *, integer *, real *, real *); extern logical sisnan_(real *); real colssq[2], sum, ssq[2]; diff --git a/lapack-netlib/SRC/clanht.c b/lapack-netlib/SRC/clanht.c index 8432e24407..8383b15a91 100644 --- a/lapack-netlib/SRC/clanht.c +++ b/lapack-netlib/SRC/clanht.c @@ -625,10 +625,10 @@ real clanht_(char *norm, integer *n, real *d__, complex *e) real scale; extern logical lsame_(char *, char *); real anorm; - extern /* Subroutine */ int classq_(integer *, complex *, integer *, real + extern /* Subroutine */ void classq_(integer *, complex *, integer *, real *, real *); extern logical sisnan_(real *); - extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slassq_(integer *, real *, integer *, real *, real *); real sum; diff --git a/lapack-netlib/SRC/clansb.c b/lapack-netlib/SRC/clansb.c index 0f5eba4a87..098afaea8f 100644 --- a/lapack-netlib/SRC/clansb.c +++ b/lapack-netlib/SRC/clansb.c @@ -652,11 +652,11 @@ real clansb_(char *norm, char *uplo, integer *n, integer *k, complex *ab, /* Local variables */ real absa; - extern /* Subroutine */ int scombssq_(real *, real *); + extern /* Subroutine */ void scombssq_(real *, real *); integer i__, j, l; extern logical lsame_(char *, char *); real value; - extern /* Subroutine */ int classq_(integer *, complex *, integer *, real + extern /* Subroutine */ void classq_(integer *, complex *, integer *, real *, real *); extern logical sisnan_(real *); real colssq[2], sum, ssq[2]; diff --git a/lapack-netlib/SRC/clansp.c b/lapack-netlib/SRC/clansp.c index ca9d24d60e..120443574e 100644 --- a/lapack-netlib/SRC/clansp.c +++ b/lapack-netlib/SRC/clansp.c @@ -637,11 +637,11 @@ real clansp_(char *norm, char *uplo, integer *n, complex *ap, real *work) /* Local variables */ real absa; - extern /* Subroutine */ int scombssq_(real *, real *); + extern /* Subroutine */ void scombssq_(real *, real *); integer i__, j, k; extern logical lsame_(char *, char *); real value; - extern /* 
Subroutine */ int classq_(integer *, complex *, integer *, real + extern /* Subroutine */ void classq_(integer *, complex *, integer *, real *, real *); extern logical sisnan_(real *); real colssq[2], sum, ssq[2]; diff --git a/lapack-netlib/SRC/clansy.c b/lapack-netlib/SRC/clansy.c index 1467a8230e..5855b365a5 100644 --- a/lapack-netlib/SRC/clansy.c +++ b/lapack-netlib/SRC/clansy.c @@ -646,11 +646,11 @@ real clansy_(char *norm, char *uplo, integer *n, complex *a, integer *lda, /* Local variables */ real absa; - extern /* Subroutine */ int scombssq_(real *, real *); + extern /* Subroutine */ void scombssq_(real *, real *); integer i__, j; extern logical lsame_(char *, char *); real value; - extern /* Subroutine */ int classq_(integer *, complex *, integer *, real + extern /* Subroutine */ void classq_(integer *, complex *, integer *, real *, real *); extern logical sisnan_(real *); real colssq[2], sum, ssq[2]; diff --git a/lapack-netlib/SRC/clantb.c b/lapack-netlib/SRC/clantb.c index 4078823c2c..f13532cedc 100644 --- a/lapack-netlib/SRC/clantb.c +++ b/lapack-netlib/SRC/clantb.c @@ -662,12 +662,12 @@ real clantb_(char *norm, char *uplo, char *diag, integer *n, integer *k, real ret_val; /* Local variables */ - extern /* Subroutine */ int scombssq_(real *, real *); + extern /* Subroutine */ void scombssq_(real *, real *); integer i__, j, l; logical udiag; extern logical lsame_(char *, char *); real value; - extern /* Subroutine */ int classq_(integer *, complex *, integer *, real + extern /* Subroutine */ void classq_(integer *, complex *, integer *, real *, real *); extern logical sisnan_(real *); real colssq[2], sum, ssq[2]; diff --git a/lapack-netlib/SRC/clantp.c b/lapack-netlib/SRC/clantp.c index f1c43d22d5..29cde18685 100644 --- a/lapack-netlib/SRC/clantp.c +++ b/lapack-netlib/SRC/clantp.c @@ -647,12 +647,12 @@ real clantp_(char *norm, char *uplo, char *diag, integer *n, complex *ap, real ret_val; /* Local variables */ - extern /* Subroutine */ int scombssq_(real 
*, real *); + extern /* Subroutine */ void scombssq_(real *, real *); integer i__, j, k; logical udiag; extern logical lsame_(char *, char *); real value; - extern /* Subroutine */ int classq_(integer *, complex *, integer *, real + extern /* Subroutine */ void classq_(integer *, complex *, integer *, real *, real *); extern logical sisnan_(real *); real colssq[2], sum, ssq[2]; diff --git a/lapack-netlib/SRC/clantr.c b/lapack-netlib/SRC/clantr.c index 3f63cfa96c..1a7132a70b 100644 --- a/lapack-netlib/SRC/clantr.c +++ b/lapack-netlib/SRC/clantr.c @@ -663,12 +663,12 @@ real clantr_(char *norm, char *uplo, char *diag, integer *m, integer *n, real ret_val; /* Local variables */ - extern /* Subroutine */ int scombssq_(real *, real *); + extern /* Subroutine */ void scombssq_(real *, real *); integer i__, j; logical udiag; extern logical lsame_(char *, char *); real value; - extern /* Subroutine */ int classq_(integer *, complex *, integer *, real + extern /* Subroutine */ void classq_(integer *, complex *, integer *, real *, real *); extern logical sisnan_(real *); real colssq[2], sum, ssq[2]; diff --git a/lapack-netlib/SRC/clapll.c b/lapack-netlib/SRC/clapll.c index e2b82f37aa..3263178200 100644 --- a/lapack-netlib/SRC/clapll.c +++ b/lapack-netlib/SRC/clapll.c @@ -609,7 +609,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int clapll_(integer *n, complex *x, integer *incx, complex * +/* Subroutine */ void clapll_(integer *n, complex *x, integer *incx, complex * y, integer *incy, real *ssmin) { /* System generated locals */ @@ -618,16 +618,16 @@ f"> */ complex q__1, q__2, q__3, q__4; /* Local variables */ - extern /* Subroutine */ int slas2_(real *, real *, real *, real *, real *) + extern /* Subroutine */ void slas2_(real *, real *, real *, real *, real *) ; complex c__; extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer *, complex *, integer *); - 
extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, + extern /* Subroutine */ void caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); real ssmax; complex a11, a12, a22; - extern /* Subroutine */ int clarfg_(integer *, complex *, complex *, + extern /* Subroutine */ void clarfg_(integer *, complex *, complex *, integer *, complex *); complex tau; @@ -650,7 +650,7 @@ f"> */ /* Function Body */ if (*n <= 1) { *ssmin = 0.f; - return 0; + return; } /* Compute the QR factorization of the N-by-2 matrix ( X Y ) */ @@ -681,7 +681,7 @@ f"> */ r__3 = c_abs(&a22); slas2_(&r__1, &r__2, &r__3, ssmin, &ssmax); - return 0; + return; /* End of CLAPLL */ diff --git a/lapack-netlib/SRC/clapmr.c b/lapack-netlib/SRC/clapmr.c index 614b66a5a1..f61d1923b1 100644 --- a/lapack-netlib/SRC/clapmr.c +++ b/lapack-netlib/SRC/clapmr.c @@ -613,7 +613,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int clapmr_(logical *forwrd, integer *m, integer *n, complex +/* Subroutine */ void clapmr_(logical *forwrd, integer *m, integer *n, complex *x, integer *ldx, integer *k) { /* System generated locals */ @@ -641,7 +641,7 @@ f"> */ /* Function Body */ if (*m <= 1) { - return 0; + return; } i__1 = *m; @@ -735,7 +735,7 @@ f"> */ } - return 0; + return; /* End of ZLAPMT */ diff --git a/lapack-netlib/SRC/clapmt.c b/lapack-netlib/SRC/clapmt.c index 8c76420135..6f3f220266 100644 --- a/lapack-netlib/SRC/clapmt.c +++ b/lapack-netlib/SRC/clapmt.c @@ -613,7 +613,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int clapmt_(logical *forwrd, integer *m, integer *n, complex +/* Subroutine */ void clapmt_(logical *forwrd, integer *m, integer *n, complex *x, integer *ldx, integer *k) { /* System generated locals */ @@ -641,7 +641,7 @@ f"> */ /* Function Body */ if (*n <= 1) { - return 0; + 
return; } i__1 = *n; @@ -734,7 +734,7 @@ f"> */ } - return 0; + return; /* End of CLAPMT */ diff --git a/lapack-netlib/SRC/claqgb.c b/lapack-netlib/SRC/claqgb.c index 118d7f5fbc..1a00478bd8 100644 --- a/lapack-netlib/SRC/claqgb.c +++ b/lapack-netlib/SRC/claqgb.c @@ -669,7 +669,7 @@ f"> */ /* > \ingroup complexGBauxiliary */ /* ===================================================================== */ -/* Subroutine */ int claqgb_(integer *m, integer *n, integer *kl, integer *ku, +/* Subroutine */ void claqgb_(integer *m, integer *n, integer *kl, integer *ku, complex *ab, integer *ldab, real *r__, real *c__, real *rowcnd, real *colcnd, real *amax, char *equed) { @@ -705,7 +705,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. */ @@ -793,7 +793,7 @@ f"> */ *(unsigned char *)equed = 'B'; } - return 0; + return; /* End of CLAQGB */ diff --git a/lapack-netlib/SRC/claqge.c b/lapack-netlib/SRC/claqge.c index 5879796ff4..c1b2bd196c 100644 --- a/lapack-netlib/SRC/claqge.c +++ b/lapack-netlib/SRC/claqge.c @@ -652,7 +652,7 @@ f"> */ /* > \ingroup complexGEauxiliary */ /* ===================================================================== */ -/* Subroutine */ int claqge_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void claqge_(integer *m, integer *n, complex *a, integer *lda, real *r__, real *c__, real *rowcnd, real *colcnd, real *amax, char * equed) { @@ -688,7 +688,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. 
*/ @@ -764,7 +764,7 @@ f"> */ *(unsigned char *)equed = 'B'; } - return 0; + return; /* End of CLAQGE */ diff --git a/lapack-netlib/SRC/claqhb.c b/lapack-netlib/SRC/claqhb.c index 777f2e7bd8..53f04e3a56 100644 --- a/lapack-netlib/SRC/claqhb.c +++ b/lapack-netlib/SRC/claqhb.c @@ -650,7 +650,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int claqhb_(char *uplo, integer *n, integer *kd, complex *ab, +/* Subroutine */ void claqhb_(char *uplo, integer *n, integer *kd, complex *ab, integer *ldab, real *s, real *scond, real *amax, char *equed) { /* System generated locals */ @@ -686,7 +686,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. */ @@ -755,7 +755,7 @@ f"> */ *(unsigned char *)equed = 'Y'; } - return 0; + return; /* End of CLAQHB */ diff --git a/lapack-netlib/SRC/claqhe.c b/lapack-netlib/SRC/claqhe.c index 855bdbf6e5..050de301b5 100644 --- a/lapack-netlib/SRC/claqhe.c +++ b/lapack-netlib/SRC/claqhe.c @@ -643,7 +643,7 @@ f"> */ /* > \ingroup complexHEauxiliary */ /* ===================================================================== */ -/* Subroutine */ int claqhe_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void claqhe_(char *uplo, integer *n, complex *a, integer *lda, real *s, real *scond, real *amax, char *equed) { /* System generated locals */ @@ -679,7 +679,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. 
*/ @@ -744,7 +744,7 @@ f"> */ *(unsigned char *)equed = 'Y'; } - return 0; + return; /* End of CLAQHE */ diff --git a/lapack-netlib/SRC/claqhp.c b/lapack-netlib/SRC/claqhp.c index 6ba4f0a34c..2597f0cbb7 100644 --- a/lapack-netlib/SRC/claqhp.c +++ b/lapack-netlib/SRC/claqhp.c @@ -635,7 +635,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int claqhp_(char *uplo, integer *n, complex *ap, real *s, +/* Subroutine */ void claqhp_(char *uplo, integer *n, complex *ap, real *s, real *scond, real *amax, char *equed) { /* System generated locals */ @@ -671,7 +671,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. */ @@ -740,7 +740,7 @@ f"> */ *(unsigned char *)equed = 'Y'; } - return 0; + return; /* End of CLAQHP */ diff --git a/lapack-netlib/SRC/claqp2.c b/lapack-netlib/SRC/claqp2.c index 728fce7a32..58e4008d28 100644 --- a/lapack-netlib/SRC/claqp2.c +++ b/lapack-netlib/SRC/claqp2.c @@ -661,7 +661,7 @@ f"> */ /* > \endhtmlonly */ /* ===================================================================== */ -/* Subroutine */ int claqp2_(integer *m, integer *n, integer *offset, complex +/* Subroutine */ void claqp2_(integer *m, integer *n, integer *offset, complex *a, integer *lda, integer *jpvt, complex *tau, real *vn1, real *vn2, complex *work) { @@ -674,15 +674,15 @@ f"> */ real temp, temp2; integer i__, j; real tol3z; - extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex * + extern /* Subroutine */ void clarf_(char *, integer *, integer *, complex * , integer *, complex *, complex *, integer *, complex *); integer offpi; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer itemp; extern real scnrm2_(integer *, complex *, integer *); integer mn; - extern /* 
Subroutine */ int clarfg_(integer *, complex *, complex *, + extern /* Subroutine */ void clarfg_(integer *, complex *, complex *, integer *, complex *); extern real slamch_(char *); extern integer isamax_(integer *, real *, integer *); @@ -801,7 +801,7 @@ f"> */ /* L20: */ } - return 0; + return; /* End of CLAQP2 */ diff --git a/lapack-netlib/SRC/claqps.c b/lapack-netlib/SRC/claqps.c index fcb48b9b77..125823b24b 100644 --- a/lapack-netlib/SRC/claqps.c +++ b/lapack-netlib/SRC/claqps.c @@ -693,7 +693,7 @@ f"> */ /* > \endhtmlonly */ /* ===================================================================== */ -/* Subroutine */ int claqps_(integer *m, integer *n, integer *offset, integer +/* Subroutine */ void claqps_(integer *m, integer *n, integer *offset, integer *nb, integer *kb, complex *a, integer *lda, integer *jpvt, complex * tau, real *vn1, real *vn2, complex *auxv, complex *f, integer *ldf) { @@ -706,7 +706,7 @@ f"> */ real temp, temp2; integer j, k; real tol3z; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *), cgemv_(char *, integer *, integer *, complex *, complex *, integer *, complex *, @@ -715,7 +715,7 @@ f"> */ integer itemp; extern real scnrm2_(integer *, complex *, integer *); integer rk; - extern /* Subroutine */ int clarfg_(integer *, complex *, complex *, + extern /* Subroutine */ void clarfg_(integer *, complex *, complex *, integer *, complex *); extern real slamch_(char *); integer lsticc; @@ -937,7 +937,7 @@ f"> */ goto L60; } - return 0; + return; /* End of CLAQPS */ diff --git a/lapack-netlib/SRC/claqr0.c b/lapack-netlib/SRC/claqr0.c index 68e51f4446..78c0ca8504 100644 --- a/lapack-netlib/SRC/claqr0.c +++ b/lapack-netlib/SRC/claqr0.c @@ -761,7 +761,7 @@ f"> */ /* > of Matrix Analysis, volume 23, pages 948--973, 2002. 
*/ /* > */ /* ===================================================================== */ -/* Subroutine */ int claqr0_(logical *wantt, logical *wantz, integer *n, +/* Subroutine */ void claqr0_(logical *wantt, logical *wantz, integer *n, integer *ilo, integer *ihi, complex *h__, integer *ldh, complex *w, integer *iloz, integer *ihiz, complex *z__, integer *ldz, complex * work, integer *lwork, integer *info) @@ -779,7 +779,7 @@ f"> */ integer kacc22, i__, k; real s; integer itmax, nsmax, nwmax, kwtop; - extern /* Subroutine */ int claqr3_(logical *, logical *, integer *, + extern /* Subroutine */ void claqr3_(logical *, logical *, integer *, integer *, integer *, integer *, complex *, integer *, integer *, integer *, complex *, integer *, integer *, integer *, complex *, complex *, integer *, integer *, complex *, integer *, integer *, @@ -793,7 +793,7 @@ f"> */ integer *, complex *, integer *); complex aa, bb, cc, dd; integer ld, nh, nibble, it, ks, kt, ku, kv, ls, ns, nw; - extern /* Subroutine */ int clahqr_(logical *, logical *, integer *, + extern /* Subroutine */ void clahqr_(logical *, logical *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, integer *, complex *, integer *, integer *), clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *); @@ -847,7 +847,7 @@ f"> */ if (*n == 0) { work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } if (*n <= 15) { @@ -931,7 +931,7 @@ f"> */ r__1 = (real) lwkopt; q__1.r = r__1, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; } /* ==== CLAHQR/CLAQR0 crossover point ==== */ @@ -1360,6 +1360,6 @@ f"> */ /* ==== End of CLAQR0 ==== */ - return 0; + return; } /* claqr0_ */ diff --git a/lapack-netlib/SRC/claqr1.c b/lapack-netlib/SRC/claqr1.c index 201499eece..9a4d8b1e2b 100644 --- a/lapack-netlib/SRC/claqr1.c +++ b/lapack-netlib/SRC/claqr1.c @@ -617,7 +617,7 @@ f"> */ /* > University of Kansas, USA */ /* > */ /* 
===================================================================== */ -/* Subroutine */ int claqr1_(integer *n, complex *h__, integer *ldh, complex * +/* Subroutine */ void claqr1_(integer *n, complex *h__, integer *ldh, complex * s1, complex *s2, complex *v) { /* System generated locals */ @@ -649,7 +649,7 @@ f"> */ /* Function Body */ if (*n != 2 && *n != 3) { - return 0; + return; } if (*n == 2) { @@ -754,6 +754,6 @@ f"> */ v[3].r = q__1.r, v[3].i = q__1.i; } } - return 0; + return; } /* claqr1_ */ diff --git a/lapack-netlib/SRC/claqr2.c b/lapack-netlib/SRC/claqr2.c index b5941df622..cf50f81c28 100644 --- a/lapack-netlib/SRC/claqr2.c +++ b/lapack-netlib/SRC/claqr2.c @@ -785,7 +785,7 @@ f"> */ /* > University of Kansas, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int claqr2_(logical *wantt, logical *wantz, integer *n, +/* Subroutine */ void claqr2_(logical *wantt, logical *wantz, integer *n, integer *ktop, integer *kbot, integer *nw, complex *h__, integer *ldh, integer *iloz, integer *ihiz, complex *z__, integer *ldz, integer * ns, integer *nd, complex *sh, complex *v, integer *ldv, integer *nh, @@ -802,26 +802,26 @@ f"> */ complex beta; integer kcol, info, ifst, ilst, ltop, krow, i__, j; complex s; - extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex * + extern /* Subroutine */ void clarf_(char *, integer *, integer *, complex * , integer *, complex *, complex *, integer *, complex *), cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *), ccopy_(integer *, complex *, integer *, complex *, integer *); integer infqr, kwtop; - extern /* Subroutine */ int slabad_(real *, real *), cgehrd_(integer *, + extern /* Subroutine */ void slabad_(real *, real *), cgehrd_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), clarfg_(integer *, complex *, 
complex *, integer *, complex *); integer jw; extern real slamch_(char *); - extern /* Subroutine */ int clahqr_(logical *, logical *, integer *, + extern /* Subroutine */ void clahqr_(logical *, logical *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, integer *, complex *, integer *, integer *), clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, integer *, integer *, complex *, complex *, complex *, integer *); real safmin, safmax; - extern /* Subroutine */ int ctrexc_(char *, integer *, complex *, integer + extern /* Subroutine */ void ctrexc_(char *, integer *, complex *, integer *, complex *, integer *, integer *, integer *, integer *), cunmhr_(char *, char *, integer *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex @@ -899,7 +899,7 @@ f"> */ r__1 = (real) lwkopt; q__1.r = r__1, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; } /* ==== Nothing to do ... */ @@ -908,11 +908,11 @@ f"> */ *nd = 0; work[1].r = 1.f, work[1].i = 0.f; if (*ktop > *kbot) { - return 0; + return; } /* ... nor for an empty deflation window. ==== */ if (*nw < 1) { - return 0; + return; } /* ==== Machine constants ==== */ @@ -959,7 +959,7 @@ f"> */ } } work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } /* ==== Convert to spike-triangular form. 
(In case of a */ @@ -1205,6 +1205,6 @@ f"> */ /* ==== End of CLAQR2 ==== */ - return 0; + return; } /* claqr2_ */ diff --git a/lapack-netlib/SRC/claqr3.c b/lapack-netlib/SRC/claqr3.c index 474ea19c67..d72a00622c 100644 --- a/lapack-netlib/SRC/claqr3.c +++ b/lapack-netlib/SRC/claqr3.c @@ -783,7 +783,7 @@ f"> */ /* > University of Kansas, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int claqr3_(logical *wantt, logical *wantz, integer *n, +/* Subroutine */ void claqr3_(logical *wantt, logical *wantz, integer *n, integer *ktop, integer *kbot, integer *nw, complex *h__, integer *ldh, integer *iloz, integer *ihiz, complex *z__, integer *ldz, integer * ns, integer *nd, complex *sh, complex *v, integer *ldv, integer *nh, @@ -800,14 +800,14 @@ f"> */ complex beta; integer kcol, info, nmin, ifst, ilst, ltop, krow, i__, j; complex s; - extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex * + extern /* Subroutine */ void clarf_(char *, integer *, integer *, complex * , integer *, complex *, complex *, integer *, complex *), cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *), ccopy_(integer *, complex *, integer *, complex *, integer *); integer infqr, kwtop; - extern /* Subroutine */ int claqr4_(logical *, logical *, integer *, + extern /* Subroutine */ void claqr4_(logical *, logical *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, integer *, complex *, integer *, complex *, integer *, integer *), slabad_(real *, real *), cgehrd_(integer *, integer *, integer *, @@ -815,7 +815,7 @@ f"> */ , clarfg_(integer *, complex *, complex *, integer *, complex *); integer jw; extern real slamch_(char *); - extern /* Subroutine */ int clahqr_(logical *, logical *, integer *, + extern /* Subroutine */ void clahqr_(logical *, logical *, integer *, integer *, integer *, complex *, 
integer *, complex *, integer *, integer *, complex *, integer *, integer *), clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, integer *, integer *, complex *, complex @@ -823,7 +823,7 @@ f"> */ real safmin, safmax; extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int ctrexc_(char *, integer *, complex *, integer + extern /* Subroutine */ void ctrexc_(char *, integer *, complex *, integer *, complex *, integer *, integer *, integer *, integer *), cunmhr_(char *, char *, integer *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex @@ -909,7 +909,7 @@ f"> */ r__1 = (real) lwkopt; q__1.r = r__1, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; } /* ==== Nothing to do ... */ @@ -918,11 +918,11 @@ f"> */ *nd = 0; work[1].r = 1.f, work[1].i = 0.f; if (*ktop > *kbot) { - return 0; + return; } /* ... nor for an empty deflation window. ==== */ if (*nw < 1) { - return 0; + return; } /* ==== Machine constants ==== */ @@ -969,7 +969,7 @@ f"> */ } } work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } /* ==== Convert to spike-triangular form. (In case of a */ @@ -1223,6 +1223,6 @@ f"> */ /* ==== End of CLAQR3 ==== */ - return 0; + return; } /* claqr3_ */ diff --git a/lapack-netlib/SRC/claqr4.c b/lapack-netlib/SRC/claqr4.c index 191cfcb7be..0abf2416ed 100644 --- a/lapack-netlib/SRC/claqr4.c +++ b/lapack-netlib/SRC/claqr4.c @@ -769,7 +769,7 @@ f"> */ /* > of Matrix Analysis, volume 23, pages 948--973, 2002. 
*/ /* > */ /* ===================================================================== */ -/* Subroutine */ int claqr4_(logical *wantt, logical *wantz, integer *n, +/* Subroutine */ void claqr4_(logical *wantt, logical *wantz, integer *n, integer *ilo, integer *ihi, complex *h__, integer *ldh, complex *w, integer *iloz, integer *ihiz, complex *z__, integer *ldz, complex * work, integer *lwork, integer *info) @@ -787,7 +787,7 @@ f"> */ integer kacc22, i__, k; real s; integer itmax, nsmax, nwmax, kwtop; - extern /* Subroutine */ int claqr2_(logical *, logical *, integer *, + extern /* Subroutine */ void claqr2_(logical *, logical *, integer *, integer *, integer *, integer *, complex *, integer *, integer *, integer *, complex *, integer *, integer *, integer *, complex *, complex *, integer *, integer *, complex *, integer *, integer *, @@ -798,7 +798,7 @@ f"> */ complex *, integer *, integer *, complex *, integer *); complex aa, bb, cc, dd; integer ld, nh, nibble, it, ks, kt, ku, kv, ls, ns, nw; - extern /* Subroutine */ int clahqr_(logical *, logical *, integer *, + extern /* Subroutine */ void clahqr_(logical *, logical *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, integer *, complex *, integer *, integer *), clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *); @@ -854,7 +854,7 @@ f"> */ if (*n == 0) { work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } if (*n <= 15) { @@ -938,7 +938,7 @@ f"> */ r__1 = (real) lwkopt; q__1.r = r__1, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; } /* ==== CLAHQR/CLAQR0 crossover point ==== */ @@ -1361,6 +1361,6 @@ f"> */ /* ==== End of CLAQR4 ==== */ - return 0; + return; } /* claqr4_ */ diff --git a/lapack-netlib/SRC/claqr5.c b/lapack-netlib/SRC/claqr5.c index b60f2c2480..f608c6498e 100644 --- a/lapack-netlib/SRC/claqr5.c +++ b/lapack-netlib/SRC/claqr5.c @@ -773,7 +773,7 @@ f"> */ /* > ACM Trans. Math. Softw. 
40, 2, Article 12 (February 2014). */ /* > */ /* ===================================================================== */ -/* Subroutine */ int claqr5_(logical *wantt, logical *wantz, integer *kacc22, +/* Subroutine */ void claqr5_(logical *wantt, logical *wantz, integer *kacc22, integer *n, integer *ktop, integer *kbot, integer *nshfts, complex *s, complex *h__, integer *ldh, integer *iloz, integer *ihiz, complex * z__, integer *ldz, complex *v, integer *ldv, complex *u, integer *ldu, @@ -793,20 +793,20 @@ f"> */ integer jcol, jlen, jbot, mbot, jtop, jrow, mtop, j, k, m; complex alpha; logical accum; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); integer ndcol, incol, krcol, nbmps, i2, k1, i4; - extern /* Subroutine */ int claqr1_(integer *, complex *, integer *, + extern /* Subroutine */ void claqr1_(integer *, complex *, integer *, complex *, complex *, complex *); real h11, h12, h21, h22; integer m22; - extern /* Subroutine */ int slabad_(real *, real *), clarfg_(integer *, + extern /* Subroutine */ void slabad_(real *, real *), clarfg_(integer *, complex *, complex *, integer *, complex *); integer ns, nu; extern real slamch_(char *); complex vt[3]; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, integer *, integer *, complex *, complex *, complex *, integer *); real safmin, safmax; @@ -851,14 +851,14 @@ f"> */ /* Function Body */ if (*nshfts < 2) { - return 0; + return; } /* ==== If the active block is empty or 1-by-1, then there */ /* . is nothing to do. 
==== */ if (*ktop >= *kbot) { - return 0; + return; } /* ==== NSHFTS is supposed to be even, but if it is odd, */ @@ -1899,6 +1899,6 @@ f"> */ /* ==== End of CLAQR5 ==== */ - return 0; + return; } /* claqr5_ */ diff --git a/lapack-netlib/SRC/claqr5.f b/lapack-netlib/SRC/claqr5.f index 95cc33b9d9..4e6f43a73d 100644 --- a/lapack-netlib/SRC/claqr5.f +++ b/lapack-netlib/SRC/claqr5.f @@ -279,7 +279,7 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, PARAMETER ( RZERO = 0.0e0, RONE = 1.0e0 ) * .. * .. Local Scalars .. - COMPLEX ALPHA, BETA, CDUM, REFSUM + COMPLEX ALPHA, BETA, CDUM, REFSUM, T1, T2, T3 REAL H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, @@ -424,12 +424,12 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, * ==== Perform update from right within * . computational window. ==== * + T1 = V( 1, M22 ) + T2 = T1*CONJG( V( 2, M22 ) ) DO 30 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) + REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 30 CONTINUE * * ==== Perform update from left within @@ -442,12 +442,13 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, ELSE JBOT = KBOT END IF + T1 = CONJG( V( 1, M22 ) ) + T2 = T1*V( 2, M22 ) DO 40 J = K+1, JBOT - REFSUM = CONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+CONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + REFSUM = H( K+1, J ) + + $ CONJG( V( 2, M22 ) )*H( K+2, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 40 CONTINUE * * ==== The following convergence test requires that @@ -532,11 +533,13 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, * . 
Mth bulge. Exploit fact that first two elements * . of row are actually zero. ==== * - REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) - H( K+3, K ) = -REFSUM - H( K+3, K+1 ) = -REFSUM*CONJG( V( 2, M ) ) - H( K+3, K+2 ) = H( K+3, K+2 ) - - $ REFSUM*CONJG( V( 3, M ) ) + T1 = V( 1, M ) + T2 = T1*CONJG( V( 2, M ) ) + T3 = T1*CONJG( V( 3, M ) ) + REFSUM = V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM*T1 + H( K+3, K+1 ) = -REFSUM*T2 + H( K+3, K+2 ) = H( K+3, K+2 ) - REFSUM*T3 * * ==== Calculate reflection to move * . Mth bulge one step. ==== @@ -571,12 +574,13 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, $ S( 2*M ), VT ) ALPHA = VT( 1 ) CALL CLARFG( 3, ALPHA, VT( 2 ), 1, VT( 1 ) ) - REFSUM = CONJG( VT( 1 ) )* - $ ( H( K+1, K )+CONJG( VT( 2 ) )* - $ H( K+2, K ) ) + T1 = CONJG( VT( 1 ) ) + T2 = T1*VT( 2 ) + T3 = T1*VT( 3 ) + REFSUM = H( K+1, K )+CONJG( VT( 2 ) )*H( K+2, K ) * - IF( CABS1( H( K+2, K )-REFSUM*VT( 2 ) )+ - $ CABS1( REFSUM*VT( 3 ) ).GT.ULP* + IF( CABS1( H( K+2, K )-REFSUM*T2 )+ + $ CABS1( REFSUM*T3 ).GT.ULP* $ ( CABS1( H( K, K ) )+CABS1( H( K+1, $ K+1 ) )+CABS1( H( K+2, K+2 ) ) ) ) THEN * @@ -594,7 +598,7 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, * . Replace the old reflector with * . the new one. ==== * - H( K+1, K ) = H( K+1, K ) - REFSUM + H( K+1, K ) = H( K+1, K ) - REFSUM*T1 H( K+2, K ) = ZERO H( K+3, K ) = ZERO V( 1, M ) = VT( 1 ) @@ -610,25 +614,28 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, * . deflation check. We still delay most of the * . updates from the left for efficiency. 
==== * + T1 = V( 1, M ) + T2 = T1*CONJG( V( 2, M ) ) + T3 = T1*CONJG( V( 3, M ) ) DO 70 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) + REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 ) + $ + V( 3, M )*H( J, K+3 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 + H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3 70 CONTINUE * * ==== Perform update from left for subsequent * . column. ==== * - REFSUM = CONJG( V( 1, M ) )*( H( K+1, K+1 ) - $ +CONJG( V( 2, M ) )*H( K+2, K+1 ) - $ +CONJG( V( 3, M ) )*H( K+3, K+1 ) ) - H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM - H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) - H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) + T1 = CONJG( V( 1, M ) ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) + REFSUM = H( K+1, K+1 ) + CONJG( V( 2, M ) )*H( K+2, K+1 ) + $ + CONJG( V( 3, M ) )*H( K+3, K+1 ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1 + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2 + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3 * * ==== The following convergence test requires that * . 
the tradition small-compared-to-nearby-diagonals @@ -688,13 +695,15 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, * DO 100 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = CONJG( V( 1, M ) ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT - REFSUM = CONJG( V( 1, M ) )* - $ ( H( K+1, J )+CONJG( V( 2, M ) )* - $ H( K+2, J )+CONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, J ) + CONJG( V( 2, M ) )* + $ H( K+2, J ) + CONJG( V( 3, M ) )*H( K+3, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 + H( K+3, J ) = H( K+3, J ) - REFSUM*T3 90 CONTINUE 100 CONTINUE * @@ -712,14 +721,15 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, I2 = MAX( 1, KTOP-INCOL ) I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + T1 = V( 1, M ) + T2 = T1*CONJG( V( 2, M ) ) + T3 = T1*CONJG( V( 3, M ) ) DO 110 J = I2, I4 - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) + REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 ) + $ + V( 3, M )*U( J, KMS+3 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3 110 CONTINUE 120 CONTINUE ELSE IF( WANTZ ) THEN @@ -730,14 +740,15 @@ SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, * DO 140 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*CONJG( V( 2, M ) ) + T3 = T1*CONJG( V( 3, M ) ) DO 130 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ 
REFSUM*CONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) + REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 ) + $ + V( 3, M )*Z( J, K+3 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3 130 CONTINUE 140 CONTINUE END IF diff --git a/lapack-netlib/SRC/claqsb.c b/lapack-netlib/SRC/claqsb.c index 9170ccccbb..dd7f124039 100644 --- a/lapack-netlib/SRC/claqsb.c +++ b/lapack-netlib/SRC/claqsb.c @@ -650,7 +650,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int claqsb_(char *uplo, integer *n, integer *kd, complex *ab, +/* Subroutine */ void claqsb_(char *uplo, integer *n, integer *kd, complex *ab, integer *ldab, real *s, real *scond, real *amax, char *equed) { /* System generated locals */ @@ -686,7 +686,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. */ @@ -747,7 +747,7 @@ f"> */ *(unsigned char *)equed = 'Y'; } - return 0; + return; /* End of CLAQSB */ diff --git a/lapack-netlib/SRC/claqsp.c b/lapack-netlib/SRC/claqsp.c index d42141a1fc..2380c85636 100644 --- a/lapack-netlib/SRC/claqsp.c +++ b/lapack-netlib/SRC/claqsp.c @@ -636,7 +636,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int claqsp_(char *uplo, integer *n, complex *ap, real *s, +/* Subroutine */ void claqsp_(char *uplo, integer *n, complex *ap, real *s, real *scond, real *amax, char *equed) { /* System generated locals */ @@ -672,7 +672,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. 
*/ @@ -733,7 +733,7 @@ f"> */ *(unsigned char *)equed = 'Y'; } - return 0; + return; /* End of CLAQSP */ diff --git a/lapack-netlib/SRC/claqsy.c b/lapack-netlib/SRC/claqsy.c index e393959cad..6f78b14213 100644 --- a/lapack-netlib/SRC/claqsy.c +++ b/lapack-netlib/SRC/claqsy.c @@ -643,7 +643,7 @@ f"> */ /* > \ingroup complexSYauxiliary */ /* ===================================================================== */ -/* Subroutine */ int claqsy_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void claqsy_(char *uplo, integer *n, complex *a, integer *lda, real *s, real *scond, real *amax, char *equed) { /* System generated locals */ @@ -679,7 +679,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. */ @@ -736,7 +736,7 @@ f"> */ *(unsigned char *)equed = 'Y'; } - return 0; + return; /* End of CLAQSY */ diff --git a/lapack-netlib/SRC/claqz0.f b/lapack-netlib/SRC/claqz0.f index 2284fd65d9..6de40e06ca 100644 --- a/lapack-netlib/SRC/claqz0.f +++ b/lapack-netlib/SRC/claqz0.f @@ -299,7 +299,7 @@ RECURSIVE SUBROUTINE CLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A, PARAMETER( ZERO = 0.0, ONE = 1.0, HALF = 0.5 ) * Local scalars - REAL :: SMLNUM, ULP, SAFMIN, SAFMAX, C1, TEMPR + REAL :: SMLNUM, ULP, SAFMIN, SAFMAX, C1, TEMPR, BNORM, BTOL COMPLEX :: ESHIFT, S1, TEMP INTEGER :: ISTART, ISTOP, IITER, MAXIT, ISTART2, K, LD, NSHIFTS, $ NBLOCK, NW, NMIN, NIBBLE, N_UNDEFLATED, N_DEFLATED, @@ -312,7 +312,7 @@ RECURSIVE SUBROUTINE CLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A, * External Functions EXTERNAL :: XERBLA, CHGEQZ, CLAQZ2, CLAQZ3, CLASET, SLABAD, $ CLARTG, CROT - REAL, EXTERNAL :: SLAMCH + REAL, EXTERNAL :: SLAMCH, CLANHS LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -466,6 +466,9 @@ RECURSIVE SUBROUTINE CLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A, ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N )/ULP ) + BNORM = CLANHS( 'F', IHI-ILO+1, B( ILO, ILO ), LDB, RWORK ) + 
BTOL = MAX( SAFMIN, ULP*BNORM ) + ISTART = ILO ISTOP = IHI MAXIT = 30*( IHI-ILO+1 ) @@ -528,15 +531,8 @@ RECURSIVE SUBROUTINE CLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A, * slow down the method when many infinite eigenvalues are present K = ISTOP DO WHILE ( K.GE.ISTART2 ) - TEMPR = ZERO - IF( K .LT. ISTOP ) THEN - TEMPR = TEMPR+ABS( B( K, K+1 ) ) - END IF - IF( K .GT. ISTART2 ) THEN - TEMPR = TEMPR+ABS( B( K-1, K ) ) - END IF - IF( ABS( B( K, K ) ) .LT. MAX( SMLNUM, ULP*TEMPR ) ) THEN + IF( ABS( B( K, K ) ) .LT. BTOL ) THEN * A diagonal element of B is negligable, move it * to the top and deflate it @@ -648,7 +644,7 @@ RECURSIVE SUBROUTINE CLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A, NS = MIN( NSHIFTS, ISTOP-ISTART2 ) NS = MIN( NS, N_UNDEFLATED ) - SHIFTPOS = ISTOP-N_DEFLATED-N_UNDEFLATED+1 + SHIFTPOS = ISTOP-N_UNDEFLATED+1 IF ( MOD( LD, 6 ) .EQ. 0 ) THEN * diff --git a/lapack-netlib/SRC/clar1v.c b/lapack-netlib/SRC/clar1v.c index e1063dbe8a..c67a19c932 100644 --- a/lapack-netlib/SRC/clar1v.c +++ b/lapack-netlib/SRC/clar1v.c @@ -738,7 +738,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int clar1v_(integer *n, integer *b1, integer *bn, real * +/* Subroutine */ void clar1v_(integer *n, integer *b1, integer *bn, real * lambda, real *d__, real *l, real *ld, real *lld, real *pivmin, real * gaptol, complex *z__, logical *wantnc, integer *negcnt, real *ztz, real *mingma, integer *r__, integer *isuppz, real *nrminv, real * @@ -1090,7 +1090,7 @@ f"> */ *rqcorr = *mingma * tmp; - return 0; + return; /* End of CLAR1V */ diff --git a/lapack-netlib/SRC/clar2v.c b/lapack-netlib/SRC/clar2v.c index 0ac4915ae9..abe5b226b2 100644 --- a/lapack-netlib/SRC/clar2v.c +++ b/lapack-netlib/SRC/clar2v.c @@ -621,7 +621,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int 
clar2v_(integer *n, complex *x, complex *y, complex *z__, +/* Subroutine */ void clar2v_(integer *n, complex *x, complex *y, complex *z__, integer *incx, real *c__, complex *s, integer *incc) { /* System generated locals */ @@ -708,7 +708,7 @@ f"> */ ic += *incc; /* L10: */ } - return 0; + return; /* End of CLAR2V */ diff --git a/lapack-netlib/SRC/clarcm.c b/lapack-netlib/SRC/clarcm.c index 3b727d06c1..ff1af10439 100644 --- a/lapack-netlib/SRC/clarcm.c +++ b/lapack-netlib/SRC/clarcm.c @@ -628,7 +628,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int clarcm_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void clarcm_(integer *m, integer *n, real *a, integer *lda, complex *b, integer *ldb, complex *c__, integer *ldc, real *rwork) { /* System generated locals */ @@ -639,7 +639,7 @@ f"> */ /* Local variables */ integer i__, j, l; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); @@ -669,7 +669,7 @@ f"> */ /* Function Body */ if (*m == 0 || *n == 0) { - return 0; + return; } i__1 = *n; @@ -724,7 +724,7 @@ f"> */ /* L80: */ } - return 0; + return; /* End of CLARCM */ diff --git a/lapack-netlib/SRC/clarf.c b/lapack-netlib/SRC/clarf.c index 68f198c250..5d08bee903 100644 --- a/lapack-netlib/SRC/clarf.c +++ b/lapack-netlib/SRC/clarf.c @@ -643,7 +643,7 @@ static integer c__1 = 1; /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int clarf_(char *side, integer *m, integer *n, complex *v, +/* Subroutine */ void clarf_(char *side, integer *m, integer *n, complex *v, integer *incv, complex *tau, complex *c__, integer *ldc, complex * work) { @@ -653,7 +653,7 @@ static integer c__1 = 1; /* Local 
variables */ integer i__; - extern /* Subroutine */ int cgerc_(integer *, integer *, complex *, + extern /* Subroutine */ void cgerc_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *), cgemv_(char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); @@ -750,7 +750,7 @@ static integer c__1 = 1; c_offset], ldc); } } - return 0; + return; /* End of CLARF */ diff --git a/lapack-netlib/SRC/clarfb.c b/lapack-netlib/SRC/clarfb.c index ebd0d636ce..e62517579c 100644 --- a/lapack-netlib/SRC/clarfb.c +++ b/lapack-netlib/SRC/clarfb.c @@ -710,7 +710,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clarfb_(char *side, char *trans, char *direct, char * +/* Subroutine */ void clarfb_(char *side, char *trans, char *direct, char * storev, integer *m, integer *n, integer *k, complex *v, integer *ldv, complex *t, integer *ldt, complex *c__, integer *ldc, complex *work, integer *ldwork) @@ -722,11 +722,11 @@ f"> */ /* Local variables */ integer i__, j; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), ctrmm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *), clacgv_(integer *, @@ -761,7 +761,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0) { - return 0; + return; } if (lsame_(trans, "N")) { @@ -1382,7 +1382,7 @@ f"> */ } } - return 0; + return; /* End of CLARFB */ diff --git a/lapack-netlib/SRC/clarfb_gett.c 
b/lapack-netlib/SRC/clarfb_gett.c index 37a7aaeb72..7cf3550e77 100644 --- a/lapack-netlib/SRC/clarfb_gett.c +++ b/lapack-netlib/SRC/clarfb_gett.c @@ -903,7 +903,7 @@ gett.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clarfb_gett_(char *ident, integer *m, integer *n, +/* Subroutine */ void clarfb_gett_(char *ident, integer *m, integer *n, integer *k, complex *t, integer *ldt, complex *a, integer *lda, complex *b, integer *ldb, complex *work, integer *ldwork) { @@ -914,11 +914,11 @@ gett.f"> */ /* Local variables */ integer i__, j; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), ctrmm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); @@ -951,7 +951,7 @@ gett.f"> */ /* Function Body */ if (*m < 0 || *n <= 0 || *k == 0 || *k > *n) { - return 0; + return; } lnotident = ! 
lsame_(ident, "I"); @@ -1148,7 +1148,7 @@ gett.f"> */ } } - return 0; + return; /* End of CLARFB_GETT */ diff --git a/lapack-netlib/SRC/clarfg.c b/lapack-netlib/SRC/clarfg.c index 1800ae79da..7fe91f1b54 100644 --- a/lapack-netlib/SRC/clarfg.c +++ b/lapack-netlib/SRC/clarfg.c @@ -619,7 +619,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int clarfg_(integer *n, complex *alpha, complex *x, integer * +/* Subroutine */ void clarfg_(integer *n, complex *alpha, complex *x, integer * incx, complex *tau) { /* System generated locals */ @@ -630,14 +630,14 @@ f"> */ /* Local variables */ real beta; integer j; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); real alphi, alphr, xnorm; extern real scnrm2_(integer *, complex *, integer *), slapy3_(real *, real *, real *); extern /* Complex */ VOID cladiv_(complex *, complex *, complex *); extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *); real safmin, rsafmn; integer knt; @@ -658,7 +658,7 @@ f"> */ /* Function Body */ if (*n <= 0) { tau->r = 0.f, tau->i = 0.f; - return 0; + return; } i__1 = *n - 1; @@ -725,7 +725,7 @@ f"> */ alpha->r = beta, alpha->i = 0.f; } - return 0; + return; /* End of CLARFG */ diff --git a/lapack-netlib/SRC/clarfgp.c b/lapack-netlib/SRC/clarfgp.c index 4477c21341..e87a6b7c0a 100644 --- a/lapack-netlib/SRC/clarfgp.c +++ b/lapack-netlib/SRC/clarfgp.c @@ -617,7 +617,7 @@ static complex c_b5 = {1.f,0.f}; /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int clarfgp_(integer *n, complex *alpha, complex *x, integer +/* Subroutine */ void clarfgp_(integer *n, complex *alpha, complex *x, integer *incx, 
complex *tau) { /* System generated locals */ @@ -628,7 +628,7 @@ static complex c_b5 = {1.f,0.f}; /* Local variables */ real beta; integer j; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); real alphi, alphr; complex savealpha; @@ -637,7 +637,7 @@ static complex c_b5 = {1.f,0.f}; real *), slapy3_(real *, real *, real *); extern /* Complex */ VOID cladiv_(complex *, complex *, complex *); extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *); real bignum, smlnum; integer knt; @@ -658,7 +658,7 @@ static complex c_b5 = {1.f,0.f}; /* Function Body */ if (*n <= 0) { tau->r = 0.f, tau->i = 0.f; - return 0; + return; } i__1 = *n - 1; @@ -815,7 +815,7 @@ static complex c_b5 = {1.f,0.f}; alpha->r = beta, alpha->i = 0.f; } - return 0; + return; /* End of CLARFGP */ diff --git a/lapack-netlib/SRC/clarft.c b/lapack-netlib/SRC/clarft.c index ffe87449c7..e26b8402e1 100644 --- a/lapack-netlib/SRC/clarft.c +++ b/lapack-netlib/SRC/clarft.c @@ -677,7 +677,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clarft_(char *direct, char *storev, integer *n, integer * +/* Subroutine */ void clarft_(char *direct, char *storev, integer *n, integer * k, complex *v, integer *ldv, complex *tau, complex *t, integer *ldt) { /* System generated locals */ @@ -686,17 +686,17 @@ f"> */ /* Local variables */ integer i__, j; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *), cgemv_(char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, 
integer *); extern logical lsame_(char *, char *); integer lastv; - extern /* Subroutine */ int ctrmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctrmv_(char *, char *, char *, integer *, complex *, integer *, complex *, integer *); integer prevlastv; - extern /* Subroutine */ int mecago_(); + extern /* Subroutine */ void mecago_(); /* -- LAPACK auxiliary routine (version 3.7.0) -- */ @@ -721,7 +721,7 @@ f"> */ /* Function Body */ if (*n == 0) { - return 0; + return; } if (lsame_(direct, "F")) { @@ -923,7 +923,7 @@ f"> */ } } } - return 0; + return; /* End of CLARFT */ diff --git a/lapack-netlib/SRC/clarfx.c b/lapack-netlib/SRC/clarfx.c index bcf2c48f55..d5d61945fb 100644 --- a/lapack-netlib/SRC/clarfx.c +++ b/lapack-netlib/SRC/clarfx.c @@ -633,7 +633,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int clarfx_(char *side, integer *m, integer *n, complex *v, +/* Subroutine */ void clarfx_(char *side, integer *m, integer *n, complex *v, complex *tau, complex *c__, integer *ldc, complex *work) { /* System generated locals */ @@ -644,7 +644,7 @@ f"> */ /* Local variables */ integer j; - extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex * + extern /* Subroutine */ void clarf_(char *, integer *, integer *, complex * , integer *, complex *, complex *, integer *, complex *); extern logical lsame_(char *, char *); complex t1, t2, t3, t4, t5, t6, t7, t8, t9, v1, v2, v3, v4, v5, v6, v7, @@ -669,7 +669,7 @@ f"> */ /* Function Body */ if (tau->r == 0.f && tau->i == 0.f) { - return 0; + return; } if (lsame_(side, "L")) { @@ -2593,7 +2593,7 @@ f"> */ goto L410; } L410: - return 0; + return; /* End of CLARFX */ diff --git a/lapack-netlib/SRC/clarfy.c b/lapack-netlib/SRC/clarfy.c index 1bda7663e4..15a83738d0 100644 --- a/lapack-netlib/SRC/clarfy.c +++ b/lapack-netlib/SRC/clarfy.c @@ -620,7 +620,7 @@ static integer c__1 = 1; /* > \ingroup 
complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int clarfy_(char *uplo, integer *n, complex *v, integer * +/* Subroutine */ void clarfy_(char *uplo, integer *n, complex *v, integer * incv, complex *tau, complex *c__, integer *ldc, complex *work) { /* System generated locals */ @@ -628,12 +628,12 @@ static integer c__1 = 1; complex q__1, q__2, q__3, q__4; /* Local variables */ - extern /* Subroutine */ int cher2_(char *, integer *, complex *, complex * + extern /* Subroutine */ void cher2_(char *, integer *, complex *, complex * , integer *, complex *, integer *, complex *, integer *); complex alpha; extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer *, complex *, integer *); - extern /* Subroutine */ int chemv_(char *, integer *, complex *, complex * + extern /* Subroutine */ void chemv_(char *, integer *, complex *, complex * , integer *, complex *, integer *, complex *, complex *, integer * ), caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); @@ -657,7 +657,7 @@ static integer c__1 = 1; /* Function Body */ if (tau->r == 0.f && tau->i == 0.f) { - return 0; + return; } /* Form w:= C * v */ @@ -679,7 +679,7 @@ static integer c__1 = 1; q__1.r = -tau->r, q__1.i = -tau->i; cher2_(uplo, n, &q__1, &v[1], incv, &work[1], &c__1, &c__[c_offset], ldc); - return 0; + return; /* End of CLARFY */ diff --git a/lapack-netlib/SRC/clargv.c b/lapack-netlib/SRC/clargv.c index 5b60fe52ad..6ed8623bec 100644 --- a/lapack-netlib/SRC/clargv.c +++ b/lapack-netlib/SRC/clargv.c @@ -631,7 +631,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clargv_(integer *n, complex *x, integer *incx, complex * +/* Subroutine */ void clargv_(integer *n, complex *x, integer *incx, complex * y, integer *incy, real *c__, integer *incc) { /* System generated locals */ @@ -869,7 +869,7 @@ f"> */ ix += 
*incx; /* L60: */ } - return 0; + return; /* End of CLARGV */ diff --git a/lapack-netlib/SRC/clarnv.c b/lapack-netlib/SRC/clarnv.c index c079633c68..aafd0ee707 100644 --- a/lapack-netlib/SRC/clarnv.c +++ b/lapack-netlib/SRC/clarnv.c @@ -608,7 +608,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clarnv_(integer *idist, integer *iseed, integer *n, +/* Subroutine */ void clarnv_(integer *idist, integer *iseed, integer *n, complex *x) { /* System generated locals */ @@ -620,7 +620,7 @@ f"> */ integer i__; real u[128]; integer il, iv; - extern /* Subroutine */ int slaruv_(integer *, integer *, real *); + extern /* Subroutine */ void slaruv_(integer *, integer *, real *); /* -- LAPACK auxiliary routine (version 3.7.0) -- */ @@ -723,7 +723,7 @@ f"> */ } /* L60: */ } - return 0; + return; /* End of CLARNV */ diff --git a/lapack-netlib/SRC/clarrv.c b/lapack-netlib/SRC/clarrv.c index 59c83dec5c..5fb8764eb3 100644 --- a/lapack-netlib/SRC/clarrv.c +++ b/lapack-netlib/SRC/clarrv.c @@ -799,7 +799,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int clarrv_(integer *n, real *vl, real *vu, real *d__, real * +/* Subroutine */ void clarrv_(integer *n, real *vl, real *vu, real *d__, real * l, real *pivmin, integer *isplit, integer *m, integer *dol, integer * dou, real *minrgp, real *rtol1, real *rtol2, real *w, real *werr, real *wgap, integer *iblock, integer *indexw, real *gers, complex * @@ -828,11 +828,11 @@ f"> */ logical eskip; real right; integer nclus, zfrom; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); real rqtol; integer iindc1, iindc2, indin1, indin2; - extern /* Subroutine */ int clar1v_(integer *, integer *, integer *, real + extern /* Subroutine 
*/ void clar1v_(integer *, integer *, integer *, real *, real *, real *, real *, real *, real *, real *, complex *, logical *, integer *, real *, real *, integer *, integer *, real * , real *, real *, real *); @@ -854,12 +854,12 @@ f"> */ real savgap; integer ndepth; real ssigma; - extern /* Subroutine */ int claset_(char *, integer *, integer *, complex + extern /* Subroutine */ void claset_(char *, integer *, integer *, complex *, complex *, complex *, integer *); logical usedbs; integer iindwk, offset; real gaptol; - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *); integer newcls, oldfst, indwrk, windex, oldlst; logical usedrq; @@ -869,7 +869,7 @@ f"> */ real nrminv, rqcorr; logical tryrqc; integer isupmx; - extern /* Subroutine */ int slarrb_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slarrb_(integer *, real *, real *, integer *, integer *, real *, real *, integer *, real *, real *, real *, real *, integer *, real *, real *, integer *, integer *), slarrf_( integer *, real *, real *, real *, integer *, integer *, real *, @@ -911,7 +911,7 @@ f"> */ /* Quick return if possible */ if (*n <= 0 || *m <= 0) { - return 0; + return; } /* The first N entries of WORK are reserved for the eigenvalues */ @@ -1064,7 +1064,7 @@ f"> */ /* This is a crude protection against infinitely deep trees */ if (ndepth > *m) { *info = -2; - return 0; + return; } /* breadth first processing of the current level of the representation */ /* tree: OLDNCL = number of clusters on current level */ @@ -1150,7 +1150,7 @@ f"> */ iindwk], pivmin, &spdiam, &in, &iinfo); if (iinfo != 0) { *info = -1; - return 0; + return; } /* We also recompute the extremal gaps. 
W holds all eigenvalues */ /* of the unshifted matrix and must be used for computation */ @@ -1337,7 +1337,7 @@ f"> */ iwork[k] = newlst; } else { *info = -2; - return 0; + return; } } else { @@ -1441,7 +1441,7 @@ f"> */ iindwk], pivmin, &spdiam, &itmp1, &iinfo); if (iinfo != 0) { *info = -3; - return 0; + return; } lambda = work[windex]; /* Reset twist index from inaccurate LAMBDA to */ @@ -1536,7 +1536,7 @@ f"> */ goto L120; } else { *info = 5; - return 0; + return; } } else { stp2ii = FALSE_; @@ -1631,7 +1631,7 @@ f"> */ ; } - return 0; + return; /* End of CLARRV */ diff --git a/lapack-netlib/SRC/clarscl2.c b/lapack-netlib/SRC/clarscl2.c index 7f76979e52..920ee8e04b 100644 --- a/lapack-netlib/SRC/clarscl2.c +++ b/lapack-netlib/SRC/clarscl2.c @@ -600,7 +600,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int clarscl2_(integer *m, integer *n, real *d__, complex *x, +/* Subroutine */ void clarscl2_(integer *m, integer *n, real *d__, complex *x, integer *ldx) { /* System generated locals */ @@ -638,6 +638,6 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ x[i__3].r = q__1.r, x[i__3].i = q__1.i; } } - return 0; + return; } /* clarscl2_ */ diff --git a/lapack-netlib/SRC/clarscl2.f b/lapack-netlib/SRC/clarscl2.f index 26b028dbba..f4e68523b2 100644 --- a/lapack-netlib/SRC/clarscl2.f +++ b/lapack-netlib/SRC/clarscl2.f @@ -1,4 +1,4 @@ -*> \brief \b CLARSCL2 performs reciprocal diagonal scaling on a vector. +*> \brief \b CLARSCL2 performs reciprocal diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -34,7 +34,7 @@ *> *> \verbatim *> -*> CLARSCL2 performs a reciprocal diagonal scaling on an vector: +*> CLARSCL2 performs a reciprocal diagonal scaling on a matrix: *> x <-- inv(D) * x *> where the REAL diagonal matrix D is stored as a vector. 
*> @@ -66,14 +66,14 @@ *> \param[in,out] X *> \verbatim *> X is COMPLEX array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/clartg.c b/lapack-netlib/SRC/clartg.c index c91de371fe..8d318f5cdc 100644 --- a/lapack-netlib/SRC/clartg.c +++ b/lapack-netlib/SRC/clartg.c @@ -614,7 +614,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clartg_(complex *f, complex *g, real *cs, complex *sn, +/* Subroutine */ void clartg_(complex *f, complex *g, real *cs, complex *sn, complex *r__) { /* System generated locals */ @@ -681,7 +681,7 @@ f"> */ *cs = 1.f; sn->r = 0.f, sn->i = 0.f; r__->r = f->r, r__->i = f->i; - return 0; + return; } L20: --count; @@ -722,7 +722,7 @@ f"> */ r__2 = -r_imag(&gs) / d__; q__1.r = r__1, q__1.i = r__2; sn->r = q__1.r, sn->i = q__1.i; - return 0; + return; } r__1 = fs.r; r__2 = r_imag(&fs); @@ -811,7 +811,7 @@ f"> */ } } } - return 0; + return; /* End of CLARTG */ diff --git a/lapack-netlib/SRC/clartg.f90 b/lapack-netlib/SRC/clartg.f90 index 13a629a34e..6231f85203 100644 --- a/lapack-netlib/SRC/clartg.f90 +++ b/lapack-netlib/SRC/clartg.f90 @@ -30,7 +30,7 @@ !> The mathematical formulas used for C and S are !> !> sgn(x) = { x / |x|, x != 0 -!> { 1, x = 0 +!> { 1, x = 0 !> !> R = sgn(F) * sqrt(|F|**2 + |G|**2) !> @@ -38,19 +38,20 @@ !> !> S = sgn(F) * conjg(G) / sqrt(|F|**2 + |G|**2) !> +!> Special conditions: +!> If G=0, then C=1 and S=0. +!> If F=0, then C=0 and S is chosen so that R is real. 
+!> !> When F and G are real, the formulas simplify to C = F/R and !> S = G/R, and the returned values of C, S, and R should be -!> identical to those returned by CLARTG. +!> identical to those returned by SLARTG. !> !> The algorithm used to compute these quantities incorporates scaling !> to avoid overflow or underflow in computing the square root of the !> sum of squares. !> -!> This is a faster version of the BLAS1 routine CROTG, except for -!> the following differences: -!> F and G are unchanged on return. -!> If G=0, then C=1 and S=0. -!> If F=0, then C=0 and S is chosen so that R is real. +!> This is the same routine CROTG fom BLAS1, except that +!> F and G are unchanged on return. !> !> Below, wp=>sp stands for single precision from LA_CONSTANTS module. !> \endverbatim @@ -91,22 +92,19 @@ ! Authors: ! ======== ! -!> \author Edward Anderson, Lockheed Martin +!> \author Weslley Pereira, University of Colorado Denver, USA ! -!> \date August 2016 +!> \date December 2021 ! !> \ingroup OTHERauxiliary ! -!> \par Contributors: -! ================== -!> -!> Weslley Pereira, University of Colorado Denver, USA -! !> \par Further Details: ! ===================== !> !> \verbatim !> +!> Based on the algorithm from +!> !> Anderson E. (2017) !> Algorithm 978: Safe Scaling in the Level 1 BLAS !> ACM Trans Math Softw 44:1--28 @@ -117,7 +115,7 @@ subroutine CLARTG( f, g, c, s, r ) use LA_CONSTANTS, & only: wp=>sp, zero=>szero, one=>sone, two=>stwo, czero, & - rtmin=>srtmin, rtmax=>srtmax, safmin=>ssafmin, safmax=>ssafmax + safmin=>ssafmin, safmax=>ssafmax ! ! -- LAPACK auxiliary routine -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -129,7 +127,7 @@ subroutine CLARTG( f, g, c, s, r ) complex(wp) f, g, r, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, f2, g1, g2, h2, p, u, uu, v, vv, w + real(wp) :: d, f1, f2, g1, g2, h2, u, v, w, rtmin, rtmax complex(wp) :: fs, gs, t ! .. ! .. Intrinsic Functions .. 
@@ -141,6 +139,9 @@ subroutine CLARTG( f, g, c, s, r ) ! .. Statement Function definitions .. ABSSQ( t ) = real( t )**2 + aimag( t )**2 ! .. +! .. Constants .. + rtmin = sqrt( safmin ) +! .. ! .. Executable Statements .. ! if( g == czero ) then @@ -149,30 +150,43 @@ subroutine CLARTG( f, g, c, s, r ) r = f else if( f == czero ) then c = zero - g1 = max( abs(real(g)), abs(aimag(g)) ) - if( g1 > rtmin .and. g1 < rtmax ) then + if( real(g) == zero ) then + r = abs(aimag(g)) + s = conjg( g ) / r + elseif( aimag(g) == zero ) then + r = abs(real(g)) + s = conjg( g ) / r + else + g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/2 ) + if( g1 > rtmin .and. g1 < rtmax ) then ! ! Use unscaled algorithm ! - g2 = ABSSQ( g ) - d = sqrt( g2 ) - s = conjg( g ) / d - r = d - else +! The following two lines can be replaced by `d = abs( g )`. +! This algorithm do not use the intrinsic complex abs. + g2 = ABSSQ( g ) + d = sqrt( g2 ) + s = conjg( g ) / d + r = d + else ! ! Use scaled algorithm ! - u = min( safmax, max( safmin, g1 ) ) - uu = one / u - gs = g*uu - g2 = ABSSQ( gs ) - d = sqrt( g2 ) - s = conjg( gs ) / d - r = d*u + u = min( safmax, max( safmin, g1 ) ) + gs = g / u +! The following two lines can be replaced by `d = abs( gs )`. +! This algorithm do not use the intrinsic complex abs. + g2 = ABSSQ( gs ) + d = sqrt( g2 ) + s = conjg( gs ) / d + r = d*u + end if end if else f1 = max( abs(real(f)), abs(aimag(f)) ) g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/4 ) if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. g1 < rtmax ) then ! @@ -181,32 +195,51 @@ subroutine CLARTG( f, g, c, s, r ) f2 = ABSSQ( f ) g2 = ABSSQ( g ) h2 = f2 + g2 - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = f / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! 
safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( g ) * ( f / sqrt( f2*h2 ) ) + else + s = conjg( g ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. + d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = f / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = f * ( h2 / d ) + end if + s = conjg( g ) * ( f / d ) end if - p = 1 / d - c = f2*p - s = conjg( g )*( f*p ) - r = f*( h2*p ) else ! ! Use scaled algorithm ! u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - gs = g*uu + gs = g / u g2 = ABSSQ( gs ) - if( f1*uu < rtmin ) then + if( f1 / u < rtmin ) then ! ! f is not well-scaled when scaled by g1. ! Use a different scaling for f. ! v = min( safmax, max( safmin, f1 ) ) - vv = one / v - w = v * uu - fs = f*vv + w = v / u + fs = f / v f2 = ABSSQ( fs ) h2 = f2*w**2 + g2 else @@ -214,19 +247,43 @@ subroutine CLARTG( f, g, c, s, r ) ! Otherwise use the same scaling for f and g. ! w = one - fs = f*uu + fs = f / u f2 = ABSSQ( fs ) h2 = f2 + g2 end if - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = fs / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( gs ) * ( fs / sqrt( f2*h2 ) ) + else + s = conjg( gs ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. 
+ d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = fs / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = fs * ( h2 / d ) + end if + s = conjg( gs ) * ( fs / d ) end if - p = 1 / d - c = ( f2*p )*w - s = conjg( gs )*( fs*p ) - r = ( fs*( h2*p ) )*u + ! Rescale c and r + c = c * w + r = r * u end if end if return diff --git a/lapack-netlib/SRC/clartv.c b/lapack-netlib/SRC/clartv.c index f59b7feba0..04c23485d8 100644 --- a/lapack-netlib/SRC/clartv.c +++ b/lapack-netlib/SRC/clartv.c @@ -617,7 +617,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int clartv_(integer *n, complex *x, integer *incx, complex * +/* Subroutine */ void clartv_(integer *n, complex *x, integer *incx, complex * y, integer *incy, real *c__, complex *s, integer *incc) { /* System generated locals */ @@ -675,7 +675,7 @@ f"> */ ic += *incc; /* L10: */ } - return 0; + return; /* End of CLARTV */ diff --git a/lapack-netlib/SRC/clarz.c b/lapack-netlib/SRC/clarz.c index 986450688c..d50c2e74f4 100644 --- a/lapack-netlib/SRC/clarz.c +++ b/lapack-netlib/SRC/clarz.c @@ -661,7 +661,7 @@ static integer c__1 = 1; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clarz_(char *side, integer *m, integer *n, integer *l, +/* Subroutine */ void clarz_(char *side, integer *m, integer *n, integer *l, complex *v, integer *incv, complex *tau, complex *c__, integer *ldc, complex *work) { @@ -670,12 +670,12 @@ static integer c__1 = 1; complex q__1; /* Local variables */ - extern /* Subroutine */ int cgerc_(integer *, integer *, complex *, + extern /* Subroutine */ void cgerc_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *), cgemv_(char *, integer *, integer *, complex *, complex *, 
integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgeru_(integer *, integer *, complex *, + extern /* Subroutine */ void cgeru_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *), ccopy_(integer *, complex *, integer *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer *, complex *, @@ -760,7 +760,7 @@ static integer c__1 = 1; } - return 0; + return; /* End of CLARZ */ diff --git a/lapack-netlib/SRC/clarzb.c b/lapack-netlib/SRC/clarzb.c index 5f8d7fa4f6..f20a1fb2e5 100644 --- a/lapack-netlib/SRC/clarzb.c +++ b/lapack-netlib/SRC/clarzb.c @@ -696,7 +696,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clarzb_(char *side, char *trans, char *direct, char * +/* Subroutine */ void clarzb_(char *side, char *trans, char *direct, char * storev, integer *m, integer *n, integer *k, integer *l, complex *v, integer *ldv, complex *t, integer *ldt, complex *c__, integer *ldc, complex *work, integer *ldwork) @@ -708,15 +708,16 @@ f"> */ /* Local variables */ integer info, i__, j; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), ctrmm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *), clacgv_(integer *, - complex *, integer *), xerbla_(char *, integer *, ftnlen); + complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); char transt[1]; @@ -747,7 +748,7 @@ f"> */ /* 
Function Body */ if (*m <= 0 || *n <= 0) { - return 0; + return; } /* Check for currently supported options */ @@ -761,7 +762,7 @@ f"> */ if (info != 0) { i__1 = -info; xerbla_("CLARZB", &i__1, (ftnlen)6); - return 0; + return; } if (lsame_(trans, "N")) { @@ -902,7 +903,7 @@ f"> */ } - return 0; + return; /* End of CLARZB */ diff --git a/lapack-netlib/SRC/clarzt.c b/lapack-netlib/SRC/clarzt.c index cf3c6a1e77..e0665d0814 100644 --- a/lapack-netlib/SRC/clarzt.c +++ b/lapack-netlib/SRC/clarzt.c @@ -699,7 +699,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clarzt_(char *direct, char *storev, integer *n, integer * +/* Subroutine */ void clarzt_(char *direct, char *storev, integer *n, integer * k, complex *v, integer *ldv, complex *tau, complex *t, integer *ldt) { /* System generated locals */ @@ -708,13 +708,13 @@ f"> */ /* Local variables */ integer info, i__, j; - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctrmv_(char *, char *, char *, integer *, - complex *, integer *, complex *, integer *), clacgv_(integer *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void ctrmv_(char *, char *, char *, integer *, + complex *, integer *, complex *, integer *), clacgv_(integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -747,7 +747,7 @@ f"> */ if (info != 0) { i__1 = -info; xerbla_("CLARZT", &i__1, (ftnlen)6); - return 0; + return; } for (i__ = *k; i__ >= 1; --i__) { @@ -792,7 +792,7 @@ f"> */ } /* L20: */ } - return 0; + return; /* End of CLARZT */ diff --git a/lapack-netlib/SRC/clascl.c 
b/lapack-netlib/SRC/clascl.c index 4d1e4c31c4..892c219081 100644 --- a/lapack-netlib/SRC/clascl.c +++ b/lapack-netlib/SRC/clascl.c @@ -652,7 +652,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int clascl_(char *type__, integer *kl, integer *ku, real * +/* Subroutine */ void clascl_(char *type__, integer *kl, integer *ku, real * cfrom, real *cto, integer *m, integer *n, complex *a, integer *lda, integer *info) { @@ -745,13 +745,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CLASCL", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *m == 0) { - return 0; + return; } /* Get machine parameters */ @@ -935,7 +935,7 @@ f"> */ goto L10; } - return 0; + return; /* End of CLASCL */ diff --git a/lapack-netlib/SRC/clascl.f b/lapack-netlib/SRC/clascl.f index 399af23a4b..f9aace0bc4 100644 --- a/lapack-netlib/SRC/clascl.f +++ b/lapack-netlib/SRC/clascl.f @@ -272,6 +272,8 @@ SUBROUTINE CLASCL( TYPE, KL, KU, CFROM, CTO, M, N, A, LDA, INFO ) ELSE MUL = CTOC / CFROMC DONE = .TRUE. + IF (MUL .EQ. 
ONE) + $ RETURN END IF END IF * diff --git a/lapack-netlib/SRC/clascl2.c b/lapack-netlib/SRC/clascl2.c index 06ee8a443a..7a1621463d 100644 --- a/lapack-netlib/SRC/clascl2.c +++ b/lapack-netlib/SRC/clascl2.c @@ -600,7 +600,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int clascl2_(integer *m, integer *n, real *d__, complex *x, +/* Subroutine */ void clascl2_(integer *m, integer *n, real *d__, complex *x, integer *ldx) { /* System generated locals */ @@ -638,6 +638,6 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ x[i__3].r = q__1.r, x[i__3].i = q__1.i; } } - return 0; + return; } /* clascl2_ */ diff --git a/lapack-netlib/SRC/clascl2.f b/lapack-netlib/SRC/clascl2.f index 2ae27975c5..882273b5e2 100644 --- a/lapack-netlib/SRC/clascl2.f +++ b/lapack-netlib/SRC/clascl2.f @@ -1,4 +1,4 @@ -*> \brief \b CLASCL2 performs diagonal scaling on a vector. +*> \brief \b CLASCL2 performs diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -34,9 +34,9 @@ *> *> \verbatim *> -*> CLASCL2 performs a diagonal scaling on a vector: +*> CLASCL2 performs a diagonal scaling on a matrix: *> x <-- D * x -*> where the diagonal REAL matrix D is stored as a vector. +*> where the diagonal REAL matrix D is stored as a matrix. *> *> Eventually to be replaced by BLAS_cge_diag_scale in the new BLAS *> standard. @@ -66,14 +66,14 @@ *> \param[in,out] X *> \verbatim *> X is COMPLEX array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. 
*> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/claset.c b/lapack-netlib/SRC/claset.c index 8372580c88..a96a5b4c92 100644 --- a/lapack-netlib/SRC/claset.c +++ b/lapack-netlib/SRC/claset.c @@ -616,7 +616,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int claset_(char *uplo, integer *m, integer *n, complex * +/* Subroutine */ void claset_(char *uplo, integer *m, integer *n, complex * alpha, complex *beta, complex *a, integer *lda) { /* System generated locals */ @@ -711,7 +711,7 @@ f"> */ } } - return 0; + return; /* End of CLASET */ diff --git a/lapack-netlib/SRC/clasr.c b/lapack-netlib/SRC/clasr.c index 7aebcaa310..75e91ecd01 100644 --- a/lapack-netlib/SRC/clasr.c +++ b/lapack-netlib/SRC/clasr.c @@ -709,7 +709,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int clasr_(char *side, char *pivot, char *direct, integer *m, +/* Subroutine */ void clasr_(char *side, char *pivot, char *direct, integer *m, integer *n, real *c__, real *s, complex *a, integer *lda) { /* System generated locals */ @@ -762,13 +762,13 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } if (info != 0) { xerbla_("CLASR ", &info, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } if (lsame_(side, "L")) { @@ -1160,7 +1160,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } - return 0; + return; /* End of CLASR */ diff --git a/lapack-netlib/SRC/classq.c b/lapack-netlib/SRC/classq.c index 539df4b59b..346158f3b0 100644 --- a/lapack-netlib/SRC/classq.c +++ b/lapack-netlib/SRC/classq.c @@ -615,7 +615,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* 
===================================================================== */ -/* Subroutine */ int classq_(integer *n, complex *x, integer *incx, real * +/* Subroutine */ void classq_(integer *n, complex *x, integer *incx, real * scale, real *sumsq) { /* System generated locals */ @@ -676,7 +676,7 @@ f"> */ } } - return 0; + return; /* End of CLASSQ */ diff --git a/lapack-netlib/SRC/claswlq.c b/lapack-netlib/SRC/claswlq.c index 3048880cef..562dc4a65d 100644 --- a/lapack-netlib/SRC/claswlq.c +++ b/lapack-netlib/SRC/claswlq.c @@ -671,7 +671,7 @@ static integer c__0 = 0; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int claswlq_(integer *m, integer *n, integer *mb, integer * +/* Subroutine */ void claswlq_(integer *m, integer *n, integer *mb, integer * nb, complex *a, integer *lda, complex *t, integer *ldt, complex *work, integer *lwork, integer *info) { @@ -680,7 +680,8 @@ static integer c__0 = 0; /* Local variables */ integer i__, ii, kk; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), cgelqt_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void cgelqt_( integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *), ctplqt_(integer *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, @@ -737,15 +738,15 @@ static integer c__0 = 0; if (*info != 0) { i__1 = -(*info); xerbla_("CLASWLQ", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* The LQ Decomposition */ @@ -753,7 +754,7 @@ static integer c__0 = 0; if (*m >= *n || *nb <= *m || *nb >= *n) { cgelqt_(m, n, mb, &a[a_offset], lda, &t[t_offset], ldt, &work[1], info); - return 0; + return; } kk = (*n - *m) % (*nb - *m); @@ -786,7 +787,7 @@ static integer c__0 = 0; i__2 = *m * *mb; work[1].r = (real) i__2, work[1].i = 0.f; - 
return 0; + return; /* End of CLASWLQ */ diff --git a/lapack-netlib/SRC/claswp.c b/lapack-netlib/SRC/claswp.c index 60148c4fdb..404e6aad50 100644 --- a/lapack-netlib/SRC/claswp.c +++ b/lapack-netlib/SRC/claswp.c @@ -624,7 +624,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int claswp_(integer *n, complex *a, integer *lda, integer * +/* Subroutine */ void claswp_(integer *n, complex *a, integer *lda, integer * k1, integer *k2, integer *ipiv, integer *incx) { /* System generated locals */ @@ -665,7 +665,7 @@ f"> */ i2 = *k1; inc = -1; } else { - return 0; + return; } n32 = *n / 32 << 5; @@ -722,7 +722,7 @@ f"> */ } } - return 0; + return; /* End of CLASWP */ diff --git a/lapack-netlib/SRC/clasyf.c b/lapack-netlib/SRC/clasyf.c index 9faf598024..df3a0d9f06 100644 --- a/lapack-netlib/SRC/clasyf.c +++ b/lapack-netlib/SRC/clasyf.c @@ -692,7 +692,7 @@ f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int clasyf_(char *uplo, integer *n, integer *nb, integer *kb, +/* Subroutine */ void clasyf_(char *uplo, integer *n, integer *nb, integer *kb, complex *a, integer *lda, integer *ipiv, complex *w, integer *ldw, integer *info) { @@ -705,12 +705,12 @@ f"> */ integer imax, jmax, j, k; complex t; real alpha; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), cgemm_(char *, char *, integer *, integer *, integer * , complex *, complex *, integer *, complex *, integer *, complex * , complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *), ccopy_(integer *, complex *, integer *, complex *, 
integer *), cswap_(integer *, complex *, integer *, @@ -1538,7 +1538,7 @@ f"> */ *kb = k - 1; } - return 0; + return; /* End of CLASYF */ diff --git a/lapack-netlib/SRC/clasyf_aa.c b/lapack-netlib/SRC/clasyf_aa.c index 64d40af935..6f58989613 100644 --- a/lapack-netlib/SRC/clasyf_aa.c +++ b/lapack-netlib/SRC/clasyf_aa.c @@ -659,7 +659,7 @@ aa.f"> */ /* > \ingroup complexSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int clasyf_aa_(char *uplo, integer *j1, integer *m, integer +/* Subroutine */ void clasyf_aa_(char *uplo, integer *j1, integer *m, integer *nb, complex *a, integer *lda, integer *ipiv, complex *h__, integer * ldh, complex *work) { @@ -670,10 +670,10 @@ aa.f"> */ /* Local variables */ integer j, k; complex alpha; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *), ccopy_(integer *, complex *, integer *, complex *, integer *), cswap_(integer *, complex *, integer *, @@ -681,7 +681,7 @@ aa.f"> */ integer *, complex *, integer *); integer i1, k1, i2, mj; extern integer icamax_(integer *, complex *, integer *); - extern /* Subroutine */ int claset_(char *, integer *, integer *, complex + extern /* Subroutine */ void claset_(char *, integer *, integer *, complex *, complex *, complex *, integer *); complex piv; @@ -1076,7 +1076,7 @@ aa.f"> */ L40: ; } - return 0; + return; /* End of CLASYF_AA */ diff --git a/lapack-netlib/SRC/clasyf_rk.c b/lapack-netlib/SRC/clasyf_rk.c index ba6f339f20..f72b1c105d 100644 --- a/lapack-netlib/SRC/clasyf_rk.c +++ b/lapack-netlib/SRC/clasyf_rk.c @@ -776,7 +776,7 @@ rk.f"> */ /* > \endverbatim */ 
/* ===================================================================== */ -/* Subroutine */ int clasyf_rk_(char *uplo, integer *n, integer *nb, integer +/* Subroutine */ void clasyf_rk_(char *uplo, integer *n, integer *nb, integer *kb, complex *a, integer *lda, complex *e, integer *ipiv, complex *w, integer *ldw, integer *info) { @@ -790,19 +790,19 @@ rk.f"> */ integer imax, jmax, j, k, p; complex t; real alpha; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), cgemm_(char *, char *, integer *, integer *, integer * , complex *, complex *, integer *, complex *, integer *, complex * , complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *); real sfmin; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); integer itemp; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer kstep; real stemp; @@ -1710,7 +1710,7 @@ rk.f"> */ } - return 0; + return; /* End of CLASYF_RK */ diff --git a/lapack-netlib/SRC/clasyf_rook.c b/lapack-netlib/SRC/clasyf_rook.c index 1a20913e89..a4613912f8 100644 --- a/lapack-netlib/SRC/clasyf_rook.c +++ b/lapack-netlib/SRC/clasyf_rook.c @@ -698,7 +698,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int clasyf_rook_(char *uplo, integer *n, integer *nb, +/* Subroutine */ void clasyf_rook_(char *uplo, integer *n, integer *nb, integer *kb, complex *a, integer *lda, integer *ipiv, complex *w, integer *ldw, 
integer *info) { @@ -712,19 +712,19 @@ rook.f"> */ integer imax, jmax, j, k, p; complex t; real alpha; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), cgemm_(char *, char *, integer *, integer *, integer * , complex *, complex *, integer *, complex *, integer *, complex * , complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *); real sfmin; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); integer itemp; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer kstep; real stemp; @@ -1636,7 +1636,7 @@ rook.f"> */ *kb = k - 1; } - return 0; + return; /* End of CLASYF_ROOK */ diff --git a/lapack-netlib/SRC/clatbs.c b/lapack-netlib/SRC/clatbs.c index 67f3a236bc..168aca749d 100644 --- a/lapack-netlib/SRC/clatbs.c +++ b/lapack-netlib/SRC/clatbs.c @@ -756,7 +756,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clatbs_(char *uplo, char *trans, char *diag, char * +/* Subroutine */ void clatbs_(char *uplo, char *trans, char *diag, char * normin, integer *n, integer *kd, complex *ab, integer *ldab, complex * x, real *scale, real *cnorm, integer *info) { @@ -776,24 +776,25 @@ f"> */ extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void 
sscal_(integer *, real *, real *, integer *); real tscal; complex uscal; integer jlast; extern /* Complex */ VOID cdotu_(complex *, integer *, complex *, integer *, complex *, integer *); complex csumj; - extern /* Subroutine */ int ctbsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctbsv_(char *, char *, char *, integer *, integer *, complex *, integer *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer * , complex *, integer *); logical upper; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); real xj; extern integer icamax_(integer *, complex *, integer *); extern /* Complex */ VOID cladiv_(complex *, complex *, complex *); extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer - *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + extern int xerbla_(char *, integer *, ftnlen); real bignum; extern integer isamax_(integer *, real *, integer *); extern real scasum_(integer *, complex *, integer *); @@ -848,13 +849,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CLATBS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine machine dependent parameters to control overflow. */ @@ -1749,7 +1750,7 @@ f"> */ sscal_(n, &r__1, &cnorm[1], &c__1); } - return 0; + return; /* End of CLATBS */ diff --git a/lapack-netlib/SRC/clatbs.f b/lapack-netlib/SRC/clatbs.f index 606f963d38..97abcadce1 100644 --- a/lapack-netlib/SRC/clatbs.f +++ b/lapack-netlib/SRC/clatbs.f @@ -278,7 +278,7 @@ SUBROUTINE CLATBS( UPLO, TRANS, DIAG, NORMIN, N, KD, AB, LDAB, X, $ CDOTU, CLADIV * .. * .. External Subroutines .. - EXTERNAL CAXPY, CSSCAL, CTBSV, SLABAD, SSCAL, XERBLA + EXTERNAL CAXPY, CSSCAL, CTBSV, SSCAL, XERBLA * .. * .. Intrinsic Functions .. 
INTRINSIC ABS, AIMAG, CMPLX, CONJG, MAX, MIN, REAL @@ -324,17 +324,14 @@ SUBROUTINE CLATBS( UPLO, TRANS, DIAG, NORMIN, N, KD, AB, LDAB, X, * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * * Determine machine dependent parameters to control overflow. * - SMLNUM = SLAMCH( 'Safe minimum' ) - BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) - SMLNUM = SMLNUM / SLAMCH( 'Precision' ) + SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * diff --git a/lapack-netlib/SRC/clatdf.c b/lapack-netlib/SRC/clatdf.c index 4a7990b6af..68c1d63efd 100644 --- a/lapack-netlib/SRC/clatdf.c +++ b/lapack-netlib/SRC/clatdf.c @@ -685,7 +685,7 @@ f"> */ /* > 1995. */ /* ===================================================================== */ -/* Subroutine */ int clatdf_(integer *ijob, integer *n, complex *z__, integer +/* Subroutine */ void clatdf_(integer *ijob, integer *n, complex *z__, integer *ldz, complex *rhs, real *rdsum, real *rdscal, integer *ipiv, integer *jpiv) { @@ -697,25 +697,26 @@ f"> */ integer info; complex temp, work[8]; integer i__, j, k; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); real scale; extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer *, complex *, integer *); - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); complex pmone; - extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, + extern /* Subroutine */ void caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); real rtemp, sminu, rwork[2], splus; - extern /* Subroutine */ int cgesc2_(integer *, complex *, integer *, + extern /* Subroutine */ void cgesc2_(integer *, complex *, integer *, complex *, integer *, integer *, real *); complex 
bm, bp; - extern /* Subroutine */ int cgecon_(char *, integer *, complex *, integer + extern /* Subroutine */ void cgecon_(char *, integer *, complex *, integer *, real *, real *, complex *, real *, integer *); complex xm[2], xp[2]; - extern /* Subroutine */ int classq_(integer *, complex *, integer *, real - *, real *), claswp_(integer *, complex *, integer *, integer *, + extern /* Subroutine */ void classq_(integer *, complex *, integer *, real + *, real *); + extern int claswp_(integer *, complex *, integer *, integer *, integer *, integer *, integer *); extern real scasum_(integer *, complex *, integer *); @@ -877,7 +878,7 @@ f"> */ /* Compute the sum of squares */ classq_(n, &rhs[1], &c__1, rdscal, rdsum); - return 0; + return; } /* ENTRY IJOB = 2 */ @@ -909,7 +910,7 @@ f"> */ /* Compute the sum of squares */ classq_(n, &rhs[1], &c__1, rdscal, rdsum); - return 0; + return; /* End of CLATDF */ diff --git a/lapack-netlib/SRC/clatps.c b/lapack-netlib/SRC/clatps.c index 1fb11c9667..64563abbd9 100644 --- a/lapack-netlib/SRC/clatps.c +++ b/lapack-netlib/SRC/clatps.c @@ -744,7 +744,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clatps_(char *uplo, char *trans, char *diag, char * +/* Subroutine */ void clatps_(char *uplo, char *trans, char *diag, char * normin, integer *n, complex *ap, complex *x, real *scale, real *cnorm, integer *info) { @@ -764,17 +764,17 @@ f"> */ extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); real tscal; complex uscal; integer jlast; extern /* Complex */ VOID cdotu_(complex *, integer *, complex *, integer *, complex *, integer *); complex csumj; - extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, 
+ extern /* Subroutine */ void caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); logical upper; - extern /* Subroutine */ int ctpsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctpsv_(char *, char *, char *, integer *, complex *, complex *, integer *), slabad_( real *, real *); integer ip; @@ -782,8 +782,9 @@ f"> */ extern integer icamax_(integer *, complex *, integer *); extern /* Complex */ VOID cladiv_(complex *, complex *, complex *); extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer - *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + extern int xerbla_(char *, integer *, ftnlen); real bignum; extern integer isamax_(integer *, real *, integer *); extern real scasum_(integer *, complex *, integer *); @@ -832,13 +833,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CLATPS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine machine dependent parameters to control overflow. 
*/ @@ -1713,7 +1714,7 @@ f"> */ sscal_(n, &r__1, &cnorm[1], &c__1); } - return 0; + return; /* End of CLATPS */ diff --git a/lapack-netlib/SRC/clatrd.c b/lapack-netlib/SRC/clatrd.c index 0def564ad4..f1c30202be 100644 --- a/lapack-netlib/SRC/clatrd.c +++ b/lapack-netlib/SRC/clatrd.c @@ -715,7 +715,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clatrd_(char *uplo, integer *n, integer *nb, complex *a, +/* Subroutine */ void clatrd_(char *uplo, integer *n, integer *nb, complex *a, integer *lda, real *e, complex *tau, complex *w, integer *ldw) { /* System generated locals */ @@ -726,17 +726,17 @@ f"> */ /* Local variables */ integer i__; complex alpha; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer *, complex *, integer *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *), chemv_(char *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, + extern /* Subroutine */ void caxpy_(integer *, complex *, complex *, integer *, complex *, integer *), clarfg_(integer *, complex *, complex *, integer *, complex *), clacgv_(integer *, complex *, integer *); @@ -766,7 +766,7 @@ f"> */ /* Function Body */ if (*n <= 0) { - return 0; + return; } if (lsame_(uplo, "U")) { @@ -973,7 +973,7 @@ f"> */ } } - return 0; + return; /* End of CLATRD */ diff --git a/lapack-netlib/SRC/clatrs.c b/lapack-netlib/SRC/clatrs.c index b68fee3531..61e43419d1 100644 --- a/lapack-netlib/SRC/clatrs.c +++ 
b/lapack-netlib/SRC/clatrs.c @@ -753,7 +753,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clatrs_(char *uplo, char *trans, char *diag, char * +/* Subroutine */ void clatrs_(char *uplo, char *trans, char *diag, char * normin, integer *n, complex *a, integer *lda, complex *x, real *scale, real *cnorm, integer *info) { @@ -773,24 +773,25 @@ f"> */ extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); real tscal; complex uscal; integer jlast; extern /* Complex */ VOID cdotu_(complex *, integer *, complex *, integer *, complex *, integer *); complex csumj; - extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, + extern /* Subroutine */ void caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); logical upper; - extern /* Subroutine */ int ctrsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctrsv_(char *, char *, char *, integer *, complex *, integer *, complex *, integer *), slabad_(real *, real *); real xj; extern integer icamax_(integer *, complex *, integer *); extern /* Complex */ VOID cladiv_(complex *, complex *, complex *); extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer - *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + extern int xerbla_(char *, integer *, ftnlen); real bignum; extern integer isamax_(integer *, real *, integer *); extern real scasum_(integer *, complex *, integer *); @@ -843,13 +844,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CLATRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { 
- return 0; + return; } /* Determine machine dependent parameters to control overflow. */ @@ -1702,7 +1703,7 @@ f"> */ sscal_(n, &r__1, &cnorm[1], &c__1); } - return 0; + return; /* End of CLATRS */ diff --git a/lapack-netlib/SRC/clatrs.f b/lapack-netlib/SRC/clatrs.f index 946ab80689..91334b7066 100644 --- a/lapack-netlib/SRC/clatrs.f +++ b/lapack-netlib/SRC/clatrs.f @@ -274,7 +274,7 @@ SUBROUTINE CLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X, SCALE, $ CDOTU, CLADIV * .. * .. External Subroutines .. - EXTERNAL CAXPY, CSSCAL, CTRSV, SLABAD, SSCAL, XERBLA + EXTERNAL CAXPY, CSSCAL, CTRSV, SSCAL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CMPLX, CONJG, MAX, MIN, REAL @@ -318,17 +318,14 @@ SUBROUTINE CLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X, SCALE, * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * * Determine machine dependent parameters to control overflow. * - SMLNUM = SLAMCH( 'Safe minimum' ) - BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) - SMLNUM = SMLNUM / SLAMCH( 'Precision' ) + SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * @@ -360,8 +357,74 @@ SUBROUTINE CLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X, SCALE, IF( TMAX.LE.BIGNUM*HALF ) THEN TSCAL = ONE ELSE - TSCAL = HALF / ( SMLNUM*TMAX ) - CALL SSCAL( N, TSCAL, CNORM, 1 ) +* +* Avoid NaN generation if entries in CNORM exceed the +* overflow threshold +* + IF ( TMAX.LE.SLAMCH('Overflow') ) THEN +* Case 1: All entries in CNORM are valid floating-point numbers + TSCAL = HALF / ( SMLNUM*TMAX ) + CALL SSCAL( N, TSCAL, CNORM, 1 ) + ELSE +* Case 2: At least one column norm of A cannot be +* represented as a floating-point number. Find the +* maximum offdiagonal absolute value +* max( |Re(A(I,J))|, |Im(A(I,J)| ). If this entry is +* not +/- Infinity, use this value as TSCAL. + TMAX = ZERO + IF( UPPER ) THEN +* +* A is upper triangular. 
+* + DO J = 2, N + DO I = 1, J - 1 + TMAX = MAX( TMAX, ABS( REAL( A( I, J ) ) ), + $ ABS( AIMAG(A ( I, J ) ) ) ) + END DO + END DO + ELSE +* +* A is lower triangular. +* + DO J = 1, N - 1 + DO I = J + 1, N + TMAX = MAX( TMAX, ABS( REAL( A( I, J ) ) ), + $ ABS( AIMAG(A ( I, J ) ) ) ) + END DO + END DO + END IF +* + IF( TMAX.LE.SLAMCH('Overflow') ) THEN + TSCAL = ONE / ( SMLNUM*TMAX ) + DO J = 1, N + IF( CNORM( J ).LE.SLAMCH('Overflow') ) THEN + CNORM( J ) = CNORM( J )*TSCAL + ELSE +* Recompute the 1-norm of each column without +* introducing Infinity in the summation. + TSCAL = TWO * TSCAL + CNORM( J ) = ZERO + IF( UPPER ) THEN + DO I = 1, J - 1 + CNORM( J ) = CNORM( J ) + + $ TSCAL * CABS2( A( I, J ) ) + END DO + ELSE + DO I = J + 1, N + CNORM( J ) = CNORM( J ) + + $ TSCAL * CABS2( A( I, J ) ) + END DO + END IF + TSCAL = TSCAL * HALF + END IF + END DO + ELSE +* At least one entry of A is not a valid floating-point +* entry. Rely on TRSV to propagate Inf and NaN. + CALL CTRSV( UPLO, TRANS, DIAG, N, A, LDA, X, 1 ) + RETURN + END IF + END IF END IF * * Compute a bound on the computed solution vector to see if the diff --git a/lapack-netlib/SRC/clatrs3.c b/lapack-netlib/SRC/clatrs3.c new file mode 100644 index 0000000000..a53483ff00 --- /dev/null +++ b/lapack-netlib/SRC/clatrs3.c @@ -0,0 +1,1282 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } 
complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + 
char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) 
(atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define 
s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } 
+ } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b CLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. 
+ */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, */ +/* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) */ + +/* CHARACTER DIAG, NORMIN, TRANS, UPLO */ +/* INTEGER INFO, LDA, LWORK, LDX, N, NRHS */ +/* REAL CNORM( * ), SCALE( * ), WORK( * ) */ +/* COMPLEX A( LDA, * ), X( LDX, * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > CLATRS3 solves one of the triangular systems */ +/* > */ +/* > A * X = B * diag(scale), A**T * X = B * diag(scale), or */ +/* > A**H * X = B * diag(scale) */ +/* > */ +/* > with scaling to prevent overflow. Here A is an upper or lower */ +/* > triangular matrix, A**T denotes the transpose of A, A**H denotes the */ +/* > conjugate transpose of A. X and B are n-by-nrhs matrices and scale */ +/* > is an nrhs-element vector of scaling factors. A scaling factor scale(j) */ +/* > is usually less than or equal to 1, chosen such that X(:,j) is less */ +/* > than the overflow threshold. If the matrix A is singular (A(j,j) = 0 */ +/* > for some j), then a non-trivial solution to A*X = 0 is returned. If */ +/* > the system is so badly scaled that the solution cannot be represented */ +/* > as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. */ +/* > */ +/* > This is a BLAS-3 version of LATRS for solving several right */ +/* > hand sides simultaneously. */ +/* > */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] UPLO */ +/* > \verbatim */ +/* > UPLO is CHARACTER*1 */ +/* > Specifies whether the matrix A is upper or lower triangular. */ +/* > = 'U': Upper triangular */ +/* > = 'L': Lower triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > Specifies the operation applied to A. 
*/ +/* > = 'N': Solve A * x = s*b (No transpose) */ +/* > = 'T': Solve A**T* x = s*b (Transpose) */ +/* > = 'C': Solve A**T* x = s*b (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] DIAG */ +/* > \verbatim */ +/* > DIAG is CHARACTER*1 */ +/* > Specifies whether or not the matrix A is unit triangular. */ +/* > = 'N': Non-unit triangular */ +/* > = 'U': Unit triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NORMIN */ +/* > \verbatim */ +/* > NORMIN is CHARACTER*1 */ +/* > Specifies whether CNORM has been set or not. */ +/* > = 'Y': CNORM contains the column norms on entry */ +/* > = 'N': CNORM is not set on entry. On exit, the norms will */ +/* > be computed and stored in CNORM. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of X. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX array, dimension (LDA,N) */ +/* > The triangular matrix A. If UPLO = 'U', the leading n by n */ +/* > upper triangular part of the array A contains the upper */ +/* > triangular matrix, and the strictly lower triangular part of */ +/* > A is not referenced. If UPLO = 'L', the leading n by n lower */ +/* > triangular part of the array A contains the lower triangular */ +/* > matrix, and the strictly upper triangular part of A is not */ +/* > referenced. If DIAG = 'U', the diagonal elements of A are */ +/* > also not referenced and are assumed to be 1. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax (1,N). 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is COMPLEX array, dimension (LDX,NRHS) */ +/* > On entry, the right hand side B of the triangular system. */ +/* > On exit, X is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDX */ +/* > \verbatim */ +/* > LDX is INTEGER */ +/* > The leading dimension of the array X. LDX >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is REAL array, dimension (NRHS) */ +/* > The scaling factor s(k) is for the triangular system */ +/* > A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). */ +/* > If SCALE = 0, the matrix A is singular or badly scaled. */ +/* > If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) */ +/* > that is an exact or approximate solution to A*x(:,k) = 0 */ +/* > is returned. If the system so badly scaled that solution */ +/* > cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 */ +/* > is returned. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] CNORM */ +/* > \verbatim */ +/* > CNORM is REAL array, dimension (N) */ +/* > */ +/* > If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */ +/* > contains the norm of the off-diagonal part of the j-th column */ +/* > of A. If TRANS = 'N', CNORM(j) must be greater than or equal */ +/* > to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */ +/* > must be greater than or equal to the 1-norm. */ +/* > */ +/* > If NORMIN = 'N', CNORM is an output argument and CNORM(j) */ +/* > returns the 1-norm of the offdiagonal part of the j-th column */ +/* > of A. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is REAL array, dimension (LWORK). */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal size of */ +/* > WORK. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > LWORK is INTEGER */ +/* > LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where */ +/* > NBA = (N + NB - 1)/NB and NB is the optimal block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -k, the k-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleOTHERauxiliary */ +/* > \par Further Details: */ +/* ===================== */ +/* \verbatim */ +/* The algorithm follows the structure of a block triangular solve. */ +/* The diagonal block is solved with a call to the robust the triangular */ +/* solver LATRS for every right-hand side RHS = 1, ..., NRHS */ +/* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), */ +/* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. */ +/* The linear block updates operate on block columns of X, */ +/* B( I, K ) - op(A( I, J )) * X( J, K ) */ +/* and use GEMM. To avoid overflow in the linear block update, the worst case */ +/* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed */ +/* such that */ +/* || s * B( I, RHS )||_oo */ +/* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold */ + +/* Once all columns of a block column have been rescaled (BLAS-1), the linear */ +/* update is executed with GEMM without overflow. */ + +/* To limit rescaling, local scale factors track the scaling of column segments. 
*/ +/* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA */ +/* per right-hand side column RHS = 1, ..., NRHS. The global scale factor */ +/* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) */ +/* I = 1, ..., NBA. */ +/* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) */ +/* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The */ +/* linear update of potentially inconsistently scaled vector segments */ +/* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) */ +/* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, */ +/* if necessary, rescales the blocks prior to calling GEMM. */ + +/* \endverbatim */ +/* ===================================================================== */ +/* References: */ +/* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). */ +/* Parallel robust solution of triangular linear systems. Concurrency */ +/* and Computation: Practice and Experience, 31(19), e5064. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
*/ + +/* ===================================================================== */ +/* Subroutine */ void clatrs3_(char *uplo, char *trans, char *diag, char * + normin, integer *n, integer *nrhs, complex *a, integer *lda, complex * + x, integer *ldx, real *scale, real *cnorm, real *work, integer *lwork, + integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, + i__6, i__7, i__8; + real r__1, r__2; + complex q__1; + + /* Local variables */ + integer iinc, jinc; + real scal, anrm, bnrm; + integer awrk; + real tmax, xnrm[32]; + integer i__, j, k; + real w[64]; + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + extern logical lsame_(char *, char *); + real rscal; + integer lanrm, ilast, jlast, i1; + logical upper; + integer i2, j1, j2, k1, k2, nb, ii, kk; + extern real clange_(char *, integer *, integer *, complex *, integer *, + real *); + integer lscale; + real scaloc; + extern real slamch_(char *); + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + real scamin; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen ); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + real bignum; + extern /* Subroutine */ void clatrs_(char *, char *, char *, char *, + integer *, complex *, integer *, complex *, real *, real *, + integer *); + extern real slarmm_(real *, real *, real *); + integer ifirst; + logical notran; + integer jfirst; + real smlnum; + logical nounit, lquery; + integer nba, lds, nbx, rhs; + + + +/* ===================================================================== */ + + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + --scale; + --cnorm; + 
--work; + + /* Function Body */ + *info = 0; + upper = lsame_(uplo, "U"); + notran = lsame_(trans, "N"); + nounit = lsame_(diag, "N"); + lquery = *lwork == -1; + +/* Partition A and X into blocks. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "CLATRS", "", n, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + nb = f2cmin(64,nb); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*nrhs + 31) / 32; + nbx = f2cmax(i__1,i__2); + +/* Compute the workspace */ + +/* The workspace comprises two parts. */ +/* The first part stores the local scale factors. Each simultaneously */ +/* computed right-hand side requires one local scale factor per block */ +/* row. WORK( I + KK * LDS ) is the scale factor of the vector */ +/* segment associated with the I-th block row and the KK-th vector */ +/* in the block column. */ +/* Computing MAX */ + i__1 = nba, i__2 = f2cmin(*nrhs,32); + lscale = nba * f2cmax(i__1,i__2); + lds = nba; +/* The second part stores upper bounds of the triangular A. There are */ +/* a total of NBA x NBA blocks, of which only the upper triangular */ +/* part or the lower triangular part is referenced. The upper bound of */ +/* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). */ + lanrm = nba * nba; + awrk = lscale; + work[1] = (real) (lscale + lanrm); + +/* Test the input parameters. */ + + if (! upper && ! lsame_(uplo, "L")) { + *info = -1; + } else if (! notran && ! lsame_(trans, "T") && ! + lsame_(trans, "C")) { + *info = -2; + } else if (! nounit && ! lsame_(diag, "U")) { + *info = -3; + } else if (! lsame_(normin, "Y") && ! lsame_(normin, + "N")) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*nrhs < 0) { + *info = -6; + } else if (*lda < f2cmax(1,*n)) { + *info = -8; + } else if (*ldx < f2cmax(1,*n)) { + *info = -10; + } else if (! 
lquery && (real) (*lwork) < work[1]) { + *info = -14; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CLATRS3", &i__1, 7); + return; + } else if (lquery) { + return; + } + +/* Initialize scaling factors */ + + i__1 = *nrhs; + for (kk = 1; kk <= i__1; ++kk) { + scale[kk] = 1.f; + } + +/* Quick return if possible */ + + if (f2cmin(*n,*nrhs) == 0) { + return; + } + +/* Determine machine dependent constant to control overflow. */ + + bignum = slamch_("Overflow"); + smlnum = slamch_("Safe Minimum"); + +/* Use unblocked code for small problems */ + + if (*nrhs < 2) { + clatrs_(uplo, trans, diag, normin, n, &a[a_offset], lda, &x[x_dim1 + + 1], &scale[1], &cnorm[1], info); + i__1 = *nrhs; + for (k = 2; k <= i__1; ++k) { + clatrs_(uplo, trans, diag, "Y", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return; + } + +/* Compute norms of blocks of A excluding diagonal blocks and find */ +/* the block with the largest norm TMAX. */ + + tmax = 0.f; + i__1 = nba; + for (j = 1; j <= i__1; ++j) { + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + if (upper) { + ifirst = 1; + ilast = j - 1; + } else { + ifirst = j + 1; + ilast = nba; + } + i__2 = ilast; + for (i__ = ifirst; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*n) + 1; + +/* Compute upper bound of A( I1:I2-1, J1:J2-1 ). */ + + if (notran) { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = clange_("I", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + i__ + (j - 1) * nba] = anrm; + } else { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = clange_("1", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + j + (i__ - 1) * nba] = anrm; + } + tmax = f2cmax(tmax,anrm); + } + } + + if (! (tmax <= slamch_("Overflow"))) { + +/* Some matrix entries have huge absolute value. 
At least one upper */ +/* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point */ +/* number, either due to overflow in LANGE or due to Inf in A. */ +/* Fall back to LATRS. Set normin = 'N' for every right-hand side to */ +/* force computation of TSCAL in LATRS to avoid the likely overflow */ +/* in the computation of the column norms CNORM. */ + + i__1 = *nrhs; + for (k = 1; k <= i__1; ++k) { + clatrs_(uplo, trans, diag, "N", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return; + } + +/* Every right-hand side requires workspace to store NBA local scale */ +/* factors. To save workspace, X is computed successively in block columns */ +/* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient */ +/* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. */ + i__1 = nbx; + for (k = 1; k <= i__1; ++k) { +/* Loop over block columns (index = K) of X and, for column-wise scalings, */ +/* over individual columns (index = KK). 
*/ +/* K1: column index of the first column in X( J, K ) */ +/* K2: column index of the first column in X( J, K+1 ) */ +/* so the K2 - K1 is the column count of the block X( J, K ) */ + k1 = (k - 1 << 5) + 1; +/* Computing MIN */ + i__2 = k << 5; + k2 = f2cmin(i__2,*nrhs) + 1; + +/* Initialize local scaling factors of current block column X( J, K ) */ + + i__2 = k2 - k1; + for (kk = 1; kk <= i__2; ++kk) { + i__3 = nba; + for (i__ = 1; i__ <= i__3; ++i__) { + work[i__ + kk * lds] = 1.f; + } + } + + if (notran) { + +/* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = nba; + jlast = 1; + jinc = -1; + } else { + jfirst = 1; + jlast = nba; + jinc = 1; + } + } else { + +/* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ +/* where op(A) = A**T or op(A) = A**H */ + + if (upper) { + jfirst = 1; + jlast = nba; + jinc = 1; + } else { + jfirst = nba; + jlast = 1; + jinc = -1; + } + } + i__2 = jlast; + i__3 = jinc; + for (j = jfirst; i__3 < 0 ? j >= i__2 : j <= i__2; j += i__3) { +/* J1: row index of the first row in A( J, J ) */ +/* J2: row index of the first row in A( J+1, J+1 ) */ +/* so that J2 - J1 is the row count of the block A( J, J ) */ + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) */ + + i__4 = k2 - k1; + for (kk = 1; kk <= i__4; ++kk) { + rhs = k1 + kk - 1; + if (kk == 1) { + i__5 = j2 - j1; + clatrs_(uplo, trans, diag, "N", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } else { + i__5 = j2 - j1; + clatrs_(uplo, trans, diag, "Y", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } +/* Find largest absolute value entry in the vector segment */ +/* X( J1:J2-1, RHS ) as an upper bound for the worst case */ +/* growth in the linear updates. 
*/ + i__5 = j2 - j1; + xnrm[kk - 1] = clange_("I", &i__5, &c__1, &x[j1 + rhs * + x_dim1], ldx, w); + + if (scaloc == 0.f) { +/* LATRS found that A is singular through A(j,j) = 0. */ +/* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 */ +/* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is */ +/* set by LATRS. */ + scale[rhs] = 0.f; + i__5 = j1 - 1; + for (ii = 1; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0.f, x[i__6].i = 0.f; + } + i__5 = *n; + for (ii = j2; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0.f, x[i__6].i = 0.f; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.f; + } + scaloc = 1.f; + } else if (scaloc * work[j + kk * lds] == 0.f) { +/* LATRS computed a valid scale factor, but combined with */ +/* the current scaling the solution does not have a */ +/* scale factor > 0. */ + +/* Set WORK( J+KK*LDS ) to smallest valid scale */ +/* factor and increase SCALOC accordingly. */ + scal = work[j + kk * lds] / smlnum; + scaloc *= scal; + work[j + kk * lds] = smlnum; +/* If LATRS overestimated the growth, x may be */ +/* rescaled to preserve a valid combined scale */ +/* factor WORK( J, KK ) > 0. */ + rscal = 1.f / scaloc; + if (xnrm[kk - 1] * rscal <= bignum) { + xnrm[kk - 1] *= rscal; + i__5 = j2 - j1; + csscal_(&i__5, &rscal, &x[j1 + rhs * x_dim1], &c__1); + scaloc = 1.f; + } else { +/* The system op(A) * x = b is badly scaled and its */ +/* solution cannot be represented as (1/scale) * x. */ +/* Set x to zero. This approach deviates from LATRS */ +/* where a completely meaningless non-zero vector */ +/* is returned that is not a solution to op(A) * x = b. */ + scale[rhs] = 0.f; + i__5 = *n; + for (ii = 1; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0.f, x[i__6].i = 0.f; + } +/* Discard the local scale factors. 
*/ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.f; + } + scaloc = 1.f; + } + } + scaloc *= work[j + kk * lds]; + work[j + kk * lds] = scaloc; + } + +/* Linear block updates */ + + if (notran) { + if (upper) { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } else { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } + } else { + if (upper) { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } else { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } + } + + i__4 = ilast; + i__5 = iinc; + for (i__ = ifirst; i__5 < 0 ? i__ >= i__4 : i__ <= i__4; i__ += + i__5) { +/* I1: row index of the first column in X( I, K ) */ +/* I2: row index of the first column in X( I+1, K ) */ +/* so the I2 - I1 is the row count of the block X( I, K ) */ + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__6 = i__ * nb; + i2 = f2cmin(i__6,*n) + 1; + +/* Prepare the linear update to be executed with GEMM. */ +/* For each column, compute a consistent scaling, a */ +/* scaling factor to survive the linear update, and */ +/* rescale the column segments, if necesssary. Then */ +/* the linear update is safely executed. */ + + i__6 = k2 - k1; + for (kk = 1; kk <= i__6; ++kk) { + rhs = k1 + kk - 1; +/* Compute consistent scaling */ +/* Computing MIN */ + r__1 = work[i__ + kk * lds], r__2 = work[j + kk * lds]; + scamin = f2cmin(r__1,r__2); + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__7 = i2 - i1; + bnrm = clange_("I", &i__7, &c__1, &x[i1 + rhs * x_dim1], + ldx, w); + bnrm *= scamin / work[i__ + kk * lds]; + xnrm[kk - 1] *= scamin / work[j + kk * lds]; + anrm = work[awrk + i__ + (j - 1) * nba]; + scaloc = slarmm_(&anrm, &xnrm[kk - 1], &bnrm); + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to X( I, KK ) and X( J, KK ). 
*/ + + scal = scamin / work[i__ + kk * lds] * scaloc; + if (scal != 1.f) { + i__7 = i2 - i1; + csscal_(&i__7, &scal, &x[i1 + rhs * x_dim1], &c__1); + work[i__ + kk * lds] = scamin * scaloc; + } + + scal = scamin / work[j + kk * lds] * scaloc; + if (scal != 1.f) { + i__7 = j2 - j1; + csscal_(&i__7, &scal, &x[j1 + rhs * x_dim1], &c__1); + work[j + kk * lds] = scamin * scaloc; + } + } + + if (notran) { + +/* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("N", "N", &i__6, &i__7, &i__8, &q__1, &a[i1 + j1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b2, & + x[i1 + k1 * x_dim1], ldx); + } else if (lsame_(trans, "T")) { + +/* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("T", "N", &i__6, &i__7, &i__8, &q__1, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b2, & + x[i1 + k1 * x_dim1], ldx); + } else { + +/* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("C", "N", &i__6, &i__7, &i__8, &q__1, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b2, & + x[i1 + k1 * x_dim1], ldx); + } + } + } + +/* Reduce local scaling factors */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { +/* Computing MIN */ + r__1 = scale[rhs], r__2 = work[i__ + kk * lds]; + scale[rhs] = f2cmin(r__1,r__2); + } + } + +/* Realize consistent scaling */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + if (scale[rhs] != 1.f && scale[rhs] != 0.f) { + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__5 = i__ * nb; + i2 = f2cmin(i__5,*n) + 1; + scal = scale[rhs] / work[i__ + kk * lds]; + if (scal != 1.f) { + i__5 = i2 - i1; + 
csscal_(&i__5, &scal, &x[i1 + rhs * x_dim1], &c__1); + } + } + } + } + } + return; + +/* End of CLATRS3 */ + +} /* clatrs3_ */ + diff --git a/lapack-netlib/SRC/clatrs3.f b/lapack-netlib/SRC/clatrs3.f new file mode 100644 index 0000000000..a902f1ed01 --- /dev/null +++ b/lapack-netlib/SRC/clatrs3.f @@ -0,0 +1,666 @@ +*> \brief \b CLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. +* +* Definition: +* =========== +* +* SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* REAL CNORM( * ), SCALE( * ), WORK( * ) +* COMPLEX A( LDA, * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale), A**T * X = B * diag(scale), or +*> A**H * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A, A**H denotes the +*> conjugate transpose of A. X and B are n-by-nrhs matrices and scale +*> is an nrhs-element vector of scaling factors. A scaling factor scale(j) +*> is usually less than or equal to 1, chosen such that X(:,j) is less +*> than the overflow threshold. If the matrix A is singular (A(j,j) = 0 +*> for some j), then a non-trivial solution to A*X = 0 is returned. If +*> the system is so badly scaled that the solution cannot be represented +*> as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. 
+*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is COMPLEX array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. 
+*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is REAL array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. 
+*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. 
+* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( LDX, * ) + REAL CNORM( * ), SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + COMPLEX CZERO, CONE + PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + REAL W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, CLANGE, SLARMM + EXTERNAL ILAENV, LSAME, SLAMCH, CLANGE, SLARMM +* .. +* .. External Subroutines .. + EXTERNAL CLATRS, CSSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. 
+* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks. +* + NB = MAX( NBMIN, ILAENV( 1, 'CLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I + KK * LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters. +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. 
LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. +* + BIGNUM = SLAMCH( 'Overflow' ) + SMLNUM = SLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL CLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1 ), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL CLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. +* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = CLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = CLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1)*NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.SLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. 
+* + DO K = 1, NRHS + CALL CLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). +* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2-K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* where op(A) = A**T or op(A) = A**H +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF + + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL CLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL CLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry 
in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = CLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is +* set by LATRS. + SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = CZERO + END DO + DO II = J2, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL CSSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. 
+ DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = CLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*( SCAMIN / WORK( J+KK*LDS) ) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = SLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to X( I, KK ) and X( J, KK ). 
+* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL CSSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL CSSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL CGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE IF( LSAME( TRANS, 'T' ) ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) +* + CALL CGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) +* + CALL CGEMM( 'C', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO +* +* Reduce local scaling factors +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. 
ZERO ) THEN + DO I = 1, NBA + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL CSSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of CLATRS3 +* + END diff --git a/lapack-netlib/SRC/clatrz.c b/lapack-netlib/SRC/clatrz.c index df017628d4..2447fbac1a 100644 --- a/lapack-netlib/SRC/clatrz.c +++ b/lapack-netlib/SRC/clatrz.c @@ -649,7 +649,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clatrz_(integer *m, integer *n, integer *l, complex *a, +/* Subroutine */ void clatrz_(integer *m, integer *n, integer *l, complex *a, integer *lda, complex *tau, complex *work) { /* System generated locals */ @@ -659,7 +659,7 @@ f"> */ /* Local variables */ integer i__; complex alpha; - extern /* Subroutine */ int clarz_(char *, integer *, integer *, integer * + extern /* Subroutine */ void clarz_(char *, integer *, integer *, integer * , complex *, integer *, complex *, complex *, integer *, complex * ), clarfg_(integer *, complex *, complex *, integer *, complex *), clacgv_(integer *, complex *, integer *); @@ -685,7 +685,7 @@ f"> */ /* Function Body */ if (*m == 0) { - return 0; + return; } else if (*m == *n) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { @@ -693,7 +693,7 @@ f"> */ tau[i__2].r = 0.f, tau[i__2].i = 0.f; /* L10: */ } - return 0; + return; } for (i__ = *m; i__ >= 1; --i__) { @@ -725,7 +725,7 @@ f"> */ /* L20: */ } - return 0; + return; /* End of CLATRZ */ diff --git a/lapack-netlib/SRC/clatsqr.c b/lapack-netlib/SRC/clatsqr.c index 08235f02e6..d88e0da3a2 100644 --- a/lapack-netlib/SRC/clatsqr.c +++ b/lapack-netlib/SRC/clatsqr.c @@ -673,7 +673,7 @@ static integer c__0 = 0; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clatsqr_(integer *m, integer *n, integer *mb, integer * +/* Subroutine */ void 
clatsqr_(integer *m, integer *n, integer *mb, integer * nb, complex *a, integer *lda, complex *t, integer *ldt, complex *work, integer *lwork, integer *info) { @@ -682,7 +682,8 @@ static integer c__0 = 0; /* Local variables */ integer i__, ii, kk; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), cgeqrt_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void cgeqrt_( integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *), ctpqrt_(integer *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, @@ -738,15 +739,15 @@ static integer c__0 = 0; if (*info != 0) { i__1 = -(*info); xerbla_("CLATSQR", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* The QR Decomposition */ @@ -754,7 +755,7 @@ static integer c__0 = 0; if (*mb <= *n || *mb >= *m) { cgeqrt_(m, n, nb, &a[a_offset], lda, &t[t_offset], ldt, &work[1], info); - return 0; + return; } kk = (*m - *n) % (*mb - *n); ii = *m - kk + 1; @@ -786,7 +787,7 @@ static integer c__0 = 0; i__2 = *n * *nb; work[1].r = (real) i__2, work[1].i = 0.f; - return 0; + return; /* End of CLATSQR */ diff --git a/lapack-netlib/SRC/claunhr_col_getrfnp.c b/lapack-netlib/SRC/claunhr_col_getrfnp.c index d6cb51abec..70f8c44dc3 100644 --- a/lapack-netlib/SRC/claunhr_col_getrfnp.c +++ b/lapack-netlib/SRC/claunhr_col_getrfnp.c @@ -661,7 +661,7 @@ _col_getrfnp.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int claunhr_col_getrfnp_(integer *m, integer *n, complex *a, +/* Subroutine */ void claunhr_col_getrfnp_(integer *m, integer *n, complex *a, integer *lda, complex *d__, integer *info) { /* System generated locals */ @@ -669,14 +669,14 @@ _col_getrfnp.f"> */ complex q__1; /* Local variables */ - extern /* Subroutine */ int 
claunhr_col_getrfnp2_(integer *, integer *, + extern /* Subroutine */ void claunhr_col_getrfnp2_(integer *, integer *, complex *, integer *, complex *, integer *); integer j; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); integer iinfo; - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); integer jb, nb; @@ -714,13 +714,13 @@ _col_getrfnp.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CLAUNHR_COL_GETRFNP", &i__1, (ftnlen)19); - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* Determine the block size for this environment. */ @@ -772,7 +772,7 @@ _col_getrfnp.f"> */ } } } - return 0; + return; /* End of CLAUNHR_COL_GETRFNP */ diff --git a/lapack-netlib/SRC/claunhr_col_getrfnp2.c b/lapack-netlib/SRC/claunhr_col_getrfnp2.c index e79b4c8e3a..65fdc0027d 100644 --- a/lapack-netlib/SRC/claunhr_col_getrfnp2.c +++ b/lapack-netlib/SRC/claunhr_col_getrfnp2.c @@ -682,7 +682,7 @@ _col_getrfnp2.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int claunhr_col_getrfnp2_(integer *m, integer *n, complex * +/* Subroutine */ void claunhr_col_getrfnp2_(integer *m, integer *n, complex * a, integer *lda, complex *d__, integer *info) { /* System generated locals */ @@ -692,13 +692,13 @@ _col_getrfnp2.f"> */ /* Local variables */ integer i__; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), cgemm_(char *, char *, integer *, integer *, integer * , complex *, complex *, integer *, complex *, 
integer *, complex * , complex *, integer *); integer iinfo; real sfmin; - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); integer n1, n2; @@ -735,13 +735,13 @@ _col_getrfnp2.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CLAUNHR_COL_GETRFNP2", &i__1, (ftnlen)20); - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } if (*m == 1) { @@ -844,7 +844,7 @@ _col_getrfnp2.f"> */ lda, &d__[n1 + 1], &iinfo); } - return 0; + return; /* End of CLAUNHR_COL_GETRFNP2 */ diff --git a/lapack-netlib/SRC/clauu2.c b/lapack-netlib/SRC/clauu2.c index f79cdb1f06..2be32a3c06 100644 --- a/lapack-netlib/SRC/clauu2.c +++ b/lapack-netlib/SRC/clauu2.c @@ -617,7 +617,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int clauu2_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void clauu2_(char *uplo, integer *n, complex *a, integer *lda, integer *info) { /* System generated locals */ @@ -630,13 +630,13 @@ f"> */ extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *); logical upper; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *), - csscal_(integer *, real *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *), + csscal_(integer *, real *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real aii; @@ -669,13 +669,13 @@ f"> */ 
if (*info != 0) { i__1 = -(*info); xerbla_("CLAUU2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -741,7 +741,7 @@ f"> */ } } - return 0; + return; /* End of CLAUU2 */ diff --git a/lapack-netlib/SRC/clauum.c b/lapack-netlib/SRC/clauum.c index d2a0b0419e..45a1de4a93 100644 --- a/lapack-netlib/SRC/clauum.c +++ b/lapack-netlib/SRC/clauum.c @@ -619,7 +619,7 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int clauum_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void clauum_(char *uplo, integer *n, complex *a, integer *lda, integer *info) { /* System generated locals */ @@ -627,17 +627,17 @@ f"> */ /* Local variables */ integer i__; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *), cherk_(char *, char *, integer *, integer *, real *, complex *, integer *, real * , complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrmm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); logical upper; - extern /* Subroutine */ int clauu2_(char *, integer *, complex *, integer + extern /* Subroutine */ void clauu2_(char *, integer *, complex *, integer *, integer *); integer ib, nb; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -674,13 +674,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CLAUUM", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine the block size for this environment. 
*/ @@ -758,7 +758,7 @@ f"> */ } } - return 0; + return; /* End of CLAUUM */ diff --git a/lapack-netlib/SRC/cpbcon.c b/lapack-netlib/SRC/cpbcon.c index 05da914200..e46ce643af 100644 --- a/lapack-netlib/SRC/cpbcon.c +++ b/lapack-netlib/SRC/cpbcon.c @@ -645,7 +645,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cpbcon_(char *uplo, integer *n, integer *kd, complex *ab, +/* Subroutine */ void cpbcon_(char *uplo, integer *n, integer *kd, complex *ab, integer *ldab, real *anorm, real *rcond, complex *work, real *rwork, integer *info) { @@ -659,19 +659,19 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); integer ix; extern integer icamax_(integer *, complex *, integer *); real scalel; extern real slamch_(char *); - extern /* Subroutine */ int clatbs_(char *, char *, char *, char *, + extern /* Subroutine */ void clatbs_(char *, char *, char *, char *, integer *, integer *, complex *, integer *, complex *, real *, real *, integer *); real scaleu; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int csrscl_(integer *, real *, complex *, integer + extern /* Subroutine */ void csrscl_(integer *, real *, complex *, integer *); char normin[1]; real smlnum; @@ -712,7 +712,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPBCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -720,9 +720,9 @@ f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm == 0.f) { - return 0; + return; } smlnum = slamch_("Safe minimum"); @@ -783,7 +783,7 @@ f"> */ L20: - return 0; + return; /* End of CPBCON */ diff --git a/lapack-netlib/SRC/cpbequ.c 
b/lapack-netlib/SRC/cpbequ.c index b46fcaa066..0d69760da2 100644 --- a/lapack-netlib/SRC/cpbequ.c +++ b/lapack-netlib/SRC/cpbequ.c @@ -639,7 +639,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cpbequ_(char *uplo, integer *n, integer *kd, complex *ab, +/* Subroutine */ void cpbequ_(char *uplo, integer *n, integer *kd, complex *ab, integer *ldab, real *s, real *scond, real *amax, integer *info) { /* System generated locals */ @@ -686,7 +686,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPBEQU", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -694,7 +694,7 @@ f"> */ if (*n == 0) { *scond = 1.f; *amax = 0.f; - return 0; + return; } if (upper) { @@ -733,7 +733,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { if (s[i__] <= 0.f) { *info = i__; - return 0; + return; } /* L20: */ } @@ -752,7 +752,7 @@ f"> */ *scond = sqrt(smin) / sqrt(*amax); } - return 0; + return; /* End of CPBEQU */ diff --git a/lapack-netlib/SRC/cpbrfs.c b/lapack-netlib/SRC/cpbrfs.c index cf61e5dc95..fccc1f3d4d 100644 --- a/lapack-netlib/SRC/cpbrfs.c +++ b/lapack-netlib/SRC/cpbrfs.c @@ -702,7 +702,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cpbrfs_(char *uplo, integer *n, integer *kd, integer * +/* Subroutine */ void cpbrfs_(char *uplo, integer *n, integer *kd, integer * nrhs, complex *ab, integer *ldab, complex *afb, integer *ldafb, complex *b, integer *ldb, complex *x, integer *ldx, real *ferr, real * berr, complex *work, real *rwork, integer *info) @@ -718,23 +718,24 @@ f"> */ real safe1, safe2; integer i__, j, k, l; real s; - extern /* Subroutine */ int chbmv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void chbmv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer 
*); extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); integer count; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); real xk; extern real slamch_(char *); integer nz; real safmin; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), cpbtrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void cpbtrs_( char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); real lstres, eps; @@ -792,7 +793,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPBRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -804,7 +805,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -1048,7 +1049,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of CPBRFS */ diff --git a/lapack-netlib/SRC/cpbstf.c b/lapack-netlib/SRC/cpbstf.c index 0fedc15380..f0a88e4b4b 100644 --- a/lapack-netlib/SRC/cpbstf.c +++ b/lapack-netlib/SRC/cpbstf.c @@ -667,7 +667,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cpbstf_(char *uplo, integer *n, integer *kd, complex *ab, +/* Subroutine */ void cpbstf_(char *uplo, integer *n, integer *kd, complex *ab, integer *ldab, integer *info) { /* System generated locals */ @@ -675,15 +675,15 @@ f"> */ real r__1; /* Local variables */ - extern /* Subroutine */ int cher_(char *, integer *, real *, complex *, + extern /* Subroutine */ void cher_(char *, integer *, real *, complex *, integer *, complex *, 
integer *); integer j, m; extern logical lsame_(char *, char *); logical upper; integer km; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *), - csscal_(integer *, real *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *), + csscal_(integer *, real *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real ajj; integer kld; @@ -719,13 +719,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPBSTF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Computing MAX */ @@ -871,11 +871,11 @@ f"> */ /* L40: */ } } - return 0; + return; L50: *info = j; - return 0; + return; /* End of CPBSTF */ diff --git a/lapack-netlib/SRC/cpbsv.c b/lapack-netlib/SRC/cpbsv.c index 9b1a3cd0d5..83b0f63b18 100644 --- a/lapack-netlib/SRC/cpbsv.c +++ b/lapack-netlib/SRC/cpbsv.c @@ -673,7 +673,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cpbsv_(char *uplo, integer *n, integer *kd, integer * +/* Subroutine */ void cpbsv_(char *uplo, integer *n, integer *kd, integer * nrhs, complex *ab, integer *ldab, complex *b, integer *ldb, integer * info) { @@ -682,7 +682,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), cpbtrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void cpbtrf_( char *, integer *, integer *, complex *, integer *, integer *), cpbtrs_(char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); @@ -724,7 +725,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info 
!= 0) { i__1 = -(*info); xerbla_("CPBSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the Cholesky factorization A = U**H*U or A = L*L**H. */ @@ -738,7 +739,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ info); } - return 0; + return; /* End of CPBSV */ diff --git a/lapack-netlib/SRC/cpbsvx.c b/lapack-netlib/SRC/cpbsvx.c index 70337aaa62..49925c13ce 100644 --- a/lapack-netlib/SRC/cpbsvx.c +++ b/lapack-netlib/SRC/cpbsvx.c @@ -853,7 +853,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cpbsvx_(char *fact, char *uplo, integer *n, integer *kd, +/* Subroutine */ void cpbsvx_(char *fact, char *uplo, integer *n, integer *kd, integer *nrhs, complex *ab, integer *ldab, complex *afb, integer * ldafb, char *equed, real *s, complex *b, integer *ldb, complex *x, integer *ldx, real *rcond, real *ferr, real *berr, complex *work, @@ -870,30 +870,31 @@ f"> */ integer i__, j; extern logical lsame_(char *, char *); real scond, anorm; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); logical equil, rcequ, upper; integer j1, j2; extern real clanhb_(char *, char *, integer *, integer *, complex *, integer *, real *); - extern /* Subroutine */ int claqhb_(char *, integer *, integer *, complex + extern /* Subroutine */ void claqhb_(char *, integer *, integer *, complex *, integer *, real *, real *, real *, char *), cpbcon_(char *, integer *, integer *, complex *, integer *, real * , real *, complex *, real *, integer *); extern real slamch_(char *); logical nofact; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex - *, integer *, complex *, integer *), xerbla_(char *, - integer *, ftnlen), cpbequ_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex + 
*, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void cpbequ_(char *, integer *, integer *, complex *, integer *, real *, real *, real *, integer *), cpbrfs_( char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, real *, real *, complex *, real *, integer *); real bignum; - extern /* Subroutine */ int cpbtrf_(char *, integer *, integer *, complex + extern /* Subroutine */ void cpbtrf_(char *, integer *, integer *, complex *, integer *, integer *); integer infequ; - extern /* Subroutine */ int cpbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void cpbtrs_(char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); real smlnum; @@ -993,7 +994,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPBSVX", &i__1, (ftnlen)6); - return 0; + return; } if (equil) { @@ -1064,7 +1065,7 @@ f"> */ if (*info > 0) { *rcond = 0.f; - return 0; + return; } } @@ -1120,7 +1121,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of CPBSVX */ diff --git a/lapack-netlib/SRC/cpbtf2.c b/lapack-netlib/SRC/cpbtf2.c index b14bbb51ee..de89747f9b 100644 --- a/lapack-netlib/SRC/cpbtf2.c +++ b/lapack-netlib/SRC/cpbtf2.c @@ -657,7 +657,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cpbtf2_(char *uplo, integer *n, integer *kd, complex *ab, +/* Subroutine */ void cpbtf2_(char *uplo, integer *n, integer *kd, complex *ab, integer *ldab, integer *info) { /* System generated locals */ @@ -665,15 +665,15 @@ f"> */ real r__1; /* Local variables */ - extern /* Subroutine */ int cher_(char *, integer *, real *, complex *, + extern /* Subroutine */ void cher_(char *, integer *, real *, complex *, integer *, complex *, integer *); integer j; extern logical lsame_(char *, char *); logical upper; integer kn; - extern /* Subroutine */ 
int clacgv_(integer *, complex *, integer *), - csscal_(integer *, real *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *), + csscal_(integer *, real *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real ajj; integer kld; @@ -709,13 +709,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPBTF2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Computing MAX */ @@ -793,11 +793,11 @@ f"> */ /* L20: */ } } - return 0; + return; L30: *info = j; - return 0; + return; /* End of CPBTF2 */ diff --git a/lapack-netlib/SRC/cpbtrf.c b/lapack-netlib/SRC/cpbtrf.c index f44cb5c3fa..b239f20b41 100644 --- a/lapack-netlib/SRC/cpbtrf.c +++ b/lapack-netlib/SRC/cpbtrf.c @@ -660,7 +660,7 @@ f"> */ /* > Peter Mayes and Giuseppe Radicati, IBM ECSEC, Rome, March 23, 1989 */ /* ===================================================================== */ -/* Subroutine */ int cpbtrf_(char *uplo, integer *n, integer *kd, complex *ab, +/* Subroutine */ void cpbtrf_(char *uplo, integer *n, integer *kd, complex *ab, integer *ldab, integer *info) { /* System generated locals */ @@ -670,18 +670,19 @@ f"> */ /* Local variables */ complex work[1056] /* was [33][32] */; integer i__, j; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *), cherk_(char *, char *, integer *, integer *, real *, complex *, integer *, real * , complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); integer i2, i3; - extern /* 
Subroutine */ int cpbtf2_(char *, integer *, integer *, complex - *, integer *, integer *), cpotf2_(char *, integer *, + extern /* Subroutine */ void cpbtf2_(char *, integer *, integer *, complex + *, integer *, integer *); + extern int cpotf2_(char *, integer *, complex *, integer *, integer *); integer ib, nb, ii, jj; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -719,13 +720,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPBTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine the block size for this environment */ @@ -1034,10 +1035,10 @@ f"> */ } } } - return 0; + return; L150: - return 0; + return; /* End of CPBTRF */ diff --git a/lapack-netlib/SRC/cpbtrs.c b/lapack-netlib/SRC/cpbtrs.c index c79f6aa5c9..14b1bae6f0 100644 --- a/lapack-netlib/SRC/cpbtrs.c +++ b/lapack-netlib/SRC/cpbtrs.c @@ -634,7 +634,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cpbtrs_(char *uplo, integer *n, integer *kd, integer * +/* Subroutine */ void cpbtrs_(char *uplo, integer *n, integer *kd, integer * nrhs, complex *ab, integer *ldab, complex *b, integer *ldb, integer * info) { @@ -644,7 +644,7 @@ f"> */ /* Local variables */ integer j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctbsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctbsv_(char *, char *, char *, integer *, integer *, complex *, integer *, complex *, integer *); logical upper; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -688,13 +688,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPBTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -735,7 +735,7 @@ f"> */ } } - return 0; + return; /* End of CPBTRS */ diff --git 
a/lapack-netlib/SRC/cpftrf.c b/lapack-netlib/SRC/cpftrf.c index e53b4f5f67..67bdc365e0 100644 --- a/lapack-netlib/SRC/cpftrf.c +++ b/lapack-netlib/SRC/cpftrf.c @@ -727,7 +727,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cpftrf_(char *transr, char *uplo, integer *n, complex *a, +/* Subroutine */ void cpftrf_(char *transr, char *uplo, integer *n, complex *a, integer *info) { /* System generated locals */ @@ -736,11 +736,11 @@ f"> */ /* Local variables */ integer k; logical normaltransr; - extern /* Subroutine */ int cherk_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cherk_(char *, char *, integer *, integer *, real *, complex *, integer *, real *, complex *, integer *); extern logical lsame_(char *, char *); logical lower; - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); integer n1, n2; @@ -774,13 +774,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPFTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* If N is odd, set NISODD = .TRUE. 
*/ @@ -821,7 +821,7 @@ f"> */ cpotrf_("L", &n1, a, n, info); if (*info > 0) { - return 0; + return; } ctrsm_("R", "L", "C", "N", &n2, &n1, &c_b1, a, n, &a[n1], n); cherk_("U", "N", &n2, &n1, &c_b15, &a[n1], n, &c_b16, &a[*n], @@ -839,7 +839,7 @@ f"> */ cpotrf_("L", &n1, &a[n2], n, info); if (*info > 0) { - return 0; + return; } ctrsm_("L", "L", "N", "N", &n1, &n2, &c_b1, &a[n2], n, a, n); cherk_("U", "C", &n2, &n1, &c_b15, a, n, &c_b16, &a[n1], n); @@ -862,7 +862,7 @@ f"> */ cpotrf_("U", &n1, a, &n1, info); if (*info > 0) { - return 0; + return; } ctrsm_("L", "U", "C", "N", &n1, &n2, &c_b1, a, &n1, &a[n1 * n1], &n1); @@ -881,7 +881,7 @@ f"> */ cpotrf_("U", &n1, &a[n2 * n2], &n2, info); if (*info > 0) { - return 0; + return; } ctrsm_("R", "U", "N", "N", &n2, &n1, &c_b1, &a[n2 * n2], &n2, a, &n2); @@ -913,7 +913,7 @@ f"> */ i__1 = *n + 1; cpotrf_("L", &k, &a[1], &i__1, info); if (*info > 0) { - return 0; + return; } i__1 = *n + 1; i__2 = *n + 1; @@ -938,7 +938,7 @@ f"> */ i__1 = *n + 1; cpotrf_("L", &k, &a[k + 1], &i__1, info); if (*info > 0) { - return 0; + return; } i__1 = *n + 1; i__2 = *n + 1; @@ -968,7 +968,7 @@ f"> */ cpotrf_("U", &k, &a[k], &k, info); if (*info > 0) { - return 0; + return; } ctrsm_("L", "U", "C", "N", &k, &k, &c_b1, &a[k], &n1, &a[k * ( k + 1)], &k); @@ -987,7 +987,7 @@ f"> */ cpotrf_("U", &k, &a[k * (k + 1)], &k, info); if (*info > 0) { - return 0; + return; } ctrsm_("R", "U", "N", "N", &k, &k, &c_b1, &a[k * (k + 1)], &k, a, &k); @@ -1003,7 +1003,7 @@ f"> */ } - return 0; + return; /* End of CPFTRF */ diff --git a/lapack-netlib/SRC/cpftri.c b/lapack-netlib/SRC/cpftri.c index 0eb513032f..9005b128af 100644 --- a/lapack-netlib/SRC/cpftri.c +++ b/lapack-netlib/SRC/cpftri.c @@ -727,7 +727,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cpftri_(char *transr, char *uplo, integer *n, complex *a, +/* Subroutine */ void cpftri_(char *transr, char *uplo, integer 
*n, complex *a, integer *info) { /* System generated locals */ @@ -736,10 +736,10 @@ f"> */ /* Local variables */ integer k; logical normaltransr; - extern /* Subroutine */ int cherk_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cherk_(char *, char *, integer *, integer *, real *, complex *, integer *, real *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrmm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); logical lower; @@ -747,7 +747,8 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical nisodd; extern /* Subroutine */ int clauum_(char *, integer *, complex *, integer - *, integer *), ctftri_(char *, char *, char *, integer *, + *, integer *); + extern void ctftri_(char *, char *, char *, integer *, complex *, integer *); @@ -775,20 +776,20 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPFTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Invert the triangular Cholesky factor U or L. */ ctftri_(transr, uplo, "N", n, a, info); if (*info > 0) { - return 0; + return; } /* If N is odd, set NISODD = .TRUE. 
*/ @@ -963,7 +964,7 @@ f"> */ } - return 0; + return; /* End of CPFTRI */ diff --git a/lapack-netlib/SRC/cpftrs.c b/lapack-netlib/SRC/cpftrs.c index c6f404bc7a..2feb7e53bb 100644 --- a/lapack-netlib/SRC/cpftrs.c +++ b/lapack-netlib/SRC/cpftrs.c @@ -733,7 +733,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cpftrs_(char *transr, char *uplo, integer *n, integer * +/* Subroutine */ void cpftrs_(char *transr, char *uplo, integer *n, integer * nrhs, complex *a, complex *b, integer *ldb, integer *info) { /* System generated locals */ @@ -742,7 +742,7 @@ f"> */ /* Local variables */ logical normaltransr; extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctfsm_(char *, char *, char *, char *, char *, + extern /* Subroutine */ void ctfsm_(char *, char *, char *, char *, char *, integer *, integer *, complex *, complex *, complex *, integer *); logical lower; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -782,13 +782,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPFTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* start execution: there are two triangular solves */ @@ -805,7 +805,7 @@ f"> */ ldb); } - return 0; + return; /* End of CPFTRS */ diff --git a/lapack-netlib/SRC/cpocon.c b/lapack-netlib/SRC/cpocon.c index d4071768f5..6665d00377 100644 --- a/lapack-netlib/SRC/cpocon.c +++ b/lapack-netlib/SRC/cpocon.c @@ -633,7 +633,7 @@ f"> */ /* > \ingroup complexPOcomputational */ /* ===================================================================== */ -/* Subroutine */ int cpocon_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void cpocon_(char *uplo, integer *n, complex *a, integer *lda, real *anorm, real *rcond, complex *work, real *rwork, integer *info) { /* System generated locals */ @@ -646,7 +646,7 @@ f"> */ extern logical 
lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); integer ix; extern integer icamax_(integer *, complex *, integer *); @@ -655,7 +655,7 @@ f"> */ real scaleu; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int clatrs_(char *, char *, char *, char *, + extern /* Subroutine */ void clatrs_(char *, char *, char *, char *, integer *, complex *, integer *, complex *, real *, real *, integer *), csrscl_(integer *, real *, complex *, integer *); @@ -696,7 +696,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPOCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -704,9 +704,9 @@ f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm == 0.f) { - return 0; + return; } smlnum = slamch_("Safe minimum"); @@ -766,7 +766,7 @@ f"> */ } L20: - return 0; + return; /* End of CPOCON */ diff --git a/lapack-netlib/SRC/cpoequ.c b/lapack-netlib/SRC/cpoequ.c index f09ef845b6..f7f9e87cfe 100644 --- a/lapack-netlib/SRC/cpoequ.c +++ b/lapack-netlib/SRC/cpoequ.c @@ -622,7 +622,7 @@ f"> */ /* > \ingroup complexPOcomputational */ /* ===================================================================== */ -/* Subroutine */ int cpoequ_(integer *n, complex *a, integer *lda, real *s, +/* Subroutine */ void cpoequ_(integer *n, complex *a, integer *lda, real *s, real *scond, real *amax, integer *info) { /* System generated locals */ @@ -662,7 +662,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPOEQU", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -670,7 +670,7 @@ f"> */ if (*n == 0) { *scond = 1.f; *amax = 0.f; - return 0; + return; } /* Find the minimum and maximum diagonal elements. 
*/ @@ -700,7 +700,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { if (s[i__] <= 0.f) { *info = i__; - return 0; + return; } /* L20: */ } @@ -719,7 +719,7 @@ f"> */ *scond = sqrt(smin) / sqrt(*amax); } - return 0; + return; /* End of CPOEQU */ diff --git a/lapack-netlib/SRC/cpoequb.c b/lapack-netlib/SRC/cpoequb.c index 0a9eb9648d..b7c246116d 100644 --- a/lapack-netlib/SRC/cpoequb.c +++ b/lapack-netlib/SRC/cpoequb.c @@ -628,7 +628,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexPOcomputational */ /* ===================================================================== */ -/* Subroutine */ int cpoequb_(integer *n, complex *a, integer *lda, real *s, +/* Subroutine */ void cpoequb_(integer *n, complex *a, integer *lda, real *s, real *scond, real *amax, integer *info) { /* System generated locals */ @@ -672,7 +672,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CPOEQUB", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible. 
*/ @@ -680,7 +680,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*n == 0) { *scond = 1.f; *amax = 0.f; - return 0; + return; } base = slamch_("B"); tmp = -.5f / log(base); @@ -713,7 +713,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (i__ = 1; i__ <= i__1; ++i__) { if (s[i__] <= 0.f) { *info = i__; - return 0; + return; } /* L20: */ } @@ -734,7 +734,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *scond = sqrt(smin) / sqrt(*amax); } - return 0; + return; /* End of CPOEQUB */ diff --git a/lapack-netlib/SRC/cporfs.c b/lapack-netlib/SRC/cporfs.c index 4d7f78bf0c..5c3237ec99 100644 --- a/lapack-netlib/SRC/cporfs.c +++ b/lapack-netlib/SRC/cporfs.c @@ -696,7 +696,7 @@ f"> */ /* > \ingroup complexPOcomputational */ /* ===================================================================== */ -/* Subroutine */ int cporfs_(char *uplo, integer *n, integer *nrhs, complex * +/* Subroutine */ void cporfs_(char *uplo, integer *n, integer *nrhs, complex * a, integer *lda, complex *af, integer *ldaf, complex *b, integer *ldb, complex *x, integer *ldx, real *ferr, real *berr, complex *work, real *rwork, integer *info) @@ -713,22 +713,23 @@ f"> */ integer i__, j, k; real s; extern logical lsame_(char *, char *); - extern /* Subroutine */ int chemv_(char *, integer *, complex *, complex * + extern /* Subroutine */ void chemv_(char *, integer *, complex *, complex * , integer *, complex *, integer *, complex *, complex *, integer * ); integer isave[3]; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); integer count; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real 
*, integer *, integer *); real xk; extern real slamch_(char *); integer nz; real safmin; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), cpotrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void cpotrs_( char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); real lstres, eps; @@ -784,7 +785,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPORFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -796,7 +797,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -1029,7 +1030,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of CPORFS */ diff --git a/lapack-netlib/SRC/cposv.c b/lapack-netlib/SRC/cposv.c index 93e6759fe3..2a24682ac1 100644 --- a/lapack-netlib/SRC/cposv.c +++ b/lapack-netlib/SRC/cposv.c @@ -639,7 +639,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexPOsolve */ /* ===================================================================== */ -/* Subroutine */ int cposv_(char *uplo, integer *n, integer *nrhs, complex *a, +/* Subroutine */ void cposv_(char *uplo, integer *n, integer *nrhs, complex *a, integer *lda, complex *b, integer *ldb, integer *info) { /* System generated locals */ @@ -647,9 +647,10 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), cpotrf_( - char *, integer *, complex *, integer *, integer *), - cpotrs_(char *, integer *, integer *, complex *, integer *, + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int cpotrf_( + char *, integer *, complex *, integer *, integer *); + extern void cpotrs_(char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); @@ -688,7 +689,7 @@ 
static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CPOSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the Cholesky factorization A = U**H*U or A = L*L**H. */ @@ -701,7 +702,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ cpotrs_(uplo, n, nrhs, &a[a_offset], lda, &b[b_offset], ldb, info); } - return 0; + return; /* End of CPOSV */ diff --git a/lapack-netlib/SRC/cposvx.c b/lapack-netlib/SRC/cposvx.c index 6e9cf48a81..10161efaf0 100644 --- a/lapack-netlib/SRC/cposvx.c +++ b/lapack-netlib/SRC/cposvx.c @@ -813,7 +813,7 @@ f"> */ /* > \ingroup complexPOsolve */ /* ===================================================================== */ -/* Subroutine */ int cposvx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void cposvx_(char *fact, char *uplo, integer *n, integer * nrhs, complex *a, integer *lda, complex *af, integer *ldaf, char * equed, real *s, complex *b, integer *ldb, complex *x, integer *ldx, real *rcond, real *ferr, real *berr, complex *work, real *rwork, @@ -833,23 +833,25 @@ f"> */ logical equil, rcequ; extern real clanhe_(char *, char *, integer *, complex *, integer *, real *); - extern /* Subroutine */ int claqhe_(char *, integer *, complex *, integer + extern /* Subroutine */ void claqhe_(char *, integer *, complex *, integer *, real *, real *, real *, char *); extern real slamch_(char *); logical nofact; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex - *, integer *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex + *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real bignum; - extern /* Subroutine */ int cpocon_(char *, integer *, complex *, integer + extern /* Subroutine */ void cpocon_(char *, integer *, complex *, integer *, real *, real *, complex *, real *, 
integer *); integer infequ; - extern /* Subroutine */ int cpoequ_(integer *, complex *, integer *, real + extern /* Subroutine */ void cpoequ_(integer *, complex *, integer *, real *, real *, real *, integer *), cporfs_(char *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, real *, real *, complex *, real *, - integer *), cpotrf_(char *, integer *, complex *, integer - *, integer *), cpotrs_(char *, integer *, integer *, + integer *); + extern int cpotrf_(char *, integer *, complex *, integer + *, integer *); + extern void cpotrs_(char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); real smlnum; @@ -947,7 +949,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPOSVX", &i__1, (ftnlen)6); - return 0; + return; } if (equil) { @@ -993,7 +995,7 @@ f"> */ if (*info > 0) { *rcond = 0.f; - return 0; + return; } } @@ -1048,7 +1050,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of CPOSVX */ diff --git a/lapack-netlib/SRC/cposvxx.c b/lapack-netlib/SRC/cposvxx.c index f4614361b3..d36d9b4a35 100644 --- a/lapack-netlib/SRC/cposvxx.c +++ b/lapack-netlib/SRC/cposvxx.c @@ -1002,7 +1002,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexPOsolve */ /* ===================================================================== */ -/* Subroutine */ int cposvxx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void cposvxx_(char *fact, char *uplo, integer *n, integer * nrhs, complex *a, integer *lda, complex *af, integer *ldaf, char * equed, real *s, complex *b, integer *ldb, complex *x, integer *ldx, real *rcond, real *rpvgrw, real *berr, integer *n_err_bnds__, real * @@ -1023,20 +1023,20 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern logical lsame_(char *, char *); real scond; logical equil, rcequ; - extern /* Subroutine */ int claqhe_(char *, integer *, complex *, 
integer + extern /* Subroutine */ void claqhe_(char *, integer *, complex *, integer *, real *, real *, real *, char *); extern real slamch_(char *); logical nofact; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex - *, integer *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex + *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real bignum; integer infequ; - extern /* Subroutine */ int cpotrf_(char *, integer *, complex *, integer + extern /* Subroutine */ void cpotrf_(char *, integer *, complex *, integer *, integer *), cpotrs_(char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); real smlnum; - extern /* Subroutine */ int clascl2_(integer *, integer *, real *, + extern /* Subroutine */ void clascl2_(integer *, integer *, real *, complex *, integer *), cpoequb_(integer *, complex *, integer *, real *, real *, real *, integer *), cporfsx_(char *, char *, integer *, integer *, complex *, integer *, complex *, integer *, @@ -1150,7 +1150,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CPOSVXX", &i__1, (ftnlen)7); - return 0; + return; } if (equil) { @@ -1190,7 +1190,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rpvgrw = cla_porpvgrw_(uplo, n, &a[a_offset], lda, &af[ af_offset], ldaf, &rwork[1]); - return 0; + return; } } @@ -1219,7 +1219,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ clascl2_(n, nrhs, &s[1], &x[x_offset], ldx); } - return 0; + return; /* End of CPOSVXX */ diff --git a/lapack-netlib/SRC/cpotf2.c b/lapack-netlib/SRC/cpotf2.c index 63153797dc..6d6d4182f0 100644 --- a/lapack-netlib/SRC/cpotf2.c +++ b/lapack-netlib/SRC/cpotf2.c @@ -622,7 +622,7 @@ f"> */ /* > \ingroup complexPOcomputational */ /* 
===================================================================== */ -/* Subroutine */ int cpotf2_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void cpotf2_(char *uplo, integer *n, complex *a, integer *lda, integer *info) { /* System generated locals */ @@ -635,13 +635,13 @@ f"> */ extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *); logical upper; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *), - csscal_(integer *, real *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *), + csscal_(integer *, real *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern logical sisnan_(real *); real ajj; @@ -675,13 +675,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPOTF2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -778,7 +778,7 @@ f"> */ *info = j; L40: - return 0; + return; /* End of CPOTF2 */ diff --git a/lapack-netlib/SRC/cpotrf.c b/lapack-netlib/SRC/cpotrf.c index ded299f8d5..5395ad952c 100644 --- a/lapack-netlib/SRC/cpotrf.c +++ b/lapack-netlib/SRC/cpotrf.c @@ -624,7 +624,7 @@ f"> */ /* > \ingroup complexPOcomputational */ /* ===================================================================== */ -/* Subroutine */ int cpotrf_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void cpotrf_(char *uplo, integer *n, complex *a, integer *lda, integer *info) { /* System generated locals */ @@ -633,13 +633,13 @@ f"> */ /* Local variables */ integer j; - extern /* Subroutine */ int cgemm_(char *, 
char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *), cherk_(char *, char *, integer *, integer *, real *, complex *, integer *, real * , complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); logical upper; @@ -647,7 +647,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int cpotrf2_(char *, integer *, complex *, + extern /* Subroutine */ void cpotrf2_(char *, integer *, complex *, integer *, integer *); @@ -680,13 +680,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPOTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine the block size for this environment. 
*/ @@ -788,7 +788,7 @@ f"> */ *info = *info + j - 1; L40: - return 0; + return; /* End of CPOTRF */ diff --git a/lapack-netlib/SRC/cpotrf2.c b/lapack-netlib/SRC/cpotrf2.c index cadc520d5c..ff478e7740 100644 --- a/lapack-netlib/SRC/cpotrf2.c +++ b/lapack-netlib/SRC/cpotrf2.c @@ -618,7 +618,7 @@ static real c_b12 = 1.f; /* > \ingroup complexPOcomputational */ /* ===================================================================== */ -/* Subroutine */ int cpotrf2_(char *uplo, integer *n, complex *a, integer * +/* Subroutine */ void cpotrf2_(char *uplo, integer *n, complex *a, integer * lda, integer *info) { /* System generated locals */ @@ -626,11 +626,11 @@ static real c_b12 = 1.f; real r__1; /* Local variables */ - extern /* Subroutine */ int cherk_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cherk_(char *, char *, integer *, integer *, real *, complex *, integer *, real *, complex *, integer *); extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); logical upper; @@ -669,13 +669,13 @@ static real c_b12 = 1.f; if (*info != 0) { i__1 = -(*info); xerbla_("CPOTRF2", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* N=1 case */ @@ -688,7 +688,7 @@ static real c_b12 = 1.f; ajj = a[i__1].r; if (ajj <= 0.f || sisnan_(&ajj)) { *info = 1; - return 0; + return; } /* Factor */ @@ -708,7 +708,7 @@ static real c_b12 = 1.f; cpotrf2_(uplo, &n1, &a[a_dim1 + 1], lda, &iinfo); if (iinfo != 0) { *info = iinfo; - return 0; + return; } /* Compute the Cholesky factorization A = U**H*U */ @@ -729,7 +729,7 @@ static real c_b12 = 1.f; if (iinfo != 0) { *info = iinfo + n1; - return 0; + return; } /* Compute the Cholesky factorization A = L*L**H */ @@ -750,12 +750,12 @@ static real c_b12 = 
1.f; if (iinfo != 0) { *info = iinfo + n1; - return 0; + return; } } } - return 0; + return; /* End of CPOTRF2 */ diff --git a/lapack-netlib/SRC/cpotri.c b/lapack-netlib/SRC/cpotri.c index 79e9d8954c..6681781e32 100644 --- a/lapack-netlib/SRC/cpotri.c +++ b/lapack-netlib/SRC/cpotri.c @@ -604,7 +604,7 @@ f"> */ /* > \ingroup complexPOcomputational */ /* ===================================================================== */ -/* Subroutine */ int cpotri_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void cpotri_(char *uplo, integer *n, complex *a, integer *lda, integer *info) { /* System generated locals */ @@ -612,9 +612,10 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), clauum_( - char *, integer *, complex *, integer *, integer *), - ctrtri_(char *, char *, integer *, complex *, integer *, integer * + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int clauum_( + char *, integer *, complex *, integer *, integer *); + extern int ctrtri_(char *, char *, integer *, complex *, integer *, integer * ); @@ -646,27 +647,27 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPOTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Invert the triangular Cholesky factor U or L. */ ctrtri_(uplo, "Non-unit", n, &a[a_offset], lda, info); if (*info > 0) { - return 0; + return; } /* Form inv(U) * inv(U)**H or inv(L)**H * inv(L). 
*/ clauum_(uplo, n, &a[a_offset], lda, info); - return 0; + return; /* End of CPOTRI */ diff --git a/lapack-netlib/SRC/cpotrs.c b/lapack-netlib/SRC/cpotrs.c index e110b9641d..dfedbef850 100644 --- a/lapack-netlib/SRC/cpotrs.c +++ b/lapack-netlib/SRC/cpotrs.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup complexPOcomputational */ /* ===================================================================== */ -/* Subroutine */ int cpotrs_(char *uplo, integer *n, integer *nrhs, complex * +/* Subroutine */ void cpotrs_(char *uplo, integer *n, integer *nrhs, complex * a, integer *lda, complex *b, integer *ldb, integer *info) { /* System generated locals */ @@ -631,7 +631,7 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); logical upper; @@ -674,13 +674,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPOTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -711,7 +711,7 @@ f"> */ c_b1, &a[a_offset], lda, &b[b_offset], ldb); } - return 0; + return; /* End of CPOTRS */ diff --git a/lapack-netlib/SRC/cppcon.c b/lapack-netlib/SRC/cppcon.c index 07899b71b4..ad4cbd1ee1 100644 --- a/lapack-netlib/SRC/cppcon.c +++ b/lapack-netlib/SRC/cppcon.c @@ -631,7 +631,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cppcon_(char *uplo, integer *n, complex *ap, real *anorm, +/* Subroutine */ void cppcon_(char *uplo, integer *n, complex *ap, real *anorm, real *rcond, complex *work, real *rwork, integer *info) { /* System generated locals */ @@ -644,18 +644,19 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine 
*/ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); integer ix; extern integer icamax_(integer *, complex *, integer *); real scalel; extern real slamch_(char *); real scaleu; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), clatps_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void clatps_( char *, char *, char *, char *, integer *, complex *, complex *, real *, real *, integer *); real ainvnm; - extern /* Subroutine */ int csrscl_(integer *, real *, complex *, integer + extern /* Subroutine */ void csrscl_(integer *, real *, complex *, integer *); char normin[1]; real smlnum; @@ -690,7 +691,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPPCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -698,9 +699,9 @@ f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm == 0.f) { - return 0; + return; } smlnum = slamch_("Safe minimum"); @@ -760,7 +761,7 @@ f"> */ } L20: - return 0; + return; /* End of CPPCON */ diff --git a/lapack-netlib/SRC/cppequ.c b/lapack-netlib/SRC/cppequ.c index f4bdf882c3..27e735cdf0 100644 --- a/lapack-netlib/SRC/cppequ.c +++ b/lapack-netlib/SRC/cppequ.c @@ -626,7 +626,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cppequ_(char *uplo, integer *n, complex *ap, real *s, +/* Subroutine */ void cppequ_(char *uplo, integer *n, complex *ap, real *s, real *scond, real *amax, integer *info) { /* System generated locals */ @@ -668,7 +668,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPPEQU", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -676,7 +676,7 @@ f"> */ if (*n == 0) { *scond = 1.f; *amax = 0.f; - return 0; + return; } /* Initialize SMIN and AMAX. 
*/ @@ -734,7 +734,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { if (s[i__] <= 0.f) { *info = i__; - return 0; + return; } /* L30: */ } @@ -753,7 +753,7 @@ f"> */ *scond = sqrt(smin) / sqrt(*amax); } - return 0; + return; /* End of CPPEQU */ diff --git a/lapack-netlib/SRC/cpprfs.c b/lapack-netlib/SRC/cpprfs.c index 8d0b934a74..185f15996c 100644 --- a/lapack-netlib/SRC/cpprfs.c +++ b/lapack-netlib/SRC/cpprfs.c @@ -684,7 +684,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cpprfs_(char *uplo, integer *n, integer *nrhs, complex * +/* Subroutine */ void cpprfs_(char *uplo, integer *n, integer *nrhs, complex * ap, complex *afp, complex *b, integer *ldb, complex *x, integer *ldx, real *ferr, real *berr, complex *work, real *rwork, integer *info) { @@ -700,20 +700,21 @@ f"> */ real s; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), chpmv_(char *, integer *, complex *, complex *, complex *, integer *, complex *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); integer count; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); integer ik, kk; real xk; extern real slamch_(char *); integer nz; real safmin; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), cpptrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void cpptrs_( char *, integer *, integer *, complex *, complex *, integer *, integer *); real lstres, eps; @@ -761,7 +762,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPPRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ 
-773,7 +774,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -1013,7 +1014,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of CPPRFS */ diff --git a/lapack-netlib/SRC/cppsv.c b/lapack-netlib/SRC/cppsv.c index 19882e6710..4f2ae517d1 100644 --- a/lapack-netlib/SRC/cppsv.c +++ b/lapack-netlib/SRC/cppsv.c @@ -653,7 +653,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cppsv_(char *uplo, integer *n, integer *nrhs, complex * +/* Subroutine */ void cppsv_(char *uplo, integer *n, integer *nrhs, complex * ap, complex *b, integer *ldb, integer *info) { /* System generated locals */ @@ -661,7 +661,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), cpptrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void cpptrf_( char *, integer *, complex *, integer *), cpptrs_(char *, integer *, integer *, complex *, complex *, integer *, integer *); @@ -697,7 +698,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CPPSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the Cholesky factorization A = U**H *U or A = L*L**H. 
*/ @@ -710,7 +711,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ cpptrs_(uplo, n, nrhs, &ap[1], &b[b_offset], ldb, info); } - return 0; + return; /* End of CPPSV */ diff --git a/lapack-netlib/SRC/cppsvx.c b/lapack-netlib/SRC/cppsvx.c index 4005ea3d43..9798ed06f5 100644 --- a/lapack-netlib/SRC/cppsvx.c +++ b/lapack-netlib/SRC/cppsvx.c @@ -823,7 +823,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cppsvx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void cppsvx_(char *fact, char *uplo, integer *n, integer * nrhs, complex *ap, complex *afp, char *equed, real *s, complex *b, integer *ldb, complex *x, integer *ldx, real *rcond, real *ferr, real *berr, complex *work, real *rwork, integer *info) @@ -838,27 +838,27 @@ f"> */ integer i__, j; extern logical lsame_(char *, char *); real scond, anorm; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); logical equil, rcequ; extern real clanhp_(char *, char *, integer *, complex *, real *), slamch_(char *); - extern /* Subroutine */ int claqhp_(char *, integer *, complex *, real *, + extern /* Subroutine */ void claqhp_(char *, integer *, complex *, real *, real *, real *, char *); logical nofact; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex - *, integer *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex + *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real bignum; - extern /* Subroutine */ int cppcon_(char *, integer *, complex *, real *, + extern /* Subroutine */ void cppcon_(char *, integer *, complex *, real *, real *, complex *, real *, integer *); integer infequ; - extern /* Subroutine */ int cppequ_(char *, 
integer *, complex *, real *, + extern /* Subroutine */ void cppequ_(char *, integer *, complex *, real *, real *, real *, integer *), cpprfs_(char *, integer *, integer *, complex *, complex *, complex *, integer *, complex *, integer *, real *, real *, complex *, real *, integer *), cpptrf_(char *, integer *, complex *, integer *); real smlnum; - extern /* Subroutine */ int cpptrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void cpptrs_(char *, integer *, integer *, complex *, complex *, integer *, integer *); @@ -947,7 +947,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPPSVX", &i__1, (ftnlen)6); - return 0; + return; } if (equil) { @@ -994,7 +994,7 @@ f"> */ if (*info > 0) { *rcond = 0.f; - return 0; + return; } } @@ -1047,7 +1047,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of CPPSVX */ diff --git a/lapack-netlib/SRC/cpptrf.c b/lapack-netlib/SRC/cpptrf.c index f53353a5f7..711fb200b0 100644 --- a/lapack-netlib/SRC/cpptrf.c +++ b/lapack-netlib/SRC/cpptrf.c @@ -633,7 +633,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cpptrf_(char *uplo, integer *n, complex *ap, integer * +/* Subroutine */ void cpptrf_(char *uplo, integer *n, complex *ap, integer * info) { /* System generated locals */ @@ -642,18 +642,19 @@ f"> */ complex q__1, q__2; /* Local variables */ - extern /* Subroutine */ int chpr_(char *, integer *, real *, complex *, + extern /* Subroutine */ void chpr_(char *, integer *, real *, complex *, integer *, complex *); integer j; extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer *, complex *, integer *); extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int ctpsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctpsv_(char *, char *, char *, integer *, complex *, complex *, integer *); integer jc, jj; - extern /* Subroutine */ int csscal_(integer *, 
real *, complex *, integer - *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + extern int xerbla_(char *, integer *, ftnlen); real ajj; @@ -682,13 +683,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPPTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -769,7 +770,7 @@ f"> */ *info = j; L40: - return 0; + return; /* End of CPPTRF */ diff --git a/lapack-netlib/SRC/cpptri.c b/lapack-netlib/SRC/cpptri.c index 17f12de1cc..1d99d3e88e 100644 --- a/lapack-netlib/SRC/cpptri.c +++ b/lapack-netlib/SRC/cpptri.c @@ -607,7 +607,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cpptri_(char *uplo, integer *n, complex *ap, integer * +/* Subroutine */ void cpptri_(char *uplo, integer *n, complex *ap, integer * info) { /* System generated locals */ @@ -616,18 +616,20 @@ f"> */ complex q__1; /* Local variables */ - extern /* Subroutine */ int chpr_(char *, integer *, real *, complex *, + extern /* Subroutine */ void chpr_(char *, integer *, real *, complex *, integer *, complex *); integer j; extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctpmv_(char *, char *, char *, integer *, complex *, complex *, integer *); logical upper; integer jc, jj; - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer - *), xerbla_(char *, integer *, ftnlen), ctptri_(char *, char *, + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + extern int xerbla_(char *, integer *, ftnlen); + extern void ctptri_(char *, char *, integer *, complex *, integer *); real ajj; integer jjn; @@ -658,20 +660,20 
@@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPPTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Invert the triangular Cholesky factor U or L. */ ctptri_(uplo, "Non-unit", n, &ap[1], info); if (*info > 0) { - return 0; + return; } if (upper) { @@ -715,7 +717,7 @@ f"> */ } } - return 0; + return; /* End of CPPTRI */ diff --git a/lapack-netlib/SRC/cpptrs.c b/lapack-netlib/SRC/cpptrs.c index 2b8a083329..6251b1e535 100644 --- a/lapack-netlib/SRC/cpptrs.c +++ b/lapack-netlib/SRC/cpptrs.c @@ -621,7 +621,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cpptrs_(char *uplo, integer *n, integer *nrhs, complex * +/* Subroutine */ void cpptrs_(char *uplo, integer *n, integer *nrhs, complex * ap, complex *b, integer *ldb, integer *info) { /* System generated locals */ @@ -631,9 +631,9 @@ f"> */ integer i__; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int ctpsv_(char *, char *, char *, integer *, - complex *, complex *, integer *), xerbla_( - char *, integer *, ftnlen); + extern /* Subroutine */ void ctpsv_(char *, char *, char *, integer *, + complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -668,13 +668,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPPTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -715,7 +715,7 @@ f"> */ } } - return 0; + return; /* End of CPPTRS */ diff --git a/lapack-netlib/SRC/cpstf2.c b/lapack-netlib/SRC/cpstf2.c index 5e6ed763e5..f0ac9bf924 100644 --- a/lapack-netlib/SRC/cpstf2.c +++ b/lapack-netlib/SRC/cpstf2.c @@ -658,7 +658,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* 
===================================================================== */ -/* Subroutine */ int cpstf2_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void cpstf2_(char *uplo, integer *n, complex *a, integer *lda, integer *piv, integer *rank, real *tol, real *work, integer *info) { /* System generated locals */ @@ -670,20 +670,21 @@ f"> */ integer i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *); complex ctemp; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer itemp; real stemp; logical upper; real sstop; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer - *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + extern int xerbla_(char *, integer *, ftnlen); extern logical sisnan_(real *); real ajj; integer pvt; @@ -720,13 +721,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPSTF2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Initialize PIV */ @@ -995,7 +996,7 @@ f"> */ *info = 1; L200: - return 0; + return; /* End of CPSTF2 */ diff --git a/lapack-netlib/SRC/cpstrf.c b/lapack-netlib/SRC/cpstrf.c index cb1bbd3fd4..e8cb87df42 100644 --- a/lapack-netlib/SRC/cpstrf.c +++ b/lapack-netlib/SRC/cpstrf.c @@ -661,7 +661,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine 
*/ int cpstrf_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void cpstrf_(char *uplo, integer *n, complex *a, integer *lda, integer *piv, integer *rank, real *tol, real *work, integer *info) { /* System generated locals */ @@ -672,26 +672,27 @@ f"> */ /* Local variables */ integer i__, j, k; - extern /* Subroutine */ int cherk_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cherk_(char *, char *, integer *, integer *, real *, complex *, integer *, real *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *); complex ctemp; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer itemp; real stemp; logical upper; real sstop; - extern /* Subroutine */ int cpstf2_(char *, integer *, complex *, integer + extern /* Subroutine */ void cpstf2_(char *, integer *, complex *, integer *, integer *, integer *, real *, real *, integer *); integer jb, nb; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer - *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); extern logical sisnan_(real *); @@ -730,13 +731,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPSTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - 
return 0; + return; } /* Get block size */ @@ -1076,7 +1077,7 @@ f"> */ *info = 1; L230: - return 0; + return; /* End of CPSTRF */ diff --git a/lapack-netlib/SRC/cptcon.c b/lapack-netlib/SRC/cptcon.c index b1585da7e0..829e7bd50f 100644 --- a/lapack-netlib/SRC/cptcon.c +++ b/lapack-netlib/SRC/cptcon.c @@ -632,7 +632,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cptcon_(integer *n, real *d__, complex *e, real *anorm, +/* Subroutine */ void cptcon_(integer *n, real *d__, complex *e, real *anorm, real *rcond, real *rwork, integer *info) { /* System generated locals */ @@ -672,7 +672,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPTCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -680,9 +680,9 @@ f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm == 0.f) { - return 0; + return; } /* Check that D(1:N) is positive. */ @@ -690,7 +690,7 @@ f"> */ i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { if (d__[i__] <= 0.f) { - return 0; + return; } /* L10: */ } @@ -730,7 +730,7 @@ f"> */ *rcond = 1.f / ainvnm / *anorm; } - return 0; + return; /* End of CPTCON */ diff --git a/lapack-netlib/SRC/cpteqr.c b/lapack-netlib/SRC/cpteqr.c index 9856e544f5..b382724e0d 100644 --- a/lapack-netlib/SRC/cpteqr.c +++ b/lapack-netlib/SRC/cpteqr.c @@ -661,7 +661,7 @@ f"> */ /* > \ingroup complexPTcomputational */ /* ===================================================================== */ -/* Subroutine */ int cpteqr_(char *compz, integer *n, real *d__, real *e, +/* Subroutine */ void cpteqr_(char *compz, integer *n, real *d__, real *e, complex *z__, integer *ldz, real *work, integer *info) { /* System generated locals */ @@ -672,13 +672,14 @@ f"> */ integer i__; extern logical lsame_(char *, char *); complex vt[1] /* was [1][1] */; - extern /* Subroutine */ int claset_(char *, integer *, integer *, complex - *, complex *, 
complex *, integer *), xerbla_(char *, - integer *, ftnlen), cbdsqr_(char *, integer *, integer *, integer + extern /* Subroutine */ void claset_(char *, integer *, integer *, complex + *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void cbdsqr_(char *, integer *, integer *, integer *, integer *, real *, real *, complex *, integer *, complex *, integer *, complex *, integer *, real *, integer *); integer icompz; - extern /* Subroutine */ int spttrf_(integer *, real *, real *, integer *); + extern /* Subroutine */ void spttrf_(integer *, real *, real *, integer *); integer nru; @@ -723,13 +724,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPTEQR", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -737,7 +738,7 @@ f"> */ i__1 = z_dim1 + 1; z__[i__1].r = 1.f, z__[i__1].i = 0.f; } - return 0; + return; } if (icompz == 2) { claset_("Full", n, n, &c_b1, &c_b2, &z__[z_offset], ldz); @@ -747,7 +748,7 @@ f"> */ spttrf_(n, &d__[1], &e[1], info); if (*info != 0) { - return 0; + return; } i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { @@ -783,7 +784,7 @@ f"> */ *info = *n + *info; } - return 0; + return; /* End of CPTEQR */ diff --git a/lapack-netlib/SRC/cptrfs.c b/lapack-netlib/SRC/cptrfs.c index dbab40f70d..4704650114 100644 --- a/lapack-netlib/SRC/cptrfs.c +++ b/lapack-netlib/SRC/cptrfs.c @@ -696,7 +696,7 @@ f"> */ /* > \ingroup complexPTcomputational */ /* ===================================================================== */ -/* Subroutine */ int cptrfs_(char *uplo, integer *n, integer *nrhs, real *d__, +/* Subroutine */ void cptrfs_(char *uplo, integer *n, integer *nrhs, real *d__, complex *e, real *df, complex *ef, complex *b, integer *ldb, complex *x, integer *ldx, real *ferr, real *berr, complex *work, real *rwork, integer *info) @@ -713,7 +713,7 @@ f"> */ integer i__, j; real s; extern logical lsame_(char *, char *); - extern 
/* Subroutine */ int caxpy_(integer *, complex *, complex *, + extern /* Subroutine */ void caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); integer count; logical upper; @@ -725,7 +725,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer isamax_(integer *, real *, integer *); real lstres; - extern /* Subroutine */ int cpttrs_(char *, integer *, integer *, real *, + extern /* Subroutine */ void cpttrs_(char *, integer *, integer *, real *, complex *, complex *, integer *, integer *); real eps; @@ -774,7 +774,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPTRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -786,7 +786,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -1143,7 +1143,7 @@ f"> */ /* L100: */ } - return 0; + return; /* End of CPTRFS */ diff --git a/lapack-netlib/SRC/cptsv.c b/lapack-netlib/SRC/cptsv.c index f018d7553d..b2e3edbff1 100644 --- a/lapack-netlib/SRC/cptsv.c +++ b/lapack-netlib/SRC/cptsv.c @@ -624,14 +624,15 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexPTsolve */ /* ===================================================================== */ -/* Subroutine */ int cptsv_(integer *n, integer *nrhs, real *d__, complex *e, +/* Subroutine */ void cptsv_(integer *n, integer *nrhs, real *d__, complex *e, complex *b, integer *ldb, integer *info) { /* System generated locals */ integer b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), cpttrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void cpttrf_( integer *, real *, complex *, integer *), cpttrs_(char *, integer *, integer *, real *, complex *, complex *, integer *, integer *); @@ -666,7 +667,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, 
doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CPTSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the L*D*L**H (or U**H*D*U) factorization of A. */ @@ -678,7 +679,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ cpttrs_("Lower", n, nrhs, &d__[1], &e[1], &b[b_offset], ldb, info); } - return 0; + return; /* End of CPTSV */ diff --git a/lapack-netlib/SRC/cptsvx.c b/lapack-netlib/SRC/cptsvx.c index c0e96951e8..7c0c26251d 100644 --- a/lapack-netlib/SRC/cptsvx.c +++ b/lapack-netlib/SRC/cptsvx.c @@ -746,7 +746,7 @@ f"> */ /* > \ingroup complexPTsolve */ /* ===================================================================== */ -/* Subroutine */ int cptsvx_(char *fact, integer *n, integer *nrhs, real *d__, +/* Subroutine */ void cptsvx_(char *fact, integer *n, integer *nrhs, real *d__, complex *e, real *df, complex *ef, complex *b, integer *ldb, complex *x, integer *ldx, real *rcond, real *ferr, real *berr, complex *work, real *rwork, integer *info) @@ -757,15 +757,16 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); real anorm; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), scopy_(integer *, real *, integer *, real * , integer *); extern real slamch_(char *), clanht_(char *, integer *, real *, complex *); logical nofact; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex - *, integer *, complex *, integer *), xerbla_(char *, - integer *, ftnlen), cptcon_(integer *, real *, complex *, real *, + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex + *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void cptcon_(integer *, real *, complex *, real *, real *, real *, integer *), cptrfs_(char *, integer *, integer *, real *, complex *, real *, complex *, complex *, integer *, 
complex *, integer *, real *, real *, complex *, real *, integer * @@ -818,7 +819,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPTSVX", &i__1, (ftnlen)6); - return 0; + return; } if (nofact) { @@ -836,7 +837,7 @@ f"> */ if (*info > 0) { *rcond = 0.f; - return 0; + return; } } @@ -866,7 +867,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of CPTSVX */ diff --git a/lapack-netlib/SRC/cpttrf.c b/lapack-netlib/SRC/cpttrf.c index 592ceb877c..d071efc185 100644 --- a/lapack-netlib/SRC/cpttrf.c +++ b/lapack-netlib/SRC/cpttrf.c @@ -601,7 +601,7 @@ f"> */ /* > \ingroup complexPTcomputational */ /* ===================================================================== */ -/* Subroutine */ int cpttrf_(integer *n, real *d__, complex *e, integer *info) +/* Subroutine */ void cpttrf_(integer *n, real *d__, complex *e, integer *info) { /* System generated locals */ integer i__1, i__2; @@ -635,13 +635,13 @@ f"> */ *info = -1; i__1 = -(*info); xerbla_("CPTTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Compute the L*D*L**H (or U**H *D*U) factorization of A. 
*/ @@ -748,7 +748,7 @@ f"> */ } L20: - return 0; + return; /* End of CPTTRF */ diff --git a/lapack-netlib/SRC/cpttrs.c b/lapack-netlib/SRC/cpttrs.c index abe967d69b..803ed32618 100644 --- a/lapack-netlib/SRC/cpttrs.c +++ b/lapack-netlib/SRC/cpttrs.c @@ -635,7 +635,7 @@ f"> */ /* > \ingroup complexPTcomputational */ /* ===================================================================== */ -/* Subroutine */ int cpttrs_(char *uplo, integer *n, integer *nrhs, real *d__, +/* Subroutine */ void cpttrs_(char *uplo, integer *n, integer *nrhs, real *d__, complex *e, complex *b, integer *ldb, integer *info) { /* System generated locals */ @@ -645,7 +645,7 @@ f"> */ integer j, iuplo; logical upper; integer jb; - extern /* Subroutine */ int cptts2_(integer *, integer *, integer *, real + extern /* Subroutine */ void cptts2_(integer *, integer *, integer *, real *, complex *, complex *, integer *); integer nb; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -687,13 +687,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CPTTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* Determine the number of right-hand sides to solve at a time. 
*/ @@ -729,7 +729,7 @@ f"> */ } } - return 0; + return; /* End of CPTTRS */ diff --git a/lapack-netlib/SRC/cptts2.c b/lapack-netlib/SRC/cptts2.c index ed66ca2112..d6f5c0aff7 100644 --- a/lapack-netlib/SRC/cptts2.c +++ b/lapack-netlib/SRC/cptts2.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup complexPTcomputational */ /* ===================================================================== */ -/* Subroutine */ int cptts2_(integer *iuplo, integer *n, integer *nrhs, real * +/* Subroutine */ void cptts2_(integer *iuplo, integer *n, integer *nrhs, real * d__, complex *e, complex *b, integer *ldb) { /* System generated locals */ @@ -633,7 +633,7 @@ f"> */ /* Local variables */ integer i__, j; - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *); @@ -661,7 +661,7 @@ f"> */ r__1 = 1.f / d__[1]; csscal_(nrhs, &r__1, &b[b_offset], ldb); } - return 0; + return; } if (*iuplo == 1) { @@ -860,7 +860,7 @@ f"> */ } } - return 0; + return; /* End of CPTTS2 */ diff --git a/lapack-netlib/SRC/crot.c b/lapack-netlib/SRC/crot.c index 93782ce8f7..eed83d90a3 100644 --- a/lapack-netlib/SRC/crot.c +++ b/lapack-netlib/SRC/crot.c @@ -613,7 +613,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int crot_(integer *n, complex *cx, integer *incx, complex * +/* Subroutine */ void crot_(integer *n, complex *cx, integer *incx, complex * cy, integer *incy, real *c__, complex *s) { /* System generated locals */ @@ -641,7 +641,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Function Body */ if (*n <= 0) { - return 0; + return; } if (*incx == 1 && *incy == 1) { goto L20; @@ -681,7 +681,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ iy += *incy; /* L10: */ } - 
return 0; + return; /* Code for both increments equal to 1 */ @@ -708,6 +708,6 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ cx[i__2].r = stemp.r, cx[i__2].i = stemp.i; /* L30: */ } - return 0; + return; } /* crot_ */ diff --git a/lapack-netlib/SRC/cspcon.c b/lapack-netlib/SRC/cspcon.c index 0c4dfcada1..c03725281d 100644 --- a/lapack-netlib/SRC/cspcon.c +++ b/lapack-netlib/SRC/cspcon.c @@ -631,7 +631,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cspcon_(char *uplo, integer *n, complex *ap, integer * +/* Subroutine */ void cspcon_(char *uplo, integer *n, complex *ap, integer * ipiv, real *anorm, real *rcond, complex *work, integer *info) { /* System generated locals */ @@ -642,12 +642,12 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); integer ip; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int csptrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void csptrs_(char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); @@ -680,7 +680,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSPCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -688,9 +688,9 @@ f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm <= 0.f) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. 
*/ @@ -703,7 +703,7 @@ f"> */ for (i__ = *n; i__ >= 1; --i__) { i__1 = ip; if (ipiv[i__] > 0 && (ap[i__1].r == 0.f && ap[i__1].i == 0.f)) { - return 0; + return; } ip -= i__; /* L10: */ @@ -717,7 +717,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { i__2 = ip; if (ipiv[i__] > 0 && (ap[i__2].r == 0.f && ap[i__2].i == 0.f)) { - return 0; + return; } ip = ip + *n - i__ + 1; /* L20: */ @@ -743,7 +743,7 @@ f"> */ *rcond = 1.f / ainvnm / *anorm; } - return 0; + return; /* End of CSPCON */ diff --git a/lapack-netlib/SRC/cspmv.c b/lapack-netlib/SRC/cspmv.c index 743ab8a200..03622bcfc0 100644 --- a/lapack-netlib/SRC/cspmv.c +++ b/lapack-netlib/SRC/cspmv.c @@ -661,7 +661,7 @@ rix */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int cspmv_(char *uplo, integer *n, complex *alpha, complex * +/* Subroutine */ void cspmv_(char *uplo, integer *n, complex *alpha, complex * ap, complex *x, integer *incx, complex *beta, complex *y, integer * incy) { @@ -707,14 +707,14 @@ rix */ } if (info != 0) { xerbla_("CSPMV ", &info, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ if (*n == 0 || alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f && beta->i == 0.f)) { - return 0; + return; } /* Set up the start points in X and Y. 
*/ @@ -782,7 +782,7 @@ rix */ } } if (alpha->r == 0.f && alpha->i == 0.f) { - return 0; + return; } kk = 1; if (lsame_(uplo, "U")) { @@ -983,7 +983,7 @@ rix */ } } - return 0; + return; /* End of CSPMV */ diff --git a/lapack-netlib/SRC/cspr.c b/lapack-netlib/SRC/cspr.c index 4735cb91f8..2ea2b048e0 100644 --- a/lapack-netlib/SRC/cspr.c +++ b/lapack-netlib/SRC/cspr.c @@ -641,7 +641,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int cspr_(char *uplo, integer *n, complex *alpha, complex *x, +/* Subroutine */ void cspr_(char *uplo, integer *n, complex *alpha, complex *x, integer *incx, complex *ap) { /* System generated locals */ @@ -683,13 +683,13 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } if (info != 0) { xerbla_("CSPR ", &info, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ if (*n == 0 || alpha->r == 0.f && alpha->i == 0.f) { - return 0; + return; } /* Set the start point in X if the increment is not unity. 
*/ @@ -884,7 +884,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } - return 0; + return; /* End of CSPR */ diff --git a/lapack-netlib/SRC/csprfs.c b/lapack-netlib/SRC/csprfs.c index 14708f80ca..9b7979c890 100644 --- a/lapack-netlib/SRC/csprfs.c +++ b/lapack-netlib/SRC/csprfs.c @@ -693,7 +693,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int csprfs_(char *uplo, integer *n, integer *nrhs, complex * +/* Subroutine */ void csprfs_(char *uplo, integer *n, integer *nrhs, complex * ap, complex *afp, integer *ipiv, complex *b, integer *ldb, complex *x, integer *ldx, real *ferr, real *berr, complex *work, real *rwork, integer *info) @@ -710,14 +710,14 @@ f"> */ real s; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); integer count; - extern /* Subroutine */ int cspmv_(char *, integer *, complex *, complex * + extern /* Subroutine */ void cspmv_(char *, integer *, complex *, complex * , complex *, integer *, complex *, complex *, integer *); logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); integer ik, kk; real xk; @@ -726,7 +726,7 @@ f"> */ real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real lstres; - extern /* Subroutine */ int csptrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void csptrs_(char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); real eps; @@ -774,7 +774,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSPRFS", &i__1, (ftnlen)6); - return 0; + 
return; } /* Quick return if possible */ @@ -786,7 +786,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -1026,7 +1026,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of CSPRFS */ diff --git a/lapack-netlib/SRC/cspsv.c b/lapack-netlib/SRC/cspsv.c index 6b552b24cb..84fe88f4ef 100644 --- a/lapack-netlib/SRC/cspsv.c +++ b/lapack-netlib/SRC/cspsv.c @@ -671,7 +671,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cspsv_(char *uplo, integer *n, integer *nrhs, complex * +/* Subroutine */ void cspsv_(char *uplo, integer *n, integer *nrhs, complex * ap, integer *ipiv, complex *b, integer *ldb, integer *info) { /* System generated locals */ @@ -679,7 +679,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), csptrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void csptrf_( char *, integer *, complex *, integer *, integer *), csptrs_(char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); @@ -717,7 +718,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CSPSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the factorization A = U*D*U**T or A = L*D*L**T. 
*/ @@ -730,7 +731,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ csptrs_(uplo, n, nrhs, &ap[1], &ipiv[1], &b[b_offset], ldb, info); } - return 0; + return; /* End of CSPSV */ diff --git a/lapack-netlib/SRC/cspsvx.c b/lapack-netlib/SRC/cspsvx.c index 5e78be9fb4..3caff1f525 100644 --- a/lapack-netlib/SRC/cspsvx.c +++ b/lapack-netlib/SRC/cspsvx.c @@ -789,7 +789,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cspsvx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void cspsvx_(char *fact, char *uplo, integer *n, integer * nrhs, complex *ap, complex *afp, integer *ipiv, complex *b, integer * ldb, complex *x, integer *ldx, real *rcond, real *ferr, real *berr, complex *work, real *rwork, integer *info) @@ -800,15 +800,15 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); real anorm; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); extern real slamch_(char *); logical nofact; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex - *, integer *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex + *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern real clansp_(char *, char *, integer *, complex *, real *); - extern /* Subroutine */ int cspcon_(char *, integer *, complex *, integer + extern /* Subroutine */ void cspcon_(char *, integer *, complex *, integer *, real *, real *, complex *, integer *), csprfs_(char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *, real *, real *, complex *, real * @@ -863,7 +863,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSPSVX", &i__1, (ftnlen)6); - return 
0; + return; } if (nofact) { @@ -878,7 +878,7 @@ f"> */ if (*info > 0) { *rcond = 0.f; - return 0; + return; } } @@ -907,7 +907,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of CSPSVX */ diff --git a/lapack-netlib/SRC/csptrf.c b/lapack-netlib/SRC/csptrf.c index f98b9b06b9..9c5b853745 100644 --- a/lapack-netlib/SRC/csptrf.c +++ b/lapack-netlib/SRC/csptrf.c @@ -672,7 +672,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int csptrf_(char *uplo, integer *n, complex *ap, integer * +/* Subroutine */ void csptrf_(char *uplo, integer *n, complex *ap, integer * ipiv, integer *info) { /* System generated locals */ @@ -682,15 +682,15 @@ f"> */ /* Local variables */ integer imax, jmax; - extern /* Subroutine */ int cspr_(char *, integer *, complex *, complex *, + extern /* Subroutine */ void cspr_(char *, integer *, complex *, complex *, integer *, complex *); integer i__, j, k; complex t; real alpha; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer kstep; logical upper; @@ -732,7 +732,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSPTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. 
*/ @@ -1294,7 +1294,7 @@ f"> */ } L110: - return 0; + return; /* End of CSPTRF */ diff --git a/lapack-netlib/SRC/csptri.c b/lapack-netlib/SRC/csptri.c index 1f9e86d56b..610ef89664 100644 --- a/lapack-netlib/SRC/csptri.c +++ b/lapack-netlib/SRC/csptri.c @@ -624,7 +624,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int csptri_(char *uplo, integer *n, complex *ap, integer * +/* Subroutine */ void csptri_(char *uplo, integer *n, complex *ap, integer * ipiv, complex *work, integer *info) { /* System generated locals */ @@ -636,14 +636,14 @@ f"> */ integer j, k; complex t; extern logical lsame_(char *, char *); - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); extern /* Complex */ VOID cdotu_(complex *, integer *, complex *, integer *, complex *, integer *); - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer kstep; - extern /* Subroutine */ int cspmv_(char *, integer *, complex *, complex * + extern /* Subroutine */ void cspmv_(char *, integer *, complex *, complex * , complex *, integer *, complex *, complex *, integer *); logical upper; complex ak; @@ -680,13 +680,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSPTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. 
*/ @@ -699,7 +699,7 @@ f"> */ for (*info = *n; *info >= 1; --(*info)) { i__1 = kp; if (ipiv[*info] > 0 && (ap[i__1].r == 0.f && ap[i__1].i == 0.f)) { - return 0; + return; } kp -= *info; /* L10: */ @@ -713,7 +713,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = kp; if (ipiv[*info] > 0 && (ap[i__2].r == 0.f && ap[i__2].i == 0.f)) { - return 0; + return; } kp = kp + *n - *info + 1; /* L20: */ @@ -1047,7 +1047,7 @@ f"> */ ; } - return 0; + return; /* End of CSPTRI */ diff --git a/lapack-netlib/SRC/csptrs.c b/lapack-netlib/SRC/csptrs.c index 7088e4d0da..0e1e7ffb5a 100644 --- a/lapack-netlib/SRC/csptrs.c +++ b/lapack-netlib/SRC/csptrs.c @@ -629,7 +629,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int csptrs_(char *uplo, integer *n, integer *nrhs, complex * +/* Subroutine */ void csptrs_(char *uplo, integer *n, integer *nrhs, complex * ap, integer *ipiv, complex *b, integer *ldb, integer *info) { /* System generated locals */ @@ -639,11 +639,11 @@ f"> */ /* Local variables */ complex akm1k; integer j, k; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); complex denom; - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *), cgeru_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *), @@ -686,13 +686,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSPTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -1047,7 +1047,7 @@ f"> */ ; } - return 0; + return; /* End of CSPTRS */ diff --git 
a/lapack-netlib/SRC/csrscl.c b/lapack-netlib/SRC/csrscl.c index cb786ac5de..d7d08c3455 100644 --- a/lapack-netlib/SRC/csrscl.c +++ b/lapack-netlib/SRC/csrscl.c @@ -593,14 +593,14 @@ f"> */ /* > \ingroup complexOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int csrscl_(integer *n, real *sa, complex *sx, integer *incx) +/* Subroutine */ void csrscl_(integer *n, real *sa, complex *sx, integer *incx) { real cden; logical done; real cnum, cden1, cnum1; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *); real bignum, smlnum, mul; @@ -621,7 +621,7 @@ f"> */ /* Function Body */ if (*n <= 0) { - return 0; + return; } /* Get machine parameters */ @@ -668,7 +668,7 @@ f"> */ goto L10; } - return 0; + return; /* End of CSRSCL */ diff --git a/lapack-netlib/SRC/cstedc.c b/lapack-netlib/SRC/cstedc.c index 437c39e96d..1d60e6a977 100644 --- a/lapack-netlib/SRC/cstedc.c +++ b/lapack-netlib/SRC/cstedc.c @@ -729,7 +729,7 @@ f"> */ /* > at Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int cstedc_(char *compz, integer *n, real *d__, real *e, +/* Subroutine */ void cstedc_(char *compz, integer *n, real *d__, real *e, complex *z__, integer *ldz, complex *work, integer *lwork, real * rwork, integer *lrwork, integer *iwork, integer *liwork, integer * info) @@ -743,36 +743,36 @@ f"> */ integer i__, j, k, m; real p; extern logical lsame_(char *, char *); - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer lwmin; - extern /* Subroutine */ int claed0_(integer *, integer *, real *, real *, + extern /* 
Subroutine */ void claed0_(integer *, integer *, real *, real *, complex *, integer *, complex *, integer *, real *, integer *, integer *); integer start, ii, ll; - extern /* Subroutine */ int clacrm_(integer *, integer *, complex *, + extern /* Subroutine */ void clacrm_(integer *, integer *, complex *, integer *, real *, integer *, complex *, integer *, real *); extern real slamch_(char *); - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex - *, integer *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex + *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer finish; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *), sstedc_(char *, integer *, real *, real *, real *, integer *, real *, integer *, integer *, integer *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *); integer liwmin, icompz; - extern /* Subroutine */ int csteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void csteqr_(char *, integer *, real *, real *, complex *, integer *, real *, integer *); real orgnrm; extern real slanst_(char *, integer *, real *, real *); - extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *); + extern /* Subroutine */ void ssterf_(integer *, real *, real *, integer *); integer lrwmin; logical lquery; integer smlsiz; - extern /* Subroutine */ int ssteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssteqr_(char *, integer *, real *, real *, real *, integer *, real *, integer *); integer lgn; real eps; @@ -836,10 +836,10 @@ f"> */ lrwmin = *n - 1 << 1; } else if 
(icompz == 1) { lgn = (integer) (log((real) (*n)) / log(2.f)); - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } lwmin = *n * *n; @@ -870,22 +870,22 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSTEDC", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { if (icompz != 0) { i__1 = z_dim1 + 1; z__[i__1].r = 1.f, z__[i__1].i = 0.f; } - return 0; + return; } /* If the following conditional clause is removed, then the routine */ @@ -1050,7 +1050,7 @@ f"> */ rwork[1] = (real) lrwmin; iwork[1] = liwmin; - return 0; + return; /* End of CSTEDC */ diff --git a/lapack-netlib/SRC/cstegr.c b/lapack-netlib/SRC/cstegr.c index e8ed6ed3d3..836534ef6a 100644 --- a/lapack-netlib/SRC/cstegr.c +++ b/lapack-netlib/SRC/cstegr.c @@ -772,7 +772,7 @@ f"> */ /* > Christof Voemel, LBNL/NERSC, USA \n */ /* ===================================================================== */ -/* Subroutine */ int cstegr_(char *jobz, char *range, integer *n, real *d__, +/* Subroutine */ void cstegr_(char *jobz, char *range, integer *n, real *d__, real *e, real *vl, real *vu, integer *il, integer *iu, real *abstol, integer *m, real *w, complex *z__, integer *ldz, integer *isuppz, real *work, integer *lwork, integer *iwork, integer *liwork, integer * @@ -782,7 +782,7 @@ f"> */ integer z_dim1, z_offset; /* Local variables */ - extern /* Subroutine */ int cstemr_(char *, char *, integer *, real *, + extern /* Subroutine */ void cstemr_(char *, char *, integer *, real *, real *, real *, real *, integer *, integer *, integer *, real *, complex *, integer *, integer *, integer *, logical *, real *, integer *, integer *, integer *, integer *); @@ -817,6 +817,6 @@ f"> */ /* End of CSTEGR */ - return 0; + return; } /* cstegr_ */ diff --git a/lapack-netlib/SRC/cstein.c 
b/lapack-netlib/SRC/cstein.c index 4ac90325c6..335dbaa48c 100644 --- a/lapack-netlib/SRC/cstein.c +++ b/lapack-netlib/SRC/cstein.c @@ -696,7 +696,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cstein_(integer *n, real *d__, real *e, integer *m, real +/* Subroutine */ void cstein_(integer *n, real *d__, real *e, integer *m, real *w, integer *iblock, integer *isplit, complex *z__, integer *ldz, real *work, integer *iwork, integer *ifail, integer *info) { @@ -709,24 +709,25 @@ f"> */ integer jblk, nblk, jmax; extern real snrm2_(integer *, real *, integer *); integer i__, j, iseed[4], gpind, iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer b1, j1; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); real ortol; integer indrv1, indrv2, indrv3, indrv4, indrv5, bn, jr; real xj; extern real slamch_(char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slagtf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slagtf_( integer *, real *, real *, real *, real *, real *, real *, integer *, integer *); integer nrmchk; extern integer isamax_(integer *, real *, integer *); - extern /* Subroutine */ int slagts_(integer *, integer *, real *, real *, + extern /* Subroutine */ void slagts_(integer *, integer *, real *, real *, real *, real *, integer *, real *, real *, integer *); integer blksiz; real onenrm, pertol; - extern /* Subroutine */ int slarnv_(integer *, integer *, integer *, real + extern /* Subroutine */ void slarnv_(integer *, integer *, integer *, real *); real stpcrt, scl, eps, ctr, sep, nrm, tol; integer its; @@ -791,17 +792,17 @@ f"> */ if (*info != 0) { i__1 = -(*info); 
xerbla_("CSTEIN", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *m == 0) { - return 0; + return; } else if (*n == 1) { i__1 = z_dim1 + 1; z__[i__1].r = 1.f, z__[i__1].i = 0.f; - return 0; + return; } /* Get machine constants. */ @@ -1029,7 +1030,7 @@ f"> */ ; } - return 0; + return; /* End of CSTEIN */ diff --git a/lapack-netlib/SRC/cstemr.c b/lapack-netlib/SRC/cstemr.c index 0f621bcc12..ede8ea27ed 100644 --- a/lapack-netlib/SRC/cstemr.c +++ b/lapack-netlib/SRC/cstemr.c @@ -850,7 +850,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int cstemr_(char *jobz, char *range, integer *n, real *d__, +/* Subroutine */ void cstemr_(char *jobz, char *range, integer *n, real *d__, real *e, real *vl, real *vu, integer *il, integer *iu, integer *m, real *w, complex *z__, integer *ldz, integer *nzc, integer *isuppz, logical *tryrac, real *work, integer *lwork, integer *iwork, integer * @@ -866,7 +866,7 @@ f"> */ integer itmp; real tnrm; integer inde2; - extern /* Subroutine */ int slae2_(real *, real *, real *, real *, real *) + extern /* Subroutine */ void slae2_(real *, real *, real *, real *, real *) ; integer itmp2; real rtol1, rtol2; @@ -875,16 +875,16 @@ f"> */ integer indgp; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer iindw, ilast; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer lwmin; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); logical wantz; real r1, r2; - extern /* Subroutine */ int slaev2_(real *, 
real *, real *, real *, real * + extern /* Subroutine */ void slaev2_(real *, real *, real *, real *, real * , real *, real *); integer jj; real cs; @@ -899,7 +899,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; integer inderr, iindwk, indgrs, offset; - extern /* Subroutine */ int slarrc_(char *, integer *, real *, real *, + extern /* Subroutine */ void slarrc_(char *, integer *, real *, real *, real *, real *, real *, integer *, integer *, integer *, integer * ), clarrv_(integer *, real *, real *, real *, real *, real *, integer *, integer *, integer *, integer *, real *, real * @@ -912,13 +912,13 @@ f"> */ integer iinspl, indwrk, ifirst, liwmin, nzcmin; real pivmin, thresh; extern real slanst_(char *, integer *, real *, real *); - extern /* Subroutine */ int slarrj_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slarrj_(integer *, real *, real *, integer *, integer *, real *, integer *, real *, real *, real *, integer *, real *, real *, integer *); integer nsplit; - extern /* Subroutine */ int slarrr_(integer *, real *, real *, integer *); + extern /* Subroutine */ void slarrr_(integer *, real *, real *, integer *); real smlnum; - extern /* Subroutine */ int slasrt_(char *, integer *, real *, integer *); + extern /* Subroutine */ void slasrt_(char *, integer *, real *, integer *); logical lquery, zquery; integer iil, iiu; real eps, tmp; @@ -1041,16 +1041,16 @@ f"> */ i__1 = -(*info); xerbla_("CSTEMR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery || zquery) { - return 0; + return; } /* Handle N = 0, 1, and 2 cases immediately */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -1069,7 +1069,7 @@ f"> */ isuppz[1] = 1; isuppz[2] = 1; } - return 0; + return; } if (*n == 2) { @@ -1224,7 +1224,7 @@ f"> */ work[indgrs], &pivmin, &work[indwrk], &iwork[iindwk], &iinfo); if (iinfo != 0) { *info = abs(iinfo) + 10; - return 0; + return; } /* Note that if RANGE .NE. 
'V', SLARRE computes bounds on the desired */ /* part of the spectrum. All desired eigenvalues are contained in */ @@ -1241,7 +1241,7 @@ f"> */ iindwk], &iinfo); if (iinfo != 0) { *info = abs(iinfo) + 20; - return 0; + return; } } else { /* SLARRE computes eigenvalues of the (shifted) root representation */ @@ -1310,7 +1310,7 @@ f"> */ slasrt_("I", m, &w[1], &iinfo); if (iinfo != 0) { *info = 3; - return 0; + return; } } else { i__1 = *m - 1; @@ -1347,7 +1347,7 @@ f"> */ work[1] = (real) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of CSTEMR */ diff --git a/lapack-netlib/SRC/csteqr.c b/lapack-netlib/SRC/csteqr.c index 77139c0231..c85a56dd57 100644 --- a/lapack-netlib/SRC/csteqr.c +++ b/lapack-netlib/SRC/csteqr.c @@ -650,7 +650,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int csteqr_(char *compz, integer *n, real *d__, real *e, +/* Subroutine */ void csteqr_(char *compz, integer *n, real *d__, real *e, complex *z__, integer *ldz, real *work, integer *info) { /* System generated locals */ @@ -659,38 +659,38 @@ f"> */ /* Local variables */ integer lend, jtot; - extern /* Subroutine */ int slae2_(real *, real *, real *, real *, real *) + extern /* Subroutine */ void slae2_(real *, real *, real *, real *, real *) ; real b, c__, f, g; integer i__, j, k, l, m; real p, r__, s; extern logical lsame_(char *, char *); - extern /* Subroutine */ int clasr_(char *, char *, char *, integer *, + extern /* Subroutine */ void clasr_(char *, char *, char *, integer *, integer *, real *, real *, complex *, integer *); real anorm; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer l1, lendm1, lendp1; - extern /* Subroutine */ int slaev2_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slaev2_(real *, real *, real *, real *, real * , 
real *, real *); extern real slapy2_(real *, real *); integer ii, mm, iscale; extern real slamch_(char *); - extern /* Subroutine */ int claset_(char *, integer *, integer *, complex + extern /* Subroutine */ void claset_(char *, integer *, integer *, complex *, complex *, complex *, integer *); real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real safmax; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); integer lendsv; - extern /* Subroutine */ int slartg_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slartg_(real *, real *, real *, real *, real * ); real ssfmin; integer nmaxit, icompz; real ssfmax; extern real slanst_(char *, integer *, real *, real *); - extern /* Subroutine */ int slasrt_(char *, integer *, real *, integer *); + extern /* Subroutine */ void slasrt_(char *, integer *, real *, integer *); integer lm1, mm1, nm1; real rt1, rt2, eps; integer lsv; @@ -738,13 +738,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSTEQR", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -752,7 +752,7 @@ f"> */ i__1 = z_dim1 + 1; z__[i__1].r = 1.f, z__[i__1].i = 0.f; } - return 0; + return; } /* Determine the unit roundoff and over/underflow thresholds. 
*/ @@ -1125,7 +1125,7 @@ f"> */ } /* L150: */ } - return 0; + return; } goto L10; @@ -1164,7 +1164,7 @@ f"> */ /* L180: */ } } - return 0; + return; /* End of CSTEQR */ diff --git a/lapack-netlib/SRC/csycon.c b/lapack-netlib/SRC/csycon.c index 0d704e6172..4f606a5fe7 100644 --- a/lapack-netlib/SRC/csycon.c +++ b/lapack-netlib/SRC/csycon.c @@ -637,7 +637,7 @@ f"> */ /* > \ingroup complexSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int csycon_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void csycon_(char *uplo, integer *n, complex *a, integer *lda, integer *ipiv, real *anorm, real *rcond, complex *work, integer * info) { @@ -649,10 +649,11 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real - *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real + *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int csytrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void csytrs_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); @@ -689,7 +690,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -697,9 +698,9 @@ f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm <= 0.f) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. 
*/ @@ -711,7 +712,7 @@ f"> */ for (i__ = *n; i__ >= 1; --i__) { i__1 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__1].r == 0.f && a[i__1].i == 0.f)) { - return 0; + return; } /* L10: */ } @@ -723,7 +724,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__2].r == 0.f && a[i__2].i == 0.f)) { - return 0; + return; } /* L20: */ } @@ -749,7 +750,7 @@ f"> */ *rcond = 1.f / ainvnm / *anorm; } - return 0; + return; /* End of CSYCON */ diff --git a/lapack-netlib/SRC/csycon_3.c b/lapack-netlib/SRC/csycon_3.c index ad0789c2d9..2bb7f3032c 100644 --- a/lapack-netlib/SRC/csycon_3.c +++ b/lapack-netlib/SRC/csycon_3.c @@ -678,7 +678,7 @@ static integer c__1 = 1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int csycon_3_(char *uplo, integer *n, complex *a, integer * +/* Subroutine */ void csycon_3_(char *uplo, integer *n, complex *a, integer * lda, complex *e, integer *ipiv, real *anorm, real *rcond, complex * work, integer *info) { @@ -687,15 +687,16 @@ static integer c__1 = 1; /* Local variables */ integer kase; - extern /* Subroutine */ int csytrs_3_(char *, integer *, integer *, + extern /* Subroutine */ void csytrs_3_(char *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); integer i__; extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real - *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real + *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real ainvnm; @@ -733,7 +734,7 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("CSYCON_3", &i__1, (ftnlen)8); - return 0; + return; } /* Quick return if possible */ @@ -741,9 +742,9 @@ static integer c__1 = 1; *rcond = 0.f; if (*n == 0) { 
*rcond = 1.f; - return 0; + return; } else if (*anorm <= 0.f) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -755,7 +756,7 @@ static integer c__1 = 1; for (i__ = *n; i__ >= 1; --i__) { i__1 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__1].r == 0.f && a[i__1].i == 0.f)) { - return 0; + return; } } } else { @@ -766,7 +767,7 @@ static integer c__1 = 1; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__2].r == 0.f && a[i__2].i == 0.f)) { - return 0; + return; } } } @@ -791,7 +792,7 @@ static integer c__1 = 1; *rcond = 1.f / ainvnm / *anorm; } - return 0; + return; /* End of CSYCON_3 */ diff --git a/lapack-netlib/SRC/csycon_rook.c b/lapack-netlib/SRC/csycon_rook.c index 93a1dbeede..ac0fb0105a 100644 --- a/lapack-netlib/SRC/csycon_rook.c +++ b/lapack-netlib/SRC/csycon_rook.c @@ -651,7 +651,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int csycon_rook_(char *uplo, integer *n, complex *a, +/* Subroutine */ void csycon_rook_(char *uplo, integer *n, complex *a, integer *lda, integer *ipiv, real *anorm, real *rcond, complex *work, integer *info) { @@ -659,14 +659,15 @@ rook.f"> */ integer a_dim1, a_offset, i__1, i__2; /* Local variables */ - extern /* Subroutine */ int csytrs_rook_(char *, integer *, integer *, + extern /* Subroutine */ void csytrs_rook_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); integer kase, i__; extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real - *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real + *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real ainvnm; @@ -703,7 +704,7 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); 
xerbla_("CSYCON_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Quick return if possible */ @@ -711,9 +712,9 @@ rook.f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm <= 0.f) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -725,7 +726,7 @@ rook.f"> */ for (i__ = *n; i__ >= 1; --i__) { i__1 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__1].r == 0.f && a[i__1].i == 0.f)) { - return 0; + return; } /* L10: */ } @@ -737,7 +738,7 @@ rook.f"> */ for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__2].r == 0.f && a[i__2].i == 0.f)) { - return 0; + return; } /* L20: */ } @@ -763,7 +764,7 @@ rook.f"> */ *rcond = 1.f / ainvnm / *anorm; } - return 0; + return; /* End of CSYCON_ROOK */ diff --git a/lapack-netlib/SRC/csyconv.c b/lapack-netlib/SRC/csyconv.c index 55bdc59533..f1f7c1c415 100644 --- a/lapack-netlib/SRC/csyconv.c +++ b/lapack-netlib/SRC/csyconv.c @@ -623,7 +623,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int csyconv_(char *uplo, char *way, integer *n, complex *a, +/* Subroutine */ void csyconv_(char *uplo, char *way, integer *n, complex *a, integer *lda, integer *ipiv, complex *e, integer *info) { /* System generated locals */ @@ -672,13 +672,13 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CSYCONV", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -927,7 +927,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } } - return 0; + return; /* End of CSYCONV */ diff --git a/lapack-netlib/SRC/csyconvf.c b/lapack-netlib/SRC/csyconvf.c index 00e40d0d8b..edb900d16a 100644 --- 
a/lapack-netlib/SRC/csyconvf.c +++ b/lapack-netlib/SRC/csyconvf.c @@ -718,7 +718,7 @@ f.f"> */ /* > */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int csyconvf_(char *uplo, char *way, integer *n, complex *a, +/* Subroutine */ void csyconvf_(char *uplo, char *way, integer *n, complex *a, integer *lda, complex *e, integer *ipiv, integer *info) { /* System generated locals */ @@ -727,7 +727,7 @@ f.f"> */ /* Local variables */ integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); logical upper; integer ip; @@ -768,13 +768,13 @@ f.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYCONVF", &i__1, (ftnlen)8); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -1090,7 +1090,7 @@ f.f"> */ /* End A is LOWER */ } - return 0; + return; /* End of CSYCONVF */ diff --git a/lapack-netlib/SRC/csyconvf_rook.c b/lapack-netlib/SRC/csyconvf_rook.c index 77baad3b6d..d453eda3ec 100644 --- a/lapack-netlib/SRC/csyconvf_rook.c +++ b/lapack-netlib/SRC/csyconvf_rook.c @@ -709,7 +709,7 @@ f_rook.f"> */ /* > */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int csyconvf_rook_(char *uplo, char *way, integer *n, +/* Subroutine */ void csyconvf_rook_(char *uplo, char *way, integer *n, complex *a, integer *lda, complex *e, integer *ipiv, integer *info) { /* System generated locals */ @@ -718,7 +718,7 @@ f_rook.f"> */ /* Local variables */ integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); logical upper; integer ip; @@ -760,13 +760,13 @@ f_rook.f"> */ if (*info != 0) { i__1 = 
-(*info); xerbla_("CSYCONVF_ROOK", &i__1, (ftnlen)13); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -1080,7 +1080,7 @@ f_rook.f"> */ /* End A is LOWER */ } - return 0; + return; /* End of CSYCONVF_ROOK */ diff --git a/lapack-netlib/SRC/csyequb.c b/lapack-netlib/SRC/csyequb.c index 492191f0e4..223619d29c 100644 --- a/lapack-netlib/SRC/csyequb.c +++ b/lapack-netlib/SRC/csyequb.c @@ -645,7 +645,7 @@ static integer c__1 = 1; /* > Tech report version: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.3.1679 */ /* > */ /* ===================================================================== */ -/* Subroutine */ int csyequb_(char *uplo, integer *n, complex *a, integer * +/* Subroutine */ void csyequb_(char *uplo, integer *n, complex *a, integer * lda, real *s, real *scond, real *amax, complex *work, integer *info) { /* System generated locals */ @@ -666,7 +666,7 @@ static integer c__1 = 1; extern real slamch_(char *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; - extern /* Subroutine */ int classq_(integer *, complex *, integer *, real + extern /* Subroutine */ void classq_(integer *, complex *, integer *, real *, real *); real smlnum, avg, std, tol; @@ -701,7 +701,7 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("CSYEQUB", &i__1, (ftnlen)7); - return 0; + return; } up = lsame_(uplo, "U"); *amax = 0.f; @@ -710,7 +710,7 @@ static integer c__1 = 1; if (*n == 0) { *scond = 1.f; - return 0; + return; } i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { @@ -905,7 +905,7 @@ static integer c__1 = 1; d__ = c1 * c1 - c0 * 4 * c2; if (d__ <= 0.f) { *info = -1; - return 0; + return; } si = c0 * -2 / (c1 + sqrt(d__)); d__ = si - s[i__]; @@ -992,6 +992,6 @@ static integer c__1 = 1; } *scond = f2cmax(smin,smlnum) / f2cmin(smax,bignum); - return 0; + return; } /* csyequb_ */ diff --git a/lapack-netlib/SRC/csymv.c b/lapack-netlib/SRC/csymv.c index 
78178cdc20..d07419db57 100644 --- a/lapack-netlib/SRC/csymv.c +++ b/lapack-netlib/SRC/csymv.c @@ -667,7 +667,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexSYauxiliary */ /* ===================================================================== */ -/* Subroutine */ int csymv_(char *uplo, integer *n, complex *alpha, complex * +/* Subroutine */ void csymv_(char *uplo, integer *n, complex *alpha, complex * a, integer *lda, complex *x, integer *incx, complex *beta, complex *y, integer *incy) { @@ -717,14 +717,14 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } if (info != 0) { xerbla_("CSYMV ", &info, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ if (*n == 0 || alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f && beta->i == 0.f)) { - return 0; + return; } /* Set up the start points in X and Y. */ @@ -793,7 +793,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } if (alpha->r == 0.f && alpha->i == 0.f) { - return 0; + return; } if (lsame_(uplo, "U")) { @@ -985,7 +985,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } - return 0; + return; /* End of CSYMV */ diff --git a/lapack-netlib/SRC/csyr.c b/lapack-netlib/SRC/csyr.c index 8cc35ccfb8..e684b20246 100644 --- a/lapack-netlib/SRC/csyr.c +++ b/lapack-netlib/SRC/csyr.c @@ -644,7 +644,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexSYauxiliary */ /* ===================================================================== */ -/* Subroutine */ int csyr_(char *uplo, integer *n, complex *alpha, complex *x, +/* Subroutine */ void csyr_(char *uplo, integer *n, complex *alpha, complex *x, integer *incx, complex *a, integer *lda) { /* System generated locals */ @@ -690,13 +690,13 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } if (info != 0) { 
xerbla_("CSYR ", &info, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ if (*n == 0 || alpha->r == 0.f && alpha->i == 0.f) { - return 0; + return; } /* Set the start point in X if the increment is not unity. */ @@ -835,7 +835,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } - return 0; + return; /* End of CSYR */ diff --git a/lapack-netlib/SRC/csyrfs.c b/lapack-netlib/SRC/csyrfs.c index f242780746..2bd501fc3f 100644 --- a/lapack-netlib/SRC/csyrfs.c +++ b/lapack-netlib/SRC/csyrfs.c @@ -705,7 +705,7 @@ f"> */ /* > \ingroup complexSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int csyrfs_(char *uplo, integer *n, integer *nrhs, complex * +/* Subroutine */ void csyrfs_(char *uplo, integer *n, integer *nrhs, complex * a, integer *lda, complex *af, integer *ldaf, integer *ipiv, complex * b, integer *ldb, complex *x, integer *ldx, real *ferr, real *berr, complex *work, real *rwork, integer *info) @@ -723,12 +723,12 @@ f"> */ real s; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); integer count; logical upper; - extern /* Subroutine */ int csymv_(char *, integer *, complex *, complex * + extern /* Subroutine */ void csymv_(char *, integer *, complex *, complex * , integer *, complex *, integer *, complex *, complex *, integer * ), clacn2_(integer *, complex *, complex *, real *, integer *, integer *); @@ -738,7 +738,7 @@ f"> */ real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real lstres; - extern /* Subroutine */ int csytrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void csytrs_(char *, integer *, integer *, complex *, integer *, integer *, complex *, 
integer *, integer *); real eps; @@ -794,7 +794,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -806,7 +806,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -1042,7 +1042,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of CSYRFS */ diff --git a/lapack-netlib/SRC/csysv.c b/lapack-netlib/SRC/csysv.c index dcdbbb700f..1eecd1711a 100644 --- a/lapack-netlib/SRC/csysv.c +++ b/lapack-netlib/SRC/csysv.c @@ -683,7 +683,7 @@ static integer c_n1 = -1; /* > \ingroup complexSYsolve */ /* ===================================================================== */ -/* Subroutine */ int csysv_(char *uplo, integer *n, integer *nrhs, complex *a, +/* Subroutine */ void csysv_(char *uplo, integer *n, integer *nrhs, complex *a, integer *lda, integer *ipiv, complex *b, integer *ldb, complex *work, integer *lwork, integer *info) { @@ -692,12 +692,13 @@ static integer c_n1 = -1; /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), csytrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void csytrf_( char *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int csytrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void csytrs_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *), csytrs2_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, complex *, integer *); @@ -755,9 +756,9 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("CSYSV ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = U*D*U**T or A = L*D*L**T. 
*/ @@ -787,7 +788,7 @@ static integer c_n1 = -1; work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CSYSV */ diff --git a/lapack-netlib/SRC/csysv.f b/lapack-netlib/SRC/csysv.f index 6f175e381b..4ddabf62fe 100644 --- a/lapack-netlib/SRC/csysv.f +++ b/lapack-netlib/SRC/csysv.f @@ -223,7 +223,7 @@ SUBROUTINE CSYSV( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, LWKOPT = 1 ELSE CALL CSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = REAL( WORK(1) ) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/csysv_aa.c b/lapack-netlib/SRC/csysv_aa.c index 78328b7893..065159fd33 100644 --- a/lapack-netlib/SRC/csysv_aa.c +++ b/lapack-netlib/SRC/csysv_aa.c @@ -674,7 +674,7 @@ a.f"> */ /* > \ingroup complexSYsolve */ /* ===================================================================== */ -/* Subroutine */ int csysv_aa_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void csysv_aa_(char *uplo, integer *n, integer *nrhs, complex *a, integer *lda, integer *ipiv, complex *b, integer *ldb, complex *work, integer *lwork, integer *info) { @@ -684,10 +684,11 @@ a.f"> */ /* Local variables */ extern logical lsame_(char *, char *); integer lwkopt_sytrf__, lwkopt_sytrs__; - extern /* Subroutine */ int csytrf_aa_(char *, integer *, complex *, + extern /* Subroutine */ void csytrf_aa_(char *, integer *, complex *, integer *, integer *, complex *, integer *, integer *), csytrs_aa_(char *, integer *, integer *, complex *, integer *, - integer *, complex *, integer *, complex *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, complex *, integer *, complex *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer lwkopt; logical lquery; @@ -748,9 +749,9 @@ a.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYSV_AA ", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = U**T*T*U or A = L*T*L**T. 
*/ @@ -767,7 +768,7 @@ a.f"> */ work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CSYSV_AA */ diff --git a/lapack-netlib/SRC/csysv_aa_2stage.c b/lapack-netlib/SRC/csysv_aa_2stage.c index 3fef43809a..0a6827fd3f 100644 --- a/lapack-netlib/SRC/csysv_aa_2stage.c +++ b/lapack-netlib/SRC/csysv_aa_2stage.c @@ -698,7 +698,7 @@ asen_2stage.f"> */ /* > \ingroup complexSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int csysv_aa_2stage_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void csysv_aa_2stage_(char *uplo, integer *n, integer *nrhs, complex *a, integer *lda, complex *tb, integer *ltb, integer *ipiv, integer *ipiv2, complex *b, integer *ldb, complex *work, integer * lwork, integer *info) @@ -707,7 +707,7 @@ asen_2stage.f"> */ integer a_dim1, a_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int csytrf_aa_2stage_(char *, integer *, complex + extern /* Subroutine */ void csytrf_aa_2stage_(char *, integer *, complex *, integer *, complex *, integer *, integer *, integer *, complex *, integer *, integer *), csytrs_aa_2stage_(char *, integer *, integer *, complex *, integer *, complex *, integer *, @@ -773,9 +773,9 @@ asen_2stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYSV_AA_2STAGE", &i__1, (ftnlen)15); - return 0; + return; } else if (wquery || tquery) { - return 0; + return; } @@ -794,7 +794,7 @@ asen_2stage.f"> */ work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CSYSV_AA_2STAGE */ diff --git a/lapack-netlib/SRC/csysv_rk.c b/lapack-netlib/SRC/csysv_rk.c index 6b9babdb82..4e731fdf8c 100644 --- a/lapack-netlib/SRC/csysv_rk.c +++ b/lapack-netlib/SRC/csysv_rk.c @@ -740,7 +740,7 @@ k.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int csysv_rk_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void csysv_rk_(char *uplo, 
integer *n, integer *nrhs, complex *a, integer *lda, complex *e, integer *ipiv, complex *b, integer *ldb, complex *work, integer *lwork, integer *info) { @@ -748,12 +748,13 @@ k.f"> */ integer a_dim1, a_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int csytrs_3_(char *, integer *, integer *, + extern /* Subroutine */ void csytrs_3_(char *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int csytrf_rk_(char *, integer *, complex *, - integer *, complex *, integer *, complex *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void csytrf_rk_(char *, integer *, complex *, + integer *, complex *, integer *, complex *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer lwkopt; logical lquery; @@ -811,9 +812,9 @@ k.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYSV_RK ", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = U*D*U**T or A = L*D*L**T. 
*/ @@ -832,7 +833,7 @@ k.f"> */ work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CSYSV_RK */ diff --git a/lapack-netlib/SRC/csysv_rk.f b/lapack-netlib/SRC/csysv_rk.f index 793e39df5a..ef5334dcdf 100644 --- a/lapack-netlib/SRC/csysv_rk.f +++ b/lapack-netlib/SRC/csysv_rk.f @@ -280,7 +280,7 @@ SUBROUTINE CSYSV_RK( UPLO, N, NRHS, A, LDA, E, IPIV, B, LDB, WORK, LWKOPT = 1 ELSE CALL CSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = REAL( WORK(1) ) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/csysv_rook.c b/lapack-netlib/SRC/csysv_rook.c index e1411fa6ac..a55218a8a7 100644 --- a/lapack-netlib/SRC/csysv_rook.c +++ b/lapack-netlib/SRC/csysv_rook.c @@ -717,7 +717,7 @@ ook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int csysv_rook_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void csysv_rook_(char *uplo, integer *n, integer *nrhs, complex *a, integer *lda, integer *ipiv, complex *b, integer *ldb, complex *work, integer *lwork, integer *info) { @@ -725,7 +725,7 @@ ook.f"> */ integer a_dim1, a_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int csytrf_rook_(char *, integer *, complex *, + extern /* Subroutine */ void csytrf_rook_(char *, integer *, complex *, integer *, integer *, complex *, integer *, integer *), csytrs_rook_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); @@ -787,9 +787,9 @@ ook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYSV_ROOK ", &i__1, (ftnlen)11); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = U*D*U**T or A = L*D*L**T. 
*/ @@ -808,7 +808,7 @@ ook.f"> */ work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CSYSV_ROOK */ diff --git a/lapack-netlib/SRC/csysv_rook.f b/lapack-netlib/SRC/csysv_rook.f index daa9f27c41..aad594e21f 100644 --- a/lapack-netlib/SRC/csysv_rook.f +++ b/lapack-netlib/SRC/csysv_rook.f @@ -256,7 +256,7 @@ SUBROUTINE CSYSV_ROOK( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, LWKOPT = 1 ELSE CALL CSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = REAL( WORK(1) ) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/csysvx.c b/lapack-netlib/SRC/csysvx.c index deca3f592d..42146a032a 100644 --- a/lapack-netlib/SRC/csysvx.c +++ b/lapack-netlib/SRC/csysvx.c @@ -797,7 +797,7 @@ f"> */ /* > \ingroup complexSYsolve */ /* ===================================================================== */ -/* Subroutine */ int csysvx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void csysvx_(char *fact, char *uplo, integer *n, integer * nrhs, complex *a, integer *lda, complex *af, integer *ldaf, integer * ipiv, complex *b, integer *ldb, complex *x, integer *ldx, real *rcond, real *ferr, real *berr, complex *work, integer *lwork, real *rwork, @@ -813,14 +813,14 @@ f"> */ integer nb; extern real slamch_(char *); logical nofact; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex - *, integer *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex + *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); extern real clansy_(char *, char *, integer *, complex *, integer *, real *); - extern /* Subroutine */ int csycon_(char *, integer *, complex *, integer + extern /* Subroutine */ void csycon_(char *, integer *, complex *, integer *, integer *, real *, real 
*, complex *, integer *), csyrfs_(char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *, complex *, integer *, complex *, @@ -829,7 +829,7 @@ f"> */ complex *, integer *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int csytrs_(char *, integer *, integer *, complex + extern /* Subroutine */ void csytrs_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); @@ -909,9 +909,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYSVX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } if (nofact) { @@ -926,7 +926,7 @@ f"> */ if (*info > 0) { *rcond = 0.f; - return 0; + return; } } @@ -960,7 +960,7 @@ f"> */ work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CSYSVX */ diff --git a/lapack-netlib/SRC/csysvxx.c b/lapack-netlib/SRC/csysvxx.c index 4027ed9718..5c0269d28f 100644 --- a/lapack-netlib/SRC/csysvxx.c +++ b/lapack-netlib/SRC/csysvxx.c @@ -1015,7 +1015,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexSYsolve */ /* ===================================================================== */ -/* Subroutine */ int csysvxx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void csysvxx_(char *fact, char *uplo, integer *n, integer * nrhs, complex *a, integer *lda, complex *af, integer *ldaf, integer * ipiv, char *equed, real *s, complex *b, integer *ldb, complex *x, integer *ldx, real *rcond, real *rpvgrw, real *berr, integer * @@ -1038,17 +1038,17 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ logical equil, rcequ; extern real slamch_(char *); logical nofact; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex - *, integer *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex + *, integer *, complex 
*, integer *); + extern int xerbla_(char *, integer *, ftnlen); real bignum; integer infequ; - extern /* Subroutine */ int claqsy_(char *, integer *, complex *, integer + extern /* Subroutine */ void claqsy_(char *, integer *, complex *, integer *, real *, real *, real *, char *), csytrf_(char * , integer *, complex *, integer *, integer *, complex *, integer * , integer *); real smlnum; - extern /* Subroutine */ int clascl2_(integer *, integer *, real *, + extern /* Subroutine */ void clascl2_(integer *, integer *, real *, complex *, integer *), csytrs_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *), csyequb_(char *, integer *, complex *, integer *, real *, real *, real *, complex *, integer *), csyrfsx_(char *, @@ -1164,7 +1164,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CSYSVXX", &i__1, (ftnlen)7); - return 0; + return; } if (equil) { @@ -1209,7 +1209,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rpvgrw = cla_syrpvgrw_(uplo, n, info, &a[a_offset], lda, & af[af_offset], ldaf, &ipiv[1], &rwork[1]); } - return 0; + return; } } @@ -1241,7 +1241,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ clascl2_(n, nrhs, &s[1], &x[x_offset], ldx); } - return 0; + return; /* End of CSYSVXX */ diff --git a/lapack-netlib/SRC/csyswapr.c b/lapack-netlib/SRC/csyswapr.c index 4a56c9ee53..69774cbc08 100644 --- a/lapack-netlib/SRC/csyswapr.c +++ b/lapack-netlib/SRC/csyswapr.c @@ -616,7 +616,7 @@ r.f"> */ /* > \ingroup complexSYauxiliary */ /* ===================================================================== */ -/* Subroutine */ int csyswapr_(char *uplo, integer *n, complex *a, integer * +/* Subroutine */ void csyswapr_(char *uplo, integer *n, complex *a, integer * lda, integer *i1, integer *i2) { /* System generated locals */ @@ -625,7 +625,7 @@ r.f"> */ /* Local variables 
*/ integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); logical upper; complex tmp; @@ -736,6 +736,6 @@ r.f"> */ } } - return 0; + return; } /* csyswapr_ */ diff --git a/lapack-netlib/SRC/csyswapr.f b/lapack-netlib/SRC/csyswapr.f index 185d819225..04004f3c11 100644 --- a/lapack-netlib/SRC/csyswapr.f +++ b/lapack-netlib/SRC/csyswapr.f @@ -58,15 +58,13 @@ *> \param[in,out] A *> \verbatim *> A is COMPLEX array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers -*> used to obtain the factor U or L as computed by CSYTRF. -*> -*> On exit, if INFO = 0, the (symmetric) inverse of the original -*> matrix. If UPLO = 'U', the upper triangular part of the -*> inverse is formed and the part of A below the diagonal is not -*> referenced; if UPLO = 'L' the lower triangular part of the -*> inverse is formed and the part of A above the diagonal is -*> not referenced. +*> On entry, the N-by-N matrix A. On exit, the permuted matrix +*> where the rows I1 and I2 and columns I1 and I2 are interchanged. +*> If UPLO = 'U', the interchanges are applied to the upper +*> triangular part and the strictly lower triangular part of A is +*> not referenced; if UPLO = 'L', the interchanges are applied to +*> the lower triangular part and the part of A above the diagonal +*> is not referenced. *> \endverbatim *> *> \param[in] LDA @@ -116,7 +114,6 @@ SUBROUTINE CSYSWAPR( UPLO, N, A, LDA, I1, I2) * .. * .. Local Scalars .. LOGICAL UPPER - INTEGER I COMPLEX TMP * * .. External Functions .. 
@@ -143,19 +140,12 @@ SUBROUTINE CSYSWAPR( UPLO, N, A, LDA, I1, I2) A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1,I1+I) - A(I1,I1+I)=A(I1+I,I2) - A(I1+I,I2)=TMP - END DO + CALL CSWAP( I2-I1-1, A(I1,I1+1), LDA, A(I1+1,I2), 1 ) * * third swap * - swap row I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I1,I) - A(I1,I)=A(I2,I) - A(I2,I)=TMP - END DO + IF ( I2.LT.N ) + $ CALL CSWAP( N-I2, A(I1,I2+1), LDA, A(I2,I2+1), LDA ) * ELSE * @@ -171,19 +161,12 @@ SUBROUTINE CSYSWAPR( UPLO, N, A, LDA, I1, I2) A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1+I,I1) - A(I1+I,I1)=A(I2,I1+I) - A(I2,I1+I)=TMP - END DO + CALL CSWAP( I2-I1-1, A(I1+1,I1), 1, A(I2,I1+1), LDA ) * * third swap * - swap col I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I,I1) - A(I,I1)=A(I,I2) - A(I,I2)=TMP - END DO + IF ( I2.LT.N ) + $ CALL CSWAP( N-I2, A(I2+1,I1), 1, A(I2+1,I2), 1 ) * ENDIF END SUBROUTINE CSYSWAPR diff --git a/lapack-netlib/SRC/csytf2.c b/lapack-netlib/SRC/csytf2.c index fc452299c8..eb6eeb4d01 100644 --- a/lapack-netlib/SRC/csytf2.c +++ b/lapack-netlib/SRC/csytf2.c @@ -706,7 +706,7 @@ f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int csytf2_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void csytf2_(char *uplo, integer *n, complex *a, integer *lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -716,15 +716,15 @@ f"> */ /* Local variables */ integer imax, jmax; - extern /* Subroutine */ int csyr_(char *, integer *, complex *, complex *, + extern /* Subroutine */ void csyr_(char *, integer *, complex *, complex *, integer *, complex *, integer *); integer i__, j, k; complex t; real alpha; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + 
extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer kstep; logical upper; @@ -770,7 +770,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYTF2", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. */ @@ -1285,7 +1285,7 @@ f"> */ } L70: - return 0; + return; /* End of CSYTF2 */ diff --git a/lapack-netlib/SRC/csytf2_rk.c b/lapack-netlib/SRC/csytf2_rk.c index ff2dc4208f..fc5beefa89 100644 --- a/lapack-netlib/SRC/csytf2_rk.c +++ b/lapack-netlib/SRC/csytf2_rk.c @@ -756,7 +756,7 @@ rk.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int csytf2_rk_(char *uplo, integer *n, complex *a, integer * +/* Subroutine */ void csytf2_rk_(char *uplo, integer *n, complex *a, integer * lda, complex *e, integer *ipiv, integer *info) { /* System generated locals */ @@ -767,16 +767,16 @@ rk.f"> */ /* Local variables */ logical done; integer imax, jmax; - extern /* Subroutine */ int csyr_(char *, integer *, complex *, complex *, + extern /* Subroutine */ void csyr_(char *, integer *, complex *, complex *, integer *, complex *, integer *); integer i__, j, k, p; complex t; real alpha; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); real sfmin; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer itemp, kstep; real stemp; @@ -823,7 +823,7 @@ rk.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYTF2_RK", &i__1, (ftnlen)9); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. 
*/ @@ -1681,7 +1681,7 @@ rk.f"> */ ; } - return 0; + return; /* End of CSYTF2_RK */ diff --git a/lapack-netlib/SRC/csytf2_rook.c b/lapack-netlib/SRC/csytf2_rook.c index 92aa52c515..9fab25bb3e 100644 --- a/lapack-netlib/SRC/csytf2_rook.c +++ b/lapack-netlib/SRC/csytf2_rook.c @@ -709,7 +709,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int csytf2_rook_(char *uplo, integer *n, complex *a, +/* Subroutine */ void csytf2_rook_(char *uplo, integer *n, complex *a, integer *lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -720,16 +720,16 @@ rook.f"> */ /* Local variables */ logical done; integer imax, jmax; - extern /* Subroutine */ int csyr_(char *, integer *, complex *, complex *, + extern /* Subroutine */ void csyr_(char *, integer *, complex *, complex *, integer *, complex *, integer *); integer i__, j, k, p; complex t; real alpha; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); real sfmin; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer itemp, kstep; real stemp; @@ -775,7 +775,7 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYTF2_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. 
*/ @@ -1524,7 +1524,7 @@ rook.f"> */ L70: - return 0; + return; /* End of CSYTF2_ROOK */ diff --git a/lapack-netlib/SRC/csytrf.c b/lapack-netlib/SRC/csytrf.c index 9a3276164a..56aef71daa 100644 --- a/lapack-netlib/SRC/csytrf.c +++ b/lapack-netlib/SRC/csytrf.c @@ -697,7 +697,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int csytrf_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void csytrf_(char *uplo, integer *n, complex *a, integer *lda, integer *ipiv, complex *work, integer *lwork, integer *info) { /* System generated locals */ @@ -708,13 +708,13 @@ f"> */ extern logical lsame_(char *, char *); integer nbmin, iinfo; logical upper; - extern /* Subroutine */ int csytf2_(char *, integer *, complex *, integer + extern /* Subroutine */ void csytf2_(char *, integer *, complex *, integer *, integer *, integer *); integer kb, nb; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int clasyf_(char *, integer *, integer *, integer + extern /* Subroutine */ void clasyf_(char *, integer *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); integer ldwork, lwkopt; @@ -767,9 +767,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYTRF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } nbmin = 2; @@ -896,7 +896,7 @@ f"> */ L40: work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CSYTRF */ diff --git a/lapack-netlib/SRC/csytrf_aa.c b/lapack-netlib/SRC/csytrf_aa.c index 75c8e85bdf..986295a520 100644 --- a/lapack-netlib/SRC/csytrf_aa.c +++ b/lapack-netlib/SRC/csytrf_aa.c @@ -648,7 +648,7 @@ aa.f"> */ /* > \ingroup complexSYcomputational */ /* ===================================================================== */ -/* 
Subroutine */ int csytrf_aa_(char *uplo, integer *n, complex *a, integer * +/* Subroutine */ void csytrf_aa_(char *uplo, integer *n, complex *a, integer * lda, integer *ipiv, complex *work, integer *lwork, integer *info) { /* System generated locals */ @@ -657,12 +657,12 @@ aa.f"> */ /* Local variables */ integer j; complex alpha; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), cgemm_(char *, char *, integer *, integer *, integer * , complex *, complex *, integer *, complex *, integer *, complex * , complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int clasyf_aa_(char *, integer *, integer *, + extern /* Subroutine */ void clasyf_aa_(char *, integer *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, complex *), cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * @@ -728,19 +728,19 @@ aa.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYTRF_AA", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return */ if (*n == 0) { - return 0; + return; } ipiv[1] = 1; if (*n == 1) { - return 0; + return; } /* Adjust block size based on the workspace size */ @@ -1034,7 +1034,7 @@ aa.f"> */ } L20: - return 0; + return; /* End of CSYTRF_AA */ diff --git a/lapack-netlib/SRC/csytrf_aa_2stage.c b/lapack-netlib/SRC/csytrf_aa_2stage.c index b3539e9fa3..52f595dba1 100644 --- a/lapack-netlib/SRC/csytrf_aa_2stage.c +++ b/lapack-netlib/SRC/csytrf_aa_2stage.c @@ -675,7 +675,7 @@ aa_2stage.f"> */ /* > \ingroup complexSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int csytrf_aa_2stage_(char *uplo, integer *n, complex *a, +/* Subroutine */ void csytrf_aa_2stage_(char *uplo, integer *n, complex *a, integer *lda, complex *tb, integer *ltb, integer *ipiv, 
integer * ipiv2, complex *work, integer *lwork, integer *info) { @@ -685,12 +685,12 @@ aa_2stage.f"> */ /* Local variables */ integer ldtb, i__, j, k; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), cswap_(integer *, complex *, integer *, complex *, integer *), ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, @@ -698,13 +698,14 @@ aa_2stage.f"> */ integer i1; logical upper; integer i2, jb, kb, nb, td, nt; - extern /* Subroutine */ int cgbtrf_(integer *, integer *, integer *, - integer *, complex *, integer *, integer *, integer *), cgetrf_( - integer *, integer *, complex *, integer *, integer *, integer *), - clacpy_(char *, integer *, integer *, complex *, integer *, + extern /* Subroutine */ void cgbtrf_(integer *, integer *, integer *, + integer *, complex *, integer *, integer *, integer *); + extern int cgetrf_( + integer *, integer *, complex *, integer *, integer *, integer *); + extern void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, integer *, integer - *, complex *, complex *, complex *, integer *), xerbla_( - char *, integer *, ftnlen); + *, complex *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); logical tquery, wquery; @@ -752,7 +753,7 @@ aa_2stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYTRF_AA_2STAGE", &i__1, (ftnlen)16); - return 0; + return; } /* Answer the query */ 
@@ -770,13 +771,13 @@ aa_2stage.f"> */ } } if (tquery || wquery) { - return 0; + return; } /* Quick return */ if (*n == 0) { - return 0; + return; } /* Determine the number of the block size */ @@ -1277,7 +1278,7 @@ aa_2stage.f"> */ /* Factor the band matrix */ cgbtrf_(n, n, &nb, &nb, &tb[1], &ldtb, &ipiv2[1], info); - return 0; + return; /* End of CSYTRF_AA_2STAGE */ diff --git a/lapack-netlib/SRC/csytrf_rk.c b/lapack-netlib/SRC/csytrf_rk.c index 44a7f2d3a0..e26e6f8fa7 100644 --- a/lapack-netlib/SRC/csytrf_rk.c +++ b/lapack-netlib/SRC/csytrf_rk.c @@ -774,7 +774,7 @@ rk.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int csytrf_rk_(char *uplo, integer *n, complex *a, integer * +/* Subroutine */ void csytrf_rk_(char *uplo, integer *n, complex *a, integer * lda, complex *e, integer *ipiv, complex *work, integer *lwork, integer *info) { @@ -783,14 +783,14 @@ rk.f"> */ /* Local variables */ integer i__, k; - extern /* Subroutine */ int csytf2_rk_(char *, integer *, complex *, + extern /* Subroutine */ void csytf2_rk_(char *, integer *, complex *, integer *, complex *, integer *, integer *); extern logical lsame_(char *, char *); integer nbmin, iinfo; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); logical upper; - extern /* Subroutine */ int clasyf_rk_(char *, integer *, integer *, + extern /* Subroutine */ void clasyf_rk_(char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); integer kb, nb, ip; @@ -848,9 +848,9 @@ rk.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYTRF_RK", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } nbmin = 2; @@ -1036,7 +1036,7 @@ rk.f"> */ } work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CSYTRF_RK */ diff --git 
a/lapack-netlib/SRC/csytrf_rook.c b/lapack-netlib/SRC/csytrf_rook.c index 97d75cc8c9..fbff30524d 100644 --- a/lapack-netlib/SRC/csytrf_rook.c +++ b/lapack-netlib/SRC/csytrf_rook.c @@ -723,7 +723,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int csytrf_rook_(char *uplo, integer *n, complex *a, +/* Subroutine */ void csytrf_rook_(char *uplo, integer *n, complex *a, integer *lda, integer *ipiv, complex *work, integer *lwork, integer * info) { @@ -742,7 +742,7 @@ rook.f"> */ integer ldwork, lwkopt; logical lquery; integer iws; - extern /* Subroutine */ int csytf2_rook_(char *, integer *, complex *, + extern /* Subroutine */ void csytf2_rook_(char *, integer *, complex *, integer *, integer *, integer *), clasyf_rook_(char *, integer *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); @@ -795,9 +795,9 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYTRF_ROOK", &i__1, (ftnlen)11); - return 0; + return; } else if (lquery) { - return 0; + return; } nbmin = 2; @@ -927,7 +927,7 @@ rook.f"> */ L40: work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CSYTRF_ROOK */ diff --git a/lapack-netlib/SRC/csytri.c b/lapack-netlib/SRC/csytri.c index 751c1f2b2f..f7a9ac3fb1 100644 --- a/lapack-netlib/SRC/csytri.c +++ b/lapack-netlib/SRC/csytri.c @@ -629,7 +629,7 @@ f"> */ /* > \ingroup complexSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int csytri_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void csytri_(char *uplo, integer *n, complex *a, integer *lda, integer *ipiv, complex *work, integer *info) { /* System generated locals */ @@ -641,15 +641,15 @@ f"> */ integer k; complex t; extern logical lsame_(char *, char *); - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, 
complex *, integer *, complex *, integer *); extern /* Complex */ VOID cdotu_(complex *, integer *, complex *, integer *, complex *, integer *); - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer kstep; logical upper; - extern /* Subroutine */ int csymv_(char *, integer *, complex *, complex * + extern /* Subroutine */ void csymv_(char *, integer *, complex *, complex * , integer *, complex *, integer *, complex *, complex *, integer * ); complex ak; @@ -689,13 +689,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -707,7 +707,7 @@ f"> */ for (*info = *n; *info >= 1; --(*info)) { i__1 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__1].r == 0.f && a[i__1].i == 0.f)) { - return 0; + return; } /* L10: */ } @@ -719,7 +719,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__2].r == 0.f && a[i__2].i == 0.f)) { - return 0; + return; } /* L20: */ } @@ -1032,7 +1032,7 @@ f"> */ ; } - return 0; + return; /* End of CSYTRI */ diff --git a/lapack-netlib/SRC/csytri2.c b/lapack-netlib/SRC/csytri2.c index eaff83aa1f..d20e8fe1c9 100644 --- a/lapack-netlib/SRC/csytri2.c +++ b/lapack-netlib/SRC/csytri2.c @@ -641,14 +641,14 @@ static integer c_n1 = -1; /* > \ingroup complexSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int csytri2_(char *uplo, integer *n, complex *a, integer * +/* Subroutine */ void csytri2_(char *uplo, integer *n, complex *a, integer * lda, integer *ipiv, complex *work, integer *lwork, integer *info) { /* System generated locals */ integer a_dim1, a_offset, i__1; /* Local variables */ - extern /* Subroutine */ int csytri2x_(char 
*, integer *, complex *, + extern /* Subroutine */ void csytri2x_(char *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); extern logical lsame_(char *, char *); integer nbmax; @@ -656,7 +656,7 @@ static integer c_n1 = -1; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int csytri_(char *, integer *, complex *, integer + extern /* Subroutine */ void csytri_(char *, integer *, complex *, integer *, integer *, complex *, integer *); logical lquery; integer minsize; @@ -709,13 +709,13 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("CSYTRI2", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { work[1].r = (real) minsize, work[1].i = 0.f; - return 0; + return; } if (*n == 0) { - return 0; + return; } if (nbmax >= *n) { csytri_(uplo, n, &a[a_offset], lda, &ipiv[1], &work[1], info); @@ -723,7 +723,7 @@ static integer c_n1 = -1; csytri2x_(uplo, n, &a[a_offset], lda, &ipiv[1], &work[1], &nbmax, info); } - return 0; + return; /* End of CSYTRI2 */ diff --git a/lapack-netlib/SRC/csytri2x.c b/lapack-netlib/SRC/csytri2x.c index 1444c3e603..2e0083ccbb 100644 --- a/lapack-netlib/SRC/csytri2x.c +++ b/lapack-netlib/SRC/csytri2x.c @@ -634,7 +634,7 @@ x.f"> */ /* > \ingroup complexSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int csytri2x_(char *uplo, integer *n, complex *a, integer * +/* Subroutine */ void csytri2x_(char *uplo, integer *n, complex *a, integer * lda, integer *ipiv, complex *work, integer *nb, integer *info) { /* System generated locals */ @@ -645,17 +645,17 @@ x.f"> */ /* Local variables */ integer invd; complex akkp1; - extern /* Subroutine */ int csyswapr_(char *, integer *, complex *, + extern /* Subroutine */ void csyswapr_(char *, integer *, complex *, integer *, integer *, integer 
*); complex d__; integer i__, j, k; complex t; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrmm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); integer count; @@ -664,11 +664,12 @@ x.f"> */ integer u11; complex u11_i_j__; integer ip; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), ctrtri_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int ctrtri_( char *, char *, integer *, complex *, integer *, integer *); integer nnb, cut; complex akp1; - extern /* Subroutine */ int csyconv_(char *, char *, integer *, complex *, + extern /* Subroutine */ void csyconv_(char *, char *, integer *, complex *, integer *, integer *, complex *, integer *); complex u01_ip1_j__, u11_ip1_j__; @@ -710,10 +711,10 @@ x.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYTRI2X", &i__1, (ftnlen)8); - return 0; + return; } if (*n == 0) { - return 0; + return; } /* Convert A */ @@ -731,7 +732,7 @@ x.f"> */ for (*info = *n; *info >= 1; --(*info)) { i__1 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__1].r == 0.f && a[i__1].i == 0.f)) { - return 0; + return; } } } else { @@ -742,7 +743,7 @@ x.f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__2].r == 0.f && a[i__2].i == 0.f)) { - return 0; + return; } } } @@ -1373,7 +1374,7 @@ x.f"> */ } } - return 0; + return; /* End of CSYTRI2X */ diff --git a/lapack-netlib/SRC/csytri_3.c b/lapack-netlib/SRC/csytri_3.c index db99057e30..2622d4c450 100644 --- a/lapack-netlib/SRC/csytri_3.c +++ b/lapack-netlib/SRC/csytri_3.c @@ -683,7 
+683,7 @@ static integer c_n1 = -1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int csytri_3_(char *uplo, integer *n, complex *a, integer * +/* Subroutine */ void csytri_3_(char *uplo, integer *n, complex *a, integer * lda, complex *e, integer *ipiv, complex *work, integer *lwork, integer *info) { @@ -692,7 +692,7 @@ static integer c_n1 = -1; /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int csytri_3x_(char *, integer *, complex *, + extern /* Subroutine */ void csytri_3x_(char *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); logical upper; integer nb; @@ -748,16 +748,16 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("CSYTRI_3", &i__1, (ftnlen)8); - return 0; + return; } else if (lquery) { work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } csytri_3x_(uplo, n, &a[a_offset], lda, &e[1], &ipiv[1], &work[1], &nb, @@ -765,7 +765,7 @@ static integer c_n1 = -1; work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CSYTRI_3 */ diff --git a/lapack-netlib/SRC/csytri_3x.c b/lapack-netlib/SRC/csytri_3x.c index 83a868acb1..6d543a5a34 100644 --- a/lapack-netlib/SRC/csytri_3x.c +++ b/lapack-netlib/SRC/csytri_3x.c @@ -673,7 +673,7 @@ static complex c_b2 = {0.f,0.f}; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int csytri_3x_(char *uplo, integer *n, complex *a, integer * +/* Subroutine */ void csytri_3x_(char *uplo, integer *n, complex *a, integer * lda, complex *e, integer *ipiv, complex *work, integer *nb, integer * info) { @@ -685,16 +685,16 @@ static complex c_b2 = {0.f,0.f}; /* Local variables */ integer invd; complex akkp1; - extern /* Subroutine */ int csyswapr_(char *, integer *, complex *, + extern /* Subroutine */ void 
csyswapr_(char *, integer *, complex *, integer *, integer *, integer *); complex d__; integer i__, j, k; complex t; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrmm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); logical upper; @@ -747,10 +747,10 @@ static complex c_b2 = {0.f,0.f}; if (*info != 0) { i__1 = -(*info); xerbla_("CSYTRI_3X", &i__1, (ftnlen)9); - return 0; + return; } if (*n == 0) { - return 0; + return; } /* Workspace got Non-diag elements of D */ @@ -771,7 +771,7 @@ static complex c_b2 = {0.f,0.f}; for (*info = *n; *info >= 1; --(*info)) { i__1 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__1].r == 0.f && a[i__1].i == 0.f)) { - return 0; + return; } } } else { @@ -782,7 +782,7 @@ static complex c_b2 = {0.f,0.f}; for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__2].r == 0.f && a[i__2].i == 0.f)) { - return 0; + return; } } } @@ -1422,7 +1422,7 @@ static complex c_b2 = {0.f,0.f}; } - return 0; + return; /* End of CSYTRI_3X */ diff --git a/lapack-netlib/SRC/csytri_rook.c b/lapack-netlib/SRC/csytri_rook.c index 4759f866e8..9718a082a0 100644 --- a/lapack-netlib/SRC/csytri_rook.c +++ b/lapack-netlib/SRC/csytri_rook.c @@ -644,7 +644,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int csytri_rook_(char *uplo, integer *n, complex *a, +/* Subroutine */ void csytri_rook_(char *uplo, integer *n, complex *a, integer *lda, integer *ipiv, complex *work, integer *info) { /* System generated locals */ @@ -656,15 +656,15 @@ 
rook.f"> */ integer k; complex t; extern logical lsame_(char *, char *); - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); extern /* Complex */ VOID cdotu_(complex *, integer *, complex *, integer *, complex *, integer *); - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *); integer kstep; logical upper; - extern /* Subroutine */ int csymv_(char *, integer *, complex *, complex * + extern /* Subroutine */ void csymv_(char *, integer *, complex *, complex * , integer *, complex *, integer *, complex *, complex *, integer * ); complex ak; @@ -704,13 +704,13 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYTRI_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. 
*/ @@ -722,7 +722,7 @@ rook.f"> */ for (*info = *n; *info >= 1; --(*info)) { i__1 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__1].r == 0.f && a[i__1].i == 0.f)) { - return 0; + return; } /* L10: */ } @@ -734,7 +734,7 @@ rook.f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__2].r == 0.f && a[i__2].i == 0.f)) { - return 0; + return; } /* L20: */ } @@ -1137,7 +1137,7 @@ rook.f"> */ ; } - return 0; + return; /* End of CSYTRI_ROOK */ diff --git a/lapack-netlib/SRC/csytrs.c b/lapack-netlib/SRC/csytrs.c index e15ad31f69..7a51af61f4 100644 --- a/lapack-netlib/SRC/csytrs.c +++ b/lapack-netlib/SRC/csytrs.c @@ -634,7 +634,7 @@ f"> */ /* > \ingroup complexSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int csytrs_(char *uplo, integer *n, integer *nrhs, complex * +/* Subroutine */ void csytrs_(char *uplo, integer *n, integer *nrhs, complex * a, integer *lda, integer *ipiv, complex *b, integer *ldb, integer * info) { @@ -645,11 +645,11 @@ f"> */ /* Local variables */ complex akm1k; integer j, k; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); complex denom; - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *), cgeru_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *), @@ -696,13 +696,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -1050,7 +1050,7 @@ f"> */ ; } - return 0; + return; /* End of CSYTRS */ diff --git 
a/lapack-netlib/SRC/csytrs2.c b/lapack-netlib/SRC/csytrs2.c index 2c67ba4df7..8b100db3be 100644 --- a/lapack-netlib/SRC/csytrs2.c +++ b/lapack-netlib/SRC/csytrs2.c @@ -644,7 +644,7 @@ static complex c_b1 = {1.f,0.f}; /* > \ingroup complexSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int csytrs2_(char *uplo, integer *n, integer *nrhs, complex * +/* Subroutine */ void csytrs2_(char *uplo, integer *n, integer *nrhs, complex * a, integer *lda, integer *ipiv, complex *b, integer *ldb, complex * work, integer *info) { @@ -655,12 +655,12 @@ static complex c_b1 = {1.f,0.f}; /* Local variables */ complex akm1k; integer i__, j, k; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); complex denom; integer iinfo; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *), ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); @@ -669,7 +669,7 @@ static complex c_b1 = {1.f,0.f}; integer kp; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); complex akm1, bkm1; - extern /* Subroutine */ int csyconv_(char *, char *, integer *, complex *, + extern /* Subroutine */ void csyconv_(char *, char *, integer *, complex *, integer *, integer *, complex *, integer *); @@ -709,13 +709,13 @@ static complex c_b1 = {1.f,0.f}; if (*info != 0) { i__1 = -(*info); xerbla_("CSYTRS2", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* Convert A */ @@ -936,7 +936,7 @@ static complex c_b1 = {1.f,0.f}; csyconv_(uplo, "R", n, &a[a_offset], lda, &ipiv[1], &work[1], &iinfo); - return 0; + return; /* End of CSYTRS2 */ diff --git 
a/lapack-netlib/SRC/csytrs_3.c b/lapack-netlib/SRC/csytrs_3.c index f5c096769b..739718c1ce 100644 --- a/lapack-netlib/SRC/csytrs_3.c +++ b/lapack-netlib/SRC/csytrs_3.c @@ -677,7 +677,7 @@ static complex c_b1 = {1.f,0.f}; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int csytrs_3_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void csytrs_3_(char *uplo, integer *n, integer *nrhs, complex *a, integer *lda, complex *e, integer *ipiv, complex *b, integer *ldb, integer *info) { @@ -688,11 +688,11 @@ static complex c_b1 = {1.f,0.f}; /* Local variables */ complex akm1k; integer i__, j, k; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); complex denom; - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *), ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); @@ -739,13 +739,13 @@ static complex c_b1 = {1.f,0.f}; if (*info != 0) { i__1 = -(*info); xerbla_("CSYTRS_3", &i__1, (ftnlen)8); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -933,7 +933,7 @@ static complex c_b1 = {1.f,0.f}; } - return 0; + return; /* End of CSYTRS_3 */ diff --git a/lapack-netlib/SRC/csytrs_aa.c b/lapack-netlib/SRC/csytrs_aa.c index 8a7c4ec93d..719a494a69 100644 --- a/lapack-netlib/SRC/csytrs_aa.c +++ b/lapack-netlib/SRC/csytrs_aa.c @@ -644,7 +644,7 @@ aa.f"> */ /* > \ingroup complexSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int csytrs_aa_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void csytrs_aa_(char *uplo, integer *n, integer *nrhs, complex *a, integer 
*lda, integer *ipiv, complex *b, integer *ldb, complex *work, integer *lwork, integer *info) { @@ -654,16 +654,16 @@ aa.f"> */ /* Local variables */ integer k; extern logical lsame_(char *, char *); - extern /* Subroutine */ int cswap_(integer *, complex *, integer *, + extern /* Subroutine */ void cswap_(integer *, complex *, integer *, complex *, integer *), cgtsv_(integer *, integer *, complex *, complex *, complex *, complex *, integer *, integer *), ctrsm_( char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); logical upper; integer kp; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex - *, integer *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex + *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer lwkopt; logical lquery; @@ -712,17 +712,17 @@ aa.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYTRS_AA", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { lwkopt = *n * 3 - 2; work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -855,7 +855,7 @@ aa.f"> */ } - return 0; + return; /* End of CSYTRS_AA */ diff --git a/lapack-netlib/SRC/csytrs_aa_2stage.c b/lapack-netlib/SRC/csytrs_aa_2stage.c index 6b0d725138..fedd0e4688 100644 --- a/lapack-netlib/SRC/csytrs_aa_2stage.c +++ b/lapack-netlib/SRC/csytrs_aa_2stage.c @@ -653,7 +653,7 @@ aa_2stage.f"> */ /* > \ingroup complexSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int csytrs_aa_2stage_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void csytrs_aa_2stage_(char *uplo, integer *n, integer *nrhs, complex *a, integer *lda, complex *tb, integer *ltb, integer *ipiv, integer *ipiv2, complex *b, integer *ldb, 
integer *info) { @@ -663,15 +663,16 @@ aa_2stage.f"> */ /* Local variables */ integer ldtb; extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); logical upper; integer nb; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), cgbtrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void cgbtrs_( char *, integer *, integer *, integer *, integer *, complex *, - integer *, integer *, complex *, integer *, integer *), - claswp_(integer *, complex *, integer *, integer *, integer *, + integer *, integer *, complex *, integer *, integer *); + extern int claswp_(integer *, complex *, integer *, integer *, integer *, integer *, integer *); @@ -715,13 +716,13 @@ aa_2stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYTRS_AA_2STAGE", &i__1, (ftnlen)16); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* Read NB and compute LDTB */ @@ -806,7 +807,7 @@ aa_2stage.f"> */ } } - return 0; + return; /* End of CSYTRS_AA_2STAGE */ diff --git a/lapack-netlib/SRC/csytrs_rook.c b/lapack-netlib/SRC/csytrs_rook.c index a0ed21b946..f80623ef24 100644 --- a/lapack-netlib/SRC/csytrs_rook.c +++ b/lapack-netlib/SRC/csytrs_rook.c @@ -649,7 +649,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int csytrs_rook_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void csytrs_rook_(char *uplo, integer *n, integer *nrhs, complex *a, integer *lda, integer *ipiv, complex *b, integer *ldb, integer *info) { @@ -660,11 +660,11 @@ rook.f"> */ /* Local variables */ complex akm1k; integer j, k; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, 
complex *, complex *, integer *); extern logical lsame_(char *, char *); complex denom; - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *), cgeru_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *), @@ -711,13 +711,13 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CSYTRS_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -1092,7 +1092,7 @@ rook.f"> */ ; } - return 0; + return; /* End of CSYTRS_ROOK */ diff --git a/lapack-netlib/SRC/ctbcon.c b/lapack-netlib/SRC/ctbcon.c index f51938fe55..effcb30a06 100644 --- a/lapack-netlib/SRC/ctbcon.c +++ b/lapack-netlib/SRC/ctbcon.c @@ -654,7 +654,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ctbcon_(char *norm, char *uplo, char *diag, integer *n, +/* Subroutine */ void ctbcon_(char *norm, char *uplo, char *diag, integer *n, integer *kd, complex *ab, integer *ldab, real *rcond, complex *work, real *rwork, integer *info) { @@ -669,19 +669,19 @@ f"> */ integer isave[3]; real anorm; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); real xnorm; integer ix; extern integer icamax_(integer *, complex *, integer *); extern real clantb_(char *, char *, char *, integer *, integer *, complex *, integer *, real *), slamch_(char *); - extern /* Subroutine */ int clatbs_(char *, char *, char *, char *, + extern /* Subroutine */ void clatbs_(char *, char *, char *, char *, integer *, integer *, complex *, integer *, complex *, real *, - real *, integer *), 
xerbla_(char * - , integer *, ftnlen); + real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int csrscl_(integer *, real *, complex *, integer + extern /* Subroutine */ void csrscl_(integer *, real *, complex *, integer *); logical onenrm; char normin[1]; @@ -729,14 +729,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTBCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *rcond = 1.f; - return 0; + return; } *rcond = 0.f; @@ -801,7 +801,7 @@ f"> */ } L20: - return 0; + return; /* End of CTBCON */ diff --git a/lapack-netlib/SRC/ctbrfs.c b/lapack-netlib/SRC/ctbrfs.c index 47450cf9bd..08616fb372 100644 --- a/lapack-netlib/SRC/ctbrfs.c +++ b/lapack-netlib/SRC/ctbrfs.c @@ -700,7 +700,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ctbrfs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void ctbrfs_(char *uplo, char *trans, char *diag, integer *n, integer *kd, integer *nrhs, complex *ab, integer *ldab, complex *b, integer *ldb, complex *x, integer *ldx, real *ferr, real *berr, complex *work, real *rwork, integer *info) @@ -718,13 +718,13 @@ f"> */ real s; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int ctbmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctbmv_(char *, char *, char *, integer *, integer *, complex *, integer *, complex *, integer *), ccopy_(integer *, complex *, integer *, complex * , integer *), ctbsv_(char *, char *, char *, integer *, integer *, complex *, integer *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); real xk; 
extern real slamch_(char *); @@ -792,7 +792,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTBRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -804,7 +804,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } if (notran) { @@ -1149,7 +1149,7 @@ f"> */ /* L250: */ } - return 0; + return; /* End of CTBRFS */ diff --git a/lapack-netlib/SRC/ctbtrs.c b/lapack-netlib/SRC/ctbtrs.c index fa7dfa5442..560c1efef4 100644 --- a/lapack-netlib/SRC/ctbtrs.c +++ b/lapack-netlib/SRC/ctbtrs.c @@ -658,7 +658,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ctbtrs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void ctbtrs_(char *uplo, char *trans, char *diag, integer *n, integer *kd, integer *nrhs, complex *ab, integer *ldab, complex *b, integer *ldb, integer *info) { @@ -668,7 +668,7 @@ f"> */ /* Local variables */ integer j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctbsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctbsv_(char *, char *, char *, integer *, integer *, complex *, integer *, complex *, integer *); logical upper; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -719,13 +719,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTBTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check for singularity. 
*/ @@ -736,7 +736,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *kd + 1 + *info * ab_dim1; if (ab[i__2].r == 0.f && ab[i__2].i == 0.f) { - return 0; + return; } /* L10: */ } @@ -745,7 +745,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info * ab_dim1 + 1; if (ab[i__2].r == 0.f && ab[i__2].i == 0.f) { - return 0; + return; } /* L20: */ } @@ -762,7 +762,7 @@ f"> */ /* L30: */ } - return 0; + return; /* End of CTBTRS */ diff --git a/lapack-netlib/SRC/ctfsm.c b/lapack-netlib/SRC/ctfsm.c index d939b67795..695dbcff04 100644 --- a/lapack-netlib/SRC/ctfsm.c +++ b/lapack-netlib/SRC/ctfsm.c @@ -810,7 +810,7 @@ static complex c_b1 = {1.f,0.f}; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctfsm_(char *transr, char *side, char *uplo, char *trans, +/* Subroutine */ void ctfsm_(char *transr, char *side, char *uplo, char *trans, char *diag, integer *m, integer *n, complex *alpha, complex *a, complex *b, integer *ldb) { @@ -821,13 +821,13 @@ static complex c_b1 = {1.f,0.f}; /* Local variables */ integer info, i__, j, k; logical normaltransr; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); logical lside; extern logical lsame_(char *, char *); logical lower; - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); integer m1, m2, n1, n2; @@ -877,13 +877,13 @@ static complex c_b1 = {1.f,0.f}; if (info != 0) { i__1 = -info; xerbla_("CTFSM ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return when ( (N.EQ.0).OR.(M.EQ.0) ) */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Quick return when 
ALPHA.EQ.(0E+0,0E+0) */ @@ -899,7 +899,7 @@ static complex c_b1 = {1.f,0.f}; } /* L20: */ } - return 0; + return; } if (lside) { @@ -1580,7 +1580,7 @@ static complex c_b1 = {1.f,0.f}; } } - return 0; + return; /* End of CTFSM */ diff --git a/lapack-netlib/SRC/ctftri.c b/lapack-netlib/SRC/ctftri.c index 92220ad6c1..adef1cab9f 100644 --- a/lapack-netlib/SRC/ctftri.c +++ b/lapack-netlib/SRC/ctftri.c @@ -734,7 +734,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctftri_(char *transr, char *uplo, char *diag, integer *n, +/* Subroutine */ void ctftri_(char *transr, char *uplo, char *diag, integer *n, complex *a, integer *info) { /* System generated locals */ @@ -745,7 +745,7 @@ f"> */ integer k; logical normaltransr; extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrmm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); logical lower; @@ -783,13 +783,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTFTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* If N is odd, set NISODD = .TRUE. 
*/ @@ -831,7 +831,7 @@ f"> */ ctrtri_("L", diag, &n1, a, n, info); if (*info > 0) { - return 0; + return; } q__1.r = -1.f, q__1.i = 0.f; ctrmm_("R", "L", "N", diag, &n2, &n1, &q__1, a, n, &a[n1], n); @@ -841,7 +841,7 @@ f"> */ *info += n1; } if (*info > 0) { - return 0; + return; } ctrmm_("L", "U", "C", diag, &n2, &n1, &c_b1, &a[*n], n, &a[n1] , n); @@ -855,7 +855,7 @@ f"> */ ctrtri_("L", diag, &n1, &a[n2], n, info) ; if (*info > 0) { - return 0; + return; } q__1.r = -1.f, q__1.i = 0.f; ctrmm_("L", "L", "C", diag, &n1, &n2, &q__1, &a[n2], n, a, n); @@ -865,7 +865,7 @@ f"> */ *info += n1; } if (*info > 0) { - return 0; + return; } ctrmm_("R", "U", "N", diag, &n1, &n2, &c_b1, &a[n1], n, a, n); @@ -882,7 +882,7 @@ f"> */ ctrtri_("U", diag, &n1, a, &n1, info); if (*info > 0) { - return 0; + return; } q__1.r = -1.f, q__1.i = 0.f; ctrmm_("L", "U", "N", diag, &n1, &n2, &q__1, a, &n1, &a[n1 * @@ -892,7 +892,7 @@ f"> */ *info += n1; } if (*info > 0) { - return 0; + return; } ctrmm_("R", "L", "C", diag, &n1, &n2, &c_b1, &a[1], &n1, &a[ n1 * n1], &n1); @@ -904,7 +904,7 @@ f"> */ ctrtri_("U", diag, &n1, &a[n2 * n2], &n2, info); if (*info > 0) { - return 0; + return; } q__1.r = -1.f, q__1.i = 0.f; ctrmm_("R", "U", "C", diag, &n2, &n1, &q__1, &a[n2 * n2], &n2, @@ -914,7 +914,7 @@ f"> */ *info += n1; } if (*info > 0) { - return 0; + return; } ctrmm_("L", "L", "N", diag, &n2, &n1, &c_b1, &a[n1 * n2], &n2, a, &n2); @@ -939,7 +939,7 @@ f"> */ i__1 = *n + 1; ctrtri_("L", diag, &k, &a[1], &i__1, info); if (*info > 0) { - return 0; + return; } q__1.r = -1.f, q__1.i = 0.f; i__1 = *n + 1; @@ -952,7 +952,7 @@ f"> */ *info += k; } if (*info > 0) { - return 0; + return; } i__1 = *n + 1; i__2 = *n + 1; @@ -968,7 +968,7 @@ f"> */ i__1 = *n + 1; ctrtri_("L", diag, &k, &a[k + 1], &i__1, info); if (*info > 0) { - return 0; + return; } q__1.r = -1.f, q__1.i = 0.f; i__1 = *n + 1; @@ -981,7 +981,7 @@ f"> */ *info += k; } if (*info > 0) { - return 0; + return; } i__1 = *n + 1; i__2 = *n + 1; @@ 
-1000,7 +1000,7 @@ f"> */ ctrtri_("U", diag, &k, &a[k], &k, info); if (*info > 0) { - return 0; + return; } q__1.r = -1.f, q__1.i = 0.f; ctrmm_("L", "U", "N", diag, &k, &k, &q__1, &a[k], &k, &a[k * ( @@ -1010,7 +1010,7 @@ f"> */ *info += k; } if (*info > 0) { - return 0; + return; } ctrmm_("R", "L", "C", diag, &k, &k, &c_b1, a, &k, &a[k * (k + 1)], &k); @@ -1022,7 +1022,7 @@ f"> */ ctrtri_("U", diag, &k, &a[k * (k + 1)], &k, info); if (*info > 0) { - return 0; + return; } q__1.r = -1.f, q__1.i = 0.f; ctrmm_("R", "U", "C", diag, &k, &k, &q__1, &a[k * (k + 1)], & @@ -1032,7 +1032,7 @@ f"> */ *info += k; } if (*info > 0) { - return 0; + return; } ctrmm_("L", "L", "N", diag, &k, &k, &c_b1, &a[k * k], &k, a, & k); @@ -1040,7 +1040,7 @@ f"> */ } } - return 0; + return; /* End of CTFTRI */ diff --git a/lapack-netlib/SRC/ctfttp.c b/lapack-netlib/SRC/ctfttp.c index d1cfc54a97..11e5be640b 100644 --- a/lapack-netlib/SRC/ctfttp.c +++ b/lapack-netlib/SRC/ctfttp.c @@ -718,7 +718,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctfttp_(char *transr, char *uplo, integer *n, complex * +/* Subroutine */ void ctfttp_(char *transr, char *uplo, integer *n, complex * arf, complex *ap, integer *info) { /* System generated locals */ @@ -760,13 +760,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTFTTP", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -776,7 +776,7 @@ f"> */ r_cnjg(&q__1, arf); ap[0].r = q__1.r, ap[0].i = q__1.i; } - return 0; + return; } /* Size of array ARF(0:NT-1) */ @@ -1112,7 +1112,7 @@ f"> */ } - return 0; + return; /* End of CTFTTP */ diff --git a/lapack-netlib/SRC/ctfttr.c b/lapack-netlib/SRC/ctfttr.c index 59452eccf6..9afa0ef872 100644 --- a/lapack-netlib/SRC/ctfttr.c +++ b/lapack-netlib/SRC/ctfttr.c @@ -726,7 +726,7 @@ f"> */ /* > \endverbatim */ /* > */ /* 
===================================================================== */ -/* Subroutine */ int ctfttr_(char *transr, char *uplo, integer *n, complex * +/* Subroutine */ void ctfttr_(char *transr, char *uplo, integer *n, complex * arf, complex *a, integer *lda, integer *info) { /* System generated locals */ @@ -776,7 +776,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTFTTR", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -790,7 +790,7 @@ f"> */ a[0].r = q__1.r, a[0].i = q__1.i; } } - return 0; + return; } /* Size of array ARF(1:2,0:nt-1) */ @@ -1122,7 +1122,7 @@ f"> */ } - return 0; + return; /* End of CTFTTR */ diff --git a/lapack-netlib/SRC/ctgevc.c b/lapack-netlib/SRC/ctgevc.c index 76676b03bd..fa36f7e9cf 100644 --- a/lapack-netlib/SRC/ctgevc.c +++ b/lapack-netlib/SRC/ctgevc.c @@ -733,7 +733,7 @@ f"> */ /* > \ingroup complexGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int ctgevc_(char *side, char *howmny, logical *select, +/* Subroutine */ void ctgevc_(char *side, char *howmny, logical *select, integer *n, complex *s, integer *lds, complex *p, integer *ldp, complex *vl, integer *ldvl, complex *vr, integer *ldvr, integer *mm, integer *m, complex *work, real *rwork, integer *info) @@ -758,7 +758,7 @@ f"> */ integer iside; real sbeta; extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *); real small; @@ -773,7 +773,7 @@ f"> */ complex bcoeff; logical ilback; integer im; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); real ascale, bscale; integer jr; extern /* Complex */ VOID cladiv_(complex *, complex *, complex *); @@ -867,7 +867,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); 
xerbla_("CTGEVC", &i__1, (ftnlen)6); - return 0; + return; } /* Count the number of eigenvectors */ @@ -908,14 +908,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTGEVC", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *m = im; if (*n == 0) { - return 0; + return; } /* Machine Constants */ @@ -1544,7 +1544,7 @@ f"> */ } } - return 0; + return; /* End of CTGEVC */ diff --git a/lapack-netlib/SRC/ctgex2.c b/lapack-netlib/SRC/ctgex2.c index fa60909b1b..bfd6afea04 100644 --- a/lapack-netlib/SRC/ctgex2.c +++ b/lapack-netlib/SRC/ctgex2.c @@ -704,7 +704,7 @@ f"> */ /* > Numerical Algorithms, 1996. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctgex2_(logical *wantq, logical *wantz, integer *n, +/* Subroutine */ void ctgex2_(logical *wantq, logical *wantz, integer *n, complex *a, integer *lda, complex *b, integer *ldb, complex *q, integer *ldq, complex *z__, integer *ldz, integer *j1, integer *info) { @@ -717,7 +717,7 @@ f"> */ /* Local variables */ logical weak; complex cdum; - extern /* Subroutine */ int crot_(integer *, complex *, integer *, + extern /* Subroutine */ void crot_(integer *, complex *, integer *, complex *, integer *, real *, complex *); complex work[8], f, g; integer i__, m; @@ -727,11 +727,11 @@ f"> */ real ss; extern real slamch_(char *); real ws; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), clartg_(complex *, complex *, real *, complex *, complex *); complex sz; - extern /* Subroutine */ int classq_(integer *, complex *, integer *, real + extern /* Subroutine */ void classq_(integer *, complex *, integer *, real *, real *); real thresh, smlnum; logical strong; @@ -767,7 +767,7 @@ f"> */ /* Quick return if possible */ if (*n <= 1) { - return 0; + return; } m = 2; @@ -938,13 +938,13 @@ f"> */ /* Exit with INFO = 0 
if swap was successfully performed. */ - return 0; + return; /* Exit with INFO = 1 if swap was rejected. */ L20: *info = 1; - return 0; + return; /* End of CTGEX2 */ diff --git a/lapack-netlib/SRC/ctgexc.c b/lapack-netlib/SRC/ctgexc.c index 025fbbb917..75ca678601 100644 --- a/lapack-netlib/SRC/ctgexc.c +++ b/lapack-netlib/SRC/ctgexc.c @@ -708,7 +708,7 @@ f"> */ /* > 1996. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctgexc_(logical *wantq, logical *wantz, integer *n, +/* Subroutine */ void ctgexc_(logical *wantq, logical *wantz, integer *n, complex *a, integer *lda, complex *b, integer *ldb, complex *q, integer *ldq, complex *z__, integer *ldz, integer *ifst, integer * ilst, integer *info) @@ -719,10 +719,10 @@ f"> */ /* Local variables */ integer here; - extern /* Subroutine */ int ctgex2_(logical *, logical *, integer *, + extern /* Subroutine */ void ctgex2_(logical *, logical *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, - complex *, integer *, integer *, integer *), xerbla_(char *, - integer *, ftnlen); + complex *, integer *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.1) -- */ @@ -769,16 +769,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTGEXC", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 1) { - return 0; + return; } if (*ifst == *ilst) { - return 0; + return; } if (*ifst < *ilst) { @@ -793,7 +793,7 @@ f"> */ q_offset], ldq, &z__[z_offset], ldz, &here, info); if (*info != 0) { *ilst = here; - return 0; + return; } ++here; if (here < *ilst) { @@ -811,7 +811,7 @@ f"> */ q_offset], ldq, &z__[z_offset], ldz, &here, info); if (*info != 0) { *ilst = here; - return 0; + return; } --here; if (here >= *ilst) { @@ -820,7 +820,7 @@ f"> */ ++here; } *ilst = here; - return 0; + return; /* End of CTGEXC */ diff --git a/lapack-netlib/SRC/ctgsen.c 
b/lapack-netlib/SRC/ctgsen.c index c853bcbbd4..f5c3f6a9f3 100644 --- a/lapack-netlib/SRC/ctgsen.c +++ b/lapack-netlib/SRC/ctgsen.c @@ -944,7 +944,7 @@ f"> */ /* > 1996. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctgsen_(integer *ijob, logical *wantq, logical *wantz, +/* Subroutine */ void ctgsen_(integer *ijob, logical *wantq, logical *wantz, logical *select, integer *n, complex *a, integer *lda, complex *b, integer *ldb, complex *alpha, complex *beta, complex *q, integer *ldq, complex *z__, integer *ldz, integer *m, real *pl, real *pr, real * @@ -962,30 +962,31 @@ f"> */ logical swap; complex temp1, temp2; integer i__, k; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); integer isave[3]; logical wantd; integer lwmin; logical wantp; integer n1, n2; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); logical wantd1, wantd2; real dscale; integer ks; extern real slamch_(char *); real rdscal; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *); real safmin; - extern /* Subroutine */ int ctgexc_(logical *, logical *, integer *, + extern /* Subroutine */ void ctgexc_(logical *, logical *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, - complex *, integer *, integer *, integer *, integer *), xerbla_( - char *, integer *, ftnlen), classq_(integer *, complex *, integer + complex *, integer *, integer *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void classq_(integer *, complex *, integer *, real *, real *); integer liwmin; - extern /* Subroutine */ int ctgsyl_(char *, integer *, 
integer *, integer + extern /* Subroutine */ void ctgsyl_(char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, real *, real *, complex *, integer *, integer *, integer *); @@ -1046,7 +1047,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTGSEN", &i__1, (ftnlen)6); - return 0; + return; } ierr = 0; @@ -1114,9 +1115,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTGSEN", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible. */ @@ -1371,7 +1372,7 @@ f"> */ work[1].r = (real) lwmin, work[1].i = 0.f; iwork[1] = liwmin; - return 0; + return; /* End of CTGSEN */ diff --git a/lapack-netlib/SRC/ctgsja.c b/lapack-netlib/SRC/ctgsja.c index c5a56c48ed..c968c9e7fc 100644 --- a/lapack-netlib/SRC/ctgsja.c +++ b/lapack-netlib/SRC/ctgsja.c @@ -896,7 +896,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctgsja_(char *jobu, char *jobv, char *jobq, integer *m, +/* Subroutine */ void ctgsja_(char *jobu, char *jobv, char *jobq, integer *m, integer *p, integer *n, integer *k, integer *l, complex *a, integer * lda, complex *b, integer *ldb, real *tola, real *tolb, real *alpha, real *beta, complex *u, integer *ldu, complex *v, integer *ldv, @@ -910,12 +910,12 @@ f"> */ complex q__1; /* Local variables */ - extern /* Subroutine */ int crot_(integer *, complex *, integer *, + extern /* Subroutine */ void crot_(integer *, complex *, integer *, complex *, integer *, real *, complex *); integer kcallmycycle, i__, j; real gamma; extern logical lsame_(char *, char *); - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); logical initq; real a1, a3, b1; @@ -924,13 +924,14 @@ f"> */ logical wantu, wantv; 
real ssmin; complex a2, b2; - extern /* Subroutine */ int clags2_(logical *, real *, complex *, real *, + extern /* Subroutine */ void clags2_(logical *, real *, complex *, real *, real *, complex *, real *, real *, complex *, real *, complex *, real *, complex *), clapll_(integer *, complex *, integer *, complex *, integer *, real *), csscal_(integer *, real *, complex *, integer *), claset_(char *, integer *, integer *, complex *, - complex *, complex *, integer *), xerbla_(char *, integer - *, ftnlen), slartg_(real *, real *, real *, real *, real *); + complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void slartg_(real *, real *, real *, real *, real *); // extern integer myhuge_(real *); real csq, csu, csv; complex snq; @@ -1009,7 +1010,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTGSJA", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize U, V and Q, if necessary */ @@ -1285,7 +1286,7 @@ f"> */ L100: *ncallmycycle = kcallmycycle; - return 0; + return; /* End of CTGSJA */ diff --git a/lapack-netlib/SRC/ctgsna.c b/lapack-netlib/SRC/ctgsna.c index 19a785228f..f11f2cd568 100644 --- a/lapack-netlib/SRC/ctgsna.c +++ b/lapack-netlib/SRC/ctgsna.c @@ -826,7 +826,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctgsna_(char *job, char *howmny, logical *select, +/* Subroutine */ void ctgsna_(char *job, char *howmny, logical *select, integer *n, complex *a, integer *lda, complex *b, integer *ldb, complex *vl, integer *ldvl, complex *vr, integer *ldvr, real *s, real *dif, integer *mm, integer *m, complex *work, integer *lwork, integer @@ -850,7 +850,7 @@ f"> */ extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer 
*, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *); integer lwmin; @@ -860,17 +860,18 @@ f"> */ extern real scnrm2_(integer *, complex *, integer *), slapy2_(real *, real *); complex dummy1[1]; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); integer ks; extern real slamch_(char *); - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), ctgexc_(logical *, logical *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *, integer *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); real bignum; logical wantbh, wantdf, somcon; - extern /* Subroutine */ int ctgsyl_(char *, integer *, integer *, integer + extern /* Subroutine */ void ctgsyl_(char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, real *, real *, complex *, integer *, integer *, integer *); @@ -971,15 +972,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTGSNA", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1079,7 +1080,7 @@ f"> */ ; } work[1].r = (real) lwmin, work[1].i = 0.f; - return 0; + return; /* End of CTGSNA */ diff --git a/lapack-netlib/SRC/ctgsy2.c b/lapack-netlib/SRC/ctgsy2.c index 82d8cb5a1e..f3089d44b9 100644 --- a/lapack-netlib/SRC/ctgsy2.c +++ b/lapack-netlib/SRC/ctgsy2.c @@ -771,7 +771,7 @@ f"> */ /* > Umea University, S-901 87 Umea, Sweden. 
*/ /* ===================================================================== */ -/* Subroutine */ int ctgsy2_(char *trans, integer *ijob, integer *m, integer * +/* Subroutine */ void ctgsy2_(char *trans, integer *ijob, integer *m, integer * n, complex *a, integer *lda, complex *b, integer *ldb, complex *c__, integer *ldc, complex *d__, integer *ldd, complex *e, integer *lde, complex *f, integer *ldf, real *scale, real *rdsum, real *rdscal, @@ -786,11 +786,11 @@ f"> */ /* Local variables */ integer ierr, ipiv[2], jpiv[2], i__, j, k; complex alpha; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); complex z__[4] /* was [2][2] */; extern logical lsame_(char *, char *); - extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, + extern /* Subroutine */ void caxpy_(integer *, complex *, complex *, integer *, complex *, integer *), cgesc2_(integer *, complex *, integer *, complex *, integer *, integer *, real *), cgetc2_( integer *, complex *, integer *, integer *, integer *, integer *), @@ -866,7 +866,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTGSY2", &i__1, (ftnlen)6); - return 0; + return; } if (notran) { @@ -1057,7 +1057,7 @@ f"> */ /* L80: */ } } - return 0; + return; /* End of CTGSY2 */ diff --git a/lapack-netlib/SRC/ctgsyl.c b/lapack-netlib/SRC/ctgsyl.c index 0db1578c99..0fb5e88ca4 100644 --- a/lapack-netlib/SRC/ctgsyl.c +++ b/lapack-netlib/SRC/ctgsyl.c @@ -812,7 +812,7 @@ f"> */ /* > July 1989, pp 745-751. 
*/ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctgsyl_(char *trans, integer *ijob, integer *m, integer * +/* Subroutine */ void ctgsyl_(char *trans, integer *ijob, integer *m, integer * n, complex *a, integer *lda, complex *b, integer *ldb, complex *c__, integer *ldc, complex *d__, integer *ldd, complex *e, integer *lde, complex *f, integer *ldf, real *scale, real *dif, complex *work, @@ -827,14 +827,14 @@ f"> */ /* Local variables */ real dsum; integer i__, j, k, p, q; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), cgemm_(char *, char *, integer *, integer *, integer * , complex *, complex *, integer *, complex *, integer *, complex * , complex *, integer *); extern logical lsame_(char *, char *); integer ifunc, linfo, lwmin; real scale2; - extern /* Subroutine */ int ctgsy2_(char *, integer *, integer *, integer + extern /* Subroutine */ void ctgsy2_(char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, real *, real *, real *, integer *); @@ -842,9 +842,10 @@ f"> */ real dscale; integer is, js, pq; real scaloc; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *), claset_(char *, - integer *, integer *, complex *, complex *, complex *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, integer *, complex *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer iround; @@ -942,9 +943,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTGSYL", &i__1, (ftnlen)6); - return 0; + return; } else 
if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -956,7 +957,7 @@ f"> */ *dif = 0.f; } } - return 0; + return; } /* Determine optimal block sizes MB and NB */ @@ -1018,7 +1019,7 @@ f"> */ /* L30: */ } - return 0; + return; } @@ -1288,7 +1289,7 @@ f"> */ work[1].r = (real) lwmin, work[1].i = 0.f; - return 0; + return; /* End of CTGSYL */ diff --git a/lapack-netlib/SRC/ctpcon.c b/lapack-netlib/SRC/ctpcon.c index da0c664db7..0db65c2fe3 100644 --- a/lapack-netlib/SRC/ctpcon.c +++ b/lapack-netlib/SRC/ctpcon.c @@ -642,7 +642,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ctpcon_(char *norm, char *uplo, char *diag, integer *n, +/* Subroutine */ void ctpcon_(char *norm, char *uplo, char *diag, integer *n, complex *ap, real *rcond, complex *work, real *rwork, integer *info) { /* System generated locals */ @@ -656,7 +656,7 @@ f"> */ integer isave[3]; real anorm; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); real xnorm; integer ix; @@ -664,10 +664,10 @@ f"> */ extern real slamch_(char *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern real clantp_(char *, char *, char *, integer *, complex *, real *); - extern /* Subroutine */ int clatps_(char *, char *, char *, char *, + extern /* Subroutine */ void clatps_(char *, char *, char *, char *, integer *, complex *, complex *, real *, real *, integer *); real ainvnm; - extern /* Subroutine */ int csrscl_(integer *, real *, complex *, integer + extern /* Subroutine */ void csrscl_(integer *, real *, complex *, integer *); logical onenrm; char normin[1]; @@ -709,14 +709,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTPCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *rcond = 1.f; - 
return 0; + return; } *rcond = 0.f; @@ -781,7 +781,7 @@ f"> */ } L20: - return 0; + return; /* End of CTPCON */ diff --git a/lapack-netlib/SRC/ctplqt.c b/lapack-netlib/SRC/ctplqt.c index dc9864bd0e..2fb820e754 100644 --- a/lapack-netlib/SRC/ctplqt.c +++ b/lapack-netlib/SRC/ctplqt.c @@ -679,7 +679,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctplqt_(integer *m, integer *n, integer *l, integer *mb, +/* Subroutine */ void ctplqt_(integer *m, integer *n, integer *l, integer *mb, complex *a, integer *lda, complex *b, integer *ldb, complex *t, integer *ldt, complex *work, integer *info) { @@ -689,7 +689,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ integer i__, iinfo, ib, lb, nb; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), ctprfb_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void ctprfb_( char *, char *, char *, char *, integer *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *), ctplqt2_(integer *, integer *, integer *, @@ -740,13 +741,13 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CTPLQT", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } i__1 = *m; @@ -780,7 +781,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ a_dim1], lda, &b[i__ + ib + b_dim1], ldb, &work[1], &i__4); } } - return 0; + return; /* End of CTPLQT */ diff --git a/lapack-netlib/SRC/ctplqt2.c b/lapack-netlib/SRC/ctplqt2.c index f9014f1808..78eaea4874 100644 --- a/lapack-netlib/SRC/ctplqt2.c +++ b/lapack-netlib/SRC/ctplqt2.c @@ -673,7 +673,7 @@ 
static complex c_b2 = {1.f,0.f}; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctplqt2_(integer *m, integer *n, integer *l, complex *a, +/* Subroutine */ void ctplqt2_(integer *m, integer *n, integer *l, complex *a, integer *lda, complex *b, integer *ldb, complex *t, integer *ldt, integer *info) { @@ -684,16 +684,17 @@ static complex c_b2 = {1.f,0.f}; /* Local variables */ integer i__, j, p; - extern /* Subroutine */ int cgerc_(integer *, integer *, complex *, + extern /* Subroutine */ void cgerc_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *); complex alpha; - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *), ctrmv_(char *, char *, char *, integer *, complex *, integer *, complex *, integer *); integer mp, np; - extern /* Subroutine */ int clarfg_(integer *, complex *, complex *, - integer *, complex *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clarfg_(integer *, complex *, complex *, + integer *, complex *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.1) -- */ @@ -736,13 +737,13 @@ static complex c_b2 = {1.f,0.f}; if (*info != 0) { i__1 = -(*info); xerbla_("CTPLQT2", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *m == 0) { - return 0; + return; } i__1 = *m; @@ -906,6 +907,6 @@ static complex c_b2 = {1.f,0.f}; /* End of CTPLQT2 */ - return 0; + return; } /* ctplqt2_ */ diff --git a/lapack-netlib/SRC/ctpmlqt.c b/lapack-netlib/SRC/ctpmlqt.c index 49f185bffe..abd33f4ba7 100644 --- a/lapack-netlib/SRC/ctpmlqt.c +++ b/lapack-netlib/SRC/ctpmlqt.c @@ -706,7 +706,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, 
integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctpmlqt_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void ctpmlqt_(char *side, char *trans, integer *m, integer *n, integer *k, integer *l, integer *mb, complex *v, integer *ldv, complex *t, integer *ldt, complex *a, integer *lda, complex *b, integer *ldb, complex *work, integer *info) @@ -722,7 +722,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern logical lsame_(char *, char *); logical right; integer ib, lb, nb, kf; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), ctprfb_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void ctprfb_( char *, char *, char *, char *, integer *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); @@ -793,12 +794,12 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CTPMLQT", &i__1, (ftnlen)7); - return 0; + return; } if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && notran) { @@ -887,7 +888,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of CTPMLQT */ diff --git a/lapack-netlib/SRC/ctpmqrt.c b/lapack-netlib/SRC/ctpmqrt.c index 7b7390d3ec..a1f04291f6 100644 --- a/lapack-netlib/SRC/ctpmqrt.c +++ b/lapack-netlib/SRC/ctpmqrt.c @@ -724,7 +724,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctpmqrt_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void ctpmqrt_(char *side, char *trans, integer *m, integer *n, integer *k, integer *l, integer *nb, complex *v, 
integer *ldv, complex *t, integer *ldt, complex *a, integer *lda, complex *b, integer *ldb, complex *work, integer *info) @@ -740,7 +740,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern logical lsame_(char *, char *); logical right; integer ib, lb, mb, kf; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), ctprfb_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void ctprfb_( char *, char *, char *, char *, integer *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); @@ -813,12 +814,12 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CTPMQRT", &i__1, (ftnlen)7); - return 0; + return; } if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && tran) { @@ -907,7 +908,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of CTPMQRT */ diff --git a/lapack-netlib/SRC/ctpqrt.c b/lapack-netlib/SRC/ctpqrt.c index 45ac1c72cd..63541ba53e 100644 --- a/lapack-netlib/SRC/ctpqrt.c +++ b/lapack-netlib/SRC/ctpqrt.c @@ -697,7 +697,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctpqrt_(integer *m, integer *n, integer *l, integer *nb, +/* Subroutine */ void ctpqrt_(integer *m, integer *n, integer *l, integer *nb, complex *a, integer *lda, complex *b, integer *ldb, complex *t, integer *ldt, complex *work, integer *info) { @@ -707,7 +707,8 @@ f"> */ /* Local variables */ integer i__, iinfo, ib, lb, mb; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), ctprfb_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void ctprfb_( char *, char *, char *, char *, integer *, integer *, integer *, integer *, complex *, integer *, 
complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *), ctpqrt2_(integer *, integer *, integer *, @@ -758,13 +759,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTPQRT", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } i__1 = *n; @@ -798,7 +799,7 @@ f"> */ , &ib); } } - return 0; + return; /* End of CTPQRT */ diff --git a/lapack-netlib/SRC/ctpqrt2.c b/lapack-netlib/SRC/ctpqrt2.c index e11fb09333..39110b7b01 100644 --- a/lapack-netlib/SRC/ctpqrt2.c +++ b/lapack-netlib/SRC/ctpqrt2.c @@ -690,7 +690,7 @@ is composed of a triangular block and a pentagonal block, using the compact WY r /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctpqrt2_(integer *m, integer *n, integer *l, complex *a, +/* Subroutine */ void ctpqrt2_(integer *m, integer *n, integer *l, complex *a, integer *lda, complex *b, integer *ldb, complex *t, integer *ldt, integer *info) { @@ -701,16 +701,17 @@ is composed of a triangular block and a pentagonal block, using the compact WY r /* Local variables */ integer i__, j, p; - extern /* Subroutine */ int cgerc_(integer *, integer *, complex *, + extern /* Subroutine */ void cgerc_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *); complex alpha; - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *), ctrmv_(char *, char *, char *, integer *, complex *, integer *, complex *, integer *); integer mp, np; - extern /* Subroutine */ int clarfg_(integer *, complex *, complex *, - integer *, complex *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clarfg_(integer *, complex *, complex *, + integer *, complex *); + extern int 
xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -753,13 +754,13 @@ is composed of a triangular block and a pentagonal block, using the compact WY r if (*info != 0) { i__1 = -(*info); xerbla_("CTPQRT2", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *m == 0) { - return 0; + return; } i__1 = *n; @@ -873,6 +874,6 @@ is composed of a triangular block and a pentagonal block, using the compact WY r /* End of CTPQRT2 */ - return 0; + return; } /* ctpqrt2_ */ diff --git a/lapack-netlib/SRC/ctprfb.c b/lapack-netlib/SRC/ctprfb.c index 2aa2c52e99..25a042da77 100644 --- a/lapack-netlib/SRC/ctprfb.c +++ b/lapack-netlib/SRC/ctprfb.c @@ -765,7 +765,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctprfb_(char *side, char *trans, char *direct, char * +/* Subroutine */ void ctprfb_(char *side, char *trans, char *direct, char * storev, integer *m, integer *n, integer *k, integer *l, complex *v, integer *ldv, complex *t, integer *ldt, complex *a, integer *lda, complex *b, integer *ldb, complex *work, integer *ldwork) @@ -778,12 +778,12 @@ f"> */ /* Local variables */ logical left, backward; integer i__, j; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); logical right; - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrmm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); integer kp, mp, np; @@ -820,7 +820,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0 || *k <= 0 || *l < 0) { - return 0; + return; } if (lsame_(storev, "C")) { @@ -1621,7 +1621,7 @@ f"> 
*/ } - return 0; + return; /* End of CTPRFB */ diff --git a/lapack-netlib/SRC/ctprfb.f b/lapack-netlib/SRC/ctprfb.f index 11496180fc..6cd5f05bd7 100644 --- a/lapack-netlib/SRC/ctprfb.f +++ b/lapack-netlib/SRC/ctprfb.f @@ -1,4 +1,4 @@ -*> \brief \b CTPRFB applies a real or complex "triangular-pentagonal" blocked reflector to a real or complex matrix, which is composed of two blocks. +*> \brief \b CTPRFB applies a complex "triangular-pentagonal" block reflector to a complex matrix, which is composed of two blocks. * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/ctprfs.c b/lapack-netlib/SRC/ctprfs.c index 5929dae3ed..f125c87e1c 100644 --- a/lapack-netlib/SRC/ctprfs.c +++ b/lapack-netlib/SRC/ctprfs.c @@ -686,7 +686,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ctprfs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void ctprfs_(char *uplo, char *trans, char *diag, integer *n, integer *nrhs, complex *ap, complex *b, integer *ldb, complex *x, integer *ldx, real *ferr, real *berr, complex *work, real *rwork, integer *info) @@ -703,12 +703,12 @@ f"> */ real s; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer *, complex *, integer *), ctpmv_(char *, char *, char *, integer *, complex *, complex *, integer *); logical upper; - extern /* Subroutine */ int ctpsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctpsv_(char *, char *, char *, integer *, complex *, complex *, integer *), clacn2_( integer *, complex *, complex *, real *, integer *, integer *); integer kc; @@ -772,7 +772,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTPRFS", &i__1, (ftnlen)6); - return 0; + return; } /* 
Quick return if possible */ @@ -784,7 +784,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } if (notran) { @@ -1118,7 +1118,7 @@ f"> */ /* L250: */ } - return 0; + return; /* End of CTPRFS */ diff --git a/lapack-netlib/SRC/ctptri.c b/lapack-netlib/SRC/ctptri.c index 252f5aba7b..c24e75804a 100644 --- a/lapack-netlib/SRC/ctptri.c +++ b/lapack-netlib/SRC/ctptri.c @@ -631,7 +631,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctptri_(char *uplo, char *diag, integer *n, complex *ap, +/* Subroutine */ void ctptri_(char *uplo, char *diag, integer *n, complex *ap, integer *info) { /* System generated locals */ @@ -640,10 +640,10 @@ f"> */ /* Local variables */ integer j; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctpmv_(char *, char *, char *, integer *, complex *, complex *, integer *); logical upper; integer jc, jj; @@ -681,7 +681,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTPTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Check for singularity if non-unit. 
*/ @@ -694,7 +694,7 @@ f"> */ jj += *info; i__2 = jj; if (ap[i__2].r == 0.f && ap[i__2].i == 0.f) { - return 0; + return; } /* L10: */ } @@ -704,7 +704,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = jj; if (ap[i__2].r == 0.f && ap[i__2].i == 0.f) { - return 0; + return; } jj = jj + *n - *info + 1; /* L20: */ @@ -776,7 +776,7 @@ f"> */ } } - return 0; + return; /* End of CTPTRI */ diff --git a/lapack-netlib/SRC/ctptrs.c b/lapack-netlib/SRC/ctptrs.c index 5b01b69859..fd1d988043 100644 --- a/lapack-netlib/SRC/ctptrs.c +++ b/lapack-netlib/SRC/ctptrs.c @@ -643,7 +643,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ctptrs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void ctptrs_(char *uplo, char *trans, char *diag, integer *n, integer *nrhs, complex *ap, complex *b, integer *ldb, integer *info) { /* System generated locals */ @@ -653,7 +653,7 @@ f"> */ integer j; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int ctpsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctpsv_(char *, char *, char *, integer *, complex *, complex *, integer *); integer jc; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -698,13 +698,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTPTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check for singularity. 
*/ @@ -716,7 +716,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = jc + *info - 1; if (ap[i__2].r == 0.f && ap[i__2].i == 0.f) { - return 0; + return; } jc += *info; /* L10: */ @@ -727,7 +727,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = jc; if (ap[i__2].r == 0.f && ap[i__2].i == 0.f) { - return 0; + return; } jc = jc + *n - *info + 1; /* L20: */ @@ -744,7 +744,7 @@ f"> */ /* L30: */ } - return 0; + return; /* End of CTPTRS */ diff --git a/lapack-netlib/SRC/ctpttf.c b/lapack-netlib/SRC/ctpttf.c index abf4d40289..6672d195ce 100644 --- a/lapack-netlib/SRC/ctpttf.c +++ b/lapack-netlib/SRC/ctpttf.c @@ -718,7 +718,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctpttf_(char *transr, char *uplo, integer *n, complex * +/* Subroutine */ void ctpttf_(char *transr, char *uplo, integer *n, complex * ap, complex *arf, integer *info) { /* System generated locals */ @@ -760,13 +760,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTPTTF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -776,7 +776,7 @@ f"> */ r_cnjg(&q__1, ap); arf[0].r = q__1.r, arf[0].i = q__1.i; } - return 0; + return; } /* Size of array ARF(0:NT-1) */ @@ -1112,7 +1112,7 @@ f"> */ } - return 0; + return; /* End of CTPTTF */ diff --git a/lapack-netlib/SRC/ctpttr.c b/lapack-netlib/SRC/ctpttr.c index e35c380901..b911d65f7e 100644 --- a/lapack-netlib/SRC/ctpttr.c +++ b/lapack-netlib/SRC/ctpttr.c @@ -614,7 +614,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ctpttr_(char *uplo, integer *n, complex *ap, complex *a, +/* Subroutine */ void ctpttr_(char *uplo, integer *n, complex *ap, complex *a, integer *lda, integer *info) { /* System generated locals */ @@ -657,7 +657,7 @@ f"> */ if (*info != 0) { i__1 
= -(*info); xerbla_("CTPTTR", &i__1, (ftnlen)6); - return 0; + return; } if (lower) { @@ -687,7 +687,7 @@ f"> */ } - return 0; + return; /* End of CTPTTR */ diff --git a/lapack-netlib/SRC/ctrcon.c b/lapack-netlib/SRC/ctrcon.c index bd49bcfe32..b21fdbcb4d 100644 --- a/lapack-netlib/SRC/ctrcon.c +++ b/lapack-netlib/SRC/ctrcon.c @@ -649,7 +649,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ctrcon_(char *norm, char *uplo, char *diag, integer *n, +/* Subroutine */ void ctrcon_(char *norm, char *uplo, char *diag, integer *n, complex *a, integer *lda, real *rcond, complex *work, real *rwork, integer *info) { @@ -664,7 +664,7 @@ f"> */ integer isave[3]; real anorm; logical upper; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); real xnorm; integer ix; @@ -674,7 +674,7 @@ f"> */ extern real clantr_(char *, char *, char *, integer *, integer *, complex *, integer *, real *); real ainvnm; - extern /* Subroutine */ int clatrs_(char *, char *, char *, char *, + extern /* Subroutine */ void clatrs_(char *, char *, char *, char *, integer *, complex *, integer *, complex *, real *, real *, integer *), csrscl_(integer *, real *, complex *, integer *); @@ -722,14 +722,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTRCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *rcond = 1.f; - return 0; + return; } *rcond = 0.f; @@ -794,7 +794,7 @@ f"> */ } L20: - return 0; + return; /* End of CTRCON */ diff --git a/lapack-netlib/SRC/ctrevc.c b/lapack-netlib/SRC/ctrevc.c index 44a8b707d4..72d6c461c7 100644 --- a/lapack-netlib/SRC/ctrevc.c +++ b/lapack-netlib/SRC/ctrevc.c @@ -731,7 +731,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* 
Subroutine */ int ctrevc_(char *side, char *howmny, logical *select, +/* Subroutine */ void ctrevc_(char *side, char *howmny, logical *select, integer *n, complex *t, integer *ldt, complex *vl, integer *ldvl, complex *vr, integer *ldvr, integer *mm, integer *m, complex *work, real *rwork, integer *info) @@ -749,20 +749,22 @@ f"> */ integer i__, j, k; real scale; extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *); real remax; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); logical leftv, bothv, somev; integer ii, ki; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); integer is; extern integer icamax_(integer *, complex *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer - *), xerbla_(char *, integer *, ftnlen), clatrs_(char *, char *, + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + extern int xerbla_(char *, integer *, ftnlen); + extern void clatrs_(char *, char *, char *, char *, integer *, complex *, integer *, complex *, real * , real *, integer *); extern real scasum_(integer *, complex *, integer *); @@ -839,13 +841,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTREVC", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ if (*n == 0) { - return 0; + return; } /* Set the constants to control overflow. 
*/ @@ -1099,7 +1101,7 @@ f"> */ } } - return 0; + return; /* End of CTREVC */ diff --git a/lapack-netlib/SRC/ctrevc3.c b/lapack-netlib/SRC/ctrevc3.c index c83a285535..5c71dfb218 100644 --- a/lapack-netlib/SRC/ctrevc3.c +++ b/lapack-netlib/SRC/ctrevc3.c @@ -762,7 +762,7 @@ static integer c__2 = 2; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctrevc3_(char *side, char *howmny, logical *select, +/* Subroutine */ void ctrevc3_(char *side, char *howmny, logical *select, integer *n, complex *t, integer *ldt, complex *vl, integer *ldvl, complex *vr, integer *ldvr, integer *mm, integer *m, complex *work, integer *lwork, real *rwork, integer *lrwork, integer *info) @@ -781,30 +781,30 @@ static integer c__2 = 2; logical over; integer i__, j, k; real scale; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *); real remax; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); logical leftv, bothv, somev; integer nb, ii, ki; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); integer is, iv; extern integer icamax_(integer *, complex *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *), claset_(char *, integer *, integer *, 
complex *, complex *, complex *, integer *), clacpy_(char *, integer *, integer - *, complex *, integer *, complex *, integer *), xerbla_( - char *, integer *, ftnlen); + *, complex *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int clatrs_(char *, char *, char *, char *, + extern /* Subroutine */ void clatrs_(char *, char *, char *, char *, integer *, complex *, integer *, complex *, real *, real *, integer *); extern real scasum_(integer *, complex *, integer *); @@ -902,15 +902,15 @@ static integer c__2 = 2; if (*info != 0) { i__1 = -(*info); xerbla_("CTREVC3", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible. */ if (*n == 0) { - return 0; + return; } /* Use blocked version of back-transformation if sufficient workspace. */ @@ -1280,7 +1280,7 @@ static integer c__2 = 2; } } - return 0; + return; /* End of CTREVC3 */ diff --git a/lapack-netlib/SRC/ctrexc.c b/lapack-netlib/SRC/ctrexc.c index 4effc2b9a6..c940943492 100644 --- a/lapack-netlib/SRC/ctrexc.c +++ b/lapack-netlib/SRC/ctrexc.c @@ -639,7 +639,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ctrexc_(char *compq, integer *n, complex *t, integer * +/* Subroutine */ void ctrexc_(char *compq, integer *n, complex *t, integer * ldt, complex *q, integer *ldq, integer *ifst, integer *ilst, integer * info) { @@ -649,7 +649,7 @@ f"> */ /* Local variables */ complex temp; - extern /* Subroutine */ int crot_(integer *, complex *, integer *, + extern /* Subroutine */ void crot_(integer *, complex *, integer *, complex *, integer *, real *, complex *); integer k; extern logical lsame_(char *, char *); @@ -657,8 +657,9 @@ f"> */ integer m1, m2, m3; real cs; complex t11, t22, sn; - 
extern /* Subroutine */ int clartg_(complex *, complex *, real *, complex - *, complex *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clartg_(complex *, complex *, real *, complex + *, complex *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -699,13 +700,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTREXC", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 1 || *ifst == *ilst) { - return 0; + return; } if (*ifst < *ilst) { @@ -769,7 +770,7 @@ f"> */ /* L10: */ } - return 0; + return; /* End of CTREXC */ diff --git a/lapack-netlib/SRC/ctrrfs.c b/lapack-netlib/SRC/ctrrfs.c index ce157db1ed..a672a1647f 100644 --- a/lapack-netlib/SRC/ctrrfs.c +++ b/lapack-netlib/SRC/ctrrfs.c @@ -694,7 +694,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ctrrfs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void ctrrfs_(char *uplo, char *trans, char *diag, integer *n, integer *nrhs, complex *a, integer *lda, complex *b, integer *ldb, complex *x, integer *ldx, real *ferr, real *berr, complex *work, real *rwork, integer *info) @@ -712,11 +712,11 @@ f"> */ real s; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); logical upper; - extern /* Subroutine */ int ctrmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctrmv_(char *, char *, char *, integer *, complex *, integer *, complex *, integer *), ctrsv_(char *, char *, char *, integer *, complex *, integer *, complex *, integer *), clacn2_( integer *, complex *, complex *, real *, integer *, integer *); @@ -784,7 +784,7 @@ f"> 
*/ if (*info != 0) { i__1 = -(*info); xerbla_("CTRRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -796,7 +796,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } if (notran) { @@ -1124,7 +1124,7 @@ f"> */ /* L250: */ } - return 0; + return; /* End of CTRRFS */ diff --git a/lapack-netlib/SRC/ctrsen.c b/lapack-netlib/SRC/ctrsen.c index 29509eb3db..ca1dc34914 100644 --- a/lapack-netlib/SRC/ctrsen.c +++ b/lapack-netlib/SRC/ctrsen.c @@ -776,7 +776,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctrsen_(char *job, char *compq, logical *select, integer +/* Subroutine */ void ctrsen_(char *job, char *compq, logical *select, integer *n, complex *t, integer *ldt, complex *q, integer *ldq, complex *w, integer *m, real *s, real *sep, complex *work, integer *lwork, integer *info) @@ -792,20 +792,20 @@ f"> */ logical wantq, wants; real rnorm; integer n1, n2; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); real rwork[1]; extern real clange_(char *, integer *, integer *, complex *, integer *, real *); integer nn, ks; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex - *, integer *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex + *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical wantbh; - extern /* Subroutine */ int ctrexc_(char *, integer *, complex *, integer + extern /* Subroutine */ void ctrexc_(char *, integer *, complex *, integer *, complex *, integer *, integer *, integer *, integer *); logical wantsp; - extern /* Subroutine */ int ctrsyl_(char *, char *, integer *, integer *, + extern /* Subroutine */ void ctrsyl_(char *, char *, integer *, integer 
*, integer *, complex *, integer *, complex *, integer *, complex *, integer *, real *, integer *); logical lquery; @@ -889,9 +889,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTRSEN", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -989,7 +989,7 @@ f"> */ work[1].r = (real) lwmin, work[1].i = 0.f; - return 0; + return; /* End of CTRSEN */ diff --git a/lapack-netlib/SRC/ctrsna.c b/lapack-netlib/SRC/ctrsna.c index 415f81fd3a..763072c35d 100644 --- a/lapack-netlib/SRC/ctrsna.c +++ b/lapack-netlib/SRC/ctrsna.c @@ -760,7 +760,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctrsna_(char *job, char *howmny, logical *select, +/* Subroutine */ void ctrsna_(char *job, char *howmny, logical *select, integer *n, complex *t, integer *ldt, complex *vl, integer *ldvl, complex *vr, integer *ldvr, real *s, real *sep, integer *mm, integer * m, complex *work, integer *ldwork, real *rwork, integer *info) @@ -783,20 +783,20 @@ f"> */ integer isave[3]; complex dummy[1]; logical wants; - extern /* Subroutine */ int clacn2_(integer *, complex *, complex *, real + extern /* Subroutine */ void clacn2_(integer *, complex *, complex *, real *, integer *, integer *); real xnorm; extern real scnrm2_(integer *, complex *, integer *); - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); integer ks, ix; extern integer icamax_(integer *, complex *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex - *, integer *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex + *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real bignum; logical wantbh; - extern /* Subroutine */ int 
clatrs_(char *, char *, char *, char *, + extern /* Subroutine */ void clatrs_(char *, char *, char *, char *, integer *, complex *, integer *, complex *, real *, real *, integer *), csrscl_(integer *, real *, complex *, integer *), ctrexc_(char *, integer *, complex @@ -882,19 +882,19 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTRSNA", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { if (somcon) { if (! select[1]) { - return 0; + return; } } if (wants) { @@ -903,7 +903,7 @@ f"> */ if (wantsp) { sep[1] = c_abs(&t[t_dim1 + 1]); } - return 0; + return; } /* Get machine constants */ @@ -1020,7 +1020,7 @@ f"> */ L50: ; } - return 0; + return; /* End of CTRSNA */ diff --git a/lapack-netlib/SRC/ctrsyl.c b/lapack-netlib/SRC/ctrsyl.c index d472ac0fb2..1987a27cb5 100644 --- a/lapack-netlib/SRC/ctrsyl.c +++ b/lapack-netlib/SRC/ctrsyl.c @@ -669,7 +669,7 @@ f"> */ /* > \ingroup complexSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int ctrsyl_(char *trana, char *tranb, integer *isgn, integer +/* Subroutine */ void ctrsyl_(char *trana, char *tranb, integer *isgn, integer *m, integer *n, complex *a, integer *lda, complex *b, integer *ldb, complex *c__, integer *ldc, real *scale, integer *info) { @@ -690,15 +690,16 @@ f"> */ *, complex *, integer *); complex a11; real db; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); extern real clange_(char *, integer *, integer *, complex *, integer *, real *); complex x11; extern /* Complex */ VOID cladiv_(complex *, complex *, complex *); real scaloc; extern real slamch_(char *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer - *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + extern int xerbla_(char *, integer *, ftnlen); 
real bignum; logical notrna, notrnb; real smlnum, da11; @@ -753,14 +754,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTRSYL", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *scale = 1.f; if (*m == 0 || *n == 0) { - return 0; + return; } /* Set constants to control overflow */ @@ -1100,7 +1101,7 @@ f"> */ } - return 0; + return; /* End of CTRSYL */ diff --git a/lapack-netlib/SRC/ctrsyl3.c b/lapack-netlib/SRC/ctrsyl3.c new file mode 100644 index 0000000000..d1ee7aa163 --- /dev/null +++ b/lapack-netlib/SRC/ctrsyl3.c @@ -0,0 +1,2022 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define 
pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define 
abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? 
floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) 
my_expfunc(w) + +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else 
break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b CTRSYL3 */ + +/* Definition: */ +/* =========== */ + + +/* > \par Purpose */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > CTRSYL3 solves the complex Sylvester matrix equation: */ +/* > */ +/* > op(A)*X + X*op(B) = scale*C or */ +/* > op(A)*X - X*op(B) = scale*C, */ +/* > */ +/* > where op(A) = A or A**H, and A and B are both upper triangular. A is */ +/* > M-by-M and B is N-by-N; the right hand side C and the solution X are */ +/* > M-by-N; and scale is an output scale factor, set <= 1 to avoid */ +/* > overflow in X. */ +/* > */ +/* > This is the block version of the algorithm. 
*/ +/* > \endverbatim */ + +/* Arguments */ +/* ========= */ + +/* > \param[in] TRANA */ +/* > \verbatim */ +/* > TRANA is CHARACTER*1 */ +/* > Specifies the option op(A): */ +/* > = 'N': op(A) = A (No transpose) */ +/* > = 'C': op(A) = A**H (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANB */ +/* > \verbatim */ +/* > TRANB is CHARACTER*1 */ +/* > Specifies the option op(B): */ +/* > = 'N': op(B) = B (No transpose) */ +/* > = 'C': op(B) = B**H (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] ISGN */ +/* > \verbatim */ +/* > ISGN is INTEGER */ +/* > Specifies the sign in the equation: */ +/* > = +1: solve op(A)*X + X*op(B) = scale*C */ +/* > = -1: solve op(A)*X - X*op(B) = scale*C */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The order of the matrix A, and the number of rows in the */ +/* > matrices X and C. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix B, and the number of columns in the */ +/* > matrices X and C. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX array, dimension (LDA,M) */ +/* > The upper triangular matrix A. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in] B */ +/* > \verbatim */ +/* > B is COMPLEX array, dimension (LDB,N) */ +/* > The upper triangular matrix B. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= f2cmax(1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] C */ +/* > \verbatim */ +/* > C is COMPLEX array, dimension (LDC,N) */ +/* > On entry, the M-by-N right hand side matrix C. 
*/ +/* > On exit, C is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDC */ +/* > \verbatim */ +/* > LDC is INTEGER */ +/* > The leading dimension of the array C. LDC >= f2cmax(1,M) */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is REAL */ +/* > The scale factor, scale, set <= 1 to avoid overflow in X. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SWORK */ +/* > \verbatim */ +/* > SWORK is REAL array, dimension (MAX(2, ROWS), MAX(1,COLS)). */ +/* > On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS */ +/* > and SWORK(2) returns the optimal COLS. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDSWORK */ +/* > \verbatim */ +/* > LDSWORK is INTEGER */ +/* > LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) */ +/* > and NB is the optimal block size. */ +/* > */ +/* > If LDSWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the SWORK matrix, */ +/* > returns these values as the first and second entry of the SWORK */ +/* > matrix, and no error message related LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > = 1: A and B have common or very close eigenvalues; perturbed */ +/* > values were used to solve the equation (but the matrices */ +/* > A and B are unchanged). */ +/* > \endverbatim */ + +/* > \ingroup complexSYcomputational */ + +/* ===================================================================== */ +/* References: */ +/* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of */ +/* algorithms: The triangular Sylvester equation, ACM Transactions */ +/* on Mathematical Software (TOMS), volume 29, pages 218--243. */ + +/* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). 
Robust Task-Parallel */ +/* Solution of the Triangular Sylvester Equation. Lecture Notes in */ +/* Computer Science, vol 12043, pages 82--92, Springer. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. */ + +/* ===================================================================== */ +/* Subroutine */ void ctrsyl3_(char *trana, char *tranb, integer *isgn, + integer *m, integer *n, complex *a, integer *lda, complex *b, integer + *ldb, complex *c__, integer *ldc, real *scale, real *swork, integer * + ldswork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, swork_dim1, + swork_offset, i__1, i__2, i__3, i__4, i__5, i__6; + real r__1, r__2, r__3, r__4; + complex q__1; + + /* Local variables */ + real scal; + complex csgn; + real anrm, bnrm, cnrm; + integer awrk, bwrk; + real *wnrm, xnrm; + integer i__, j, k, l; + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + extern logical lsame_(char *, char *); + integer iinfo, i1, i2, j1, j2, k1, k2, l1, l2; +// extern integer myexp_(real *); + integer nb, jj, ll; + extern real clange_(char *, integer *, integer *, complex *, integer *, + real *); + extern /* Subroutine */ void clascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, complex *, integer *, integer *); + real scaloc; + extern real slamch_(char *); + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer + *); + real scamin; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen ); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + real bignum; + extern real slarmm_(real *, real *, real *); + logical notrna, notrnb; + real smlnum; + extern /* Subroutine */ void ctrsyl_(char *, char *, integer *, integer *, + integer *, complex *, integer 
*, complex *, integer *, complex *, + integer *, real *, integer *); + logical lquery; + integer nba, nbb; + real buf, sgn; + + + +/* Decode and Test input parameters */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + swork_dim1 = *ldswork; + swork_offset = 1 + swork_dim1 * 1; + swork -= swork_offset; + + /* Function Body */ + notrna = lsame_(trana, "N"); + notrnb = lsame_(tranb, "N"); + +/* Use the same block size for all matrices. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "CTRSYL", "", m, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + +/* Compute number of blocks in A and B */ + +/* Computing MAX */ + i__1 = 1, i__2 = (*m + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nbb = f2cmax(i__1,i__2); + +/* Compute workspace */ + + *info = 0; + lquery = *ldswork == -1; + if (lquery) { + *ldswork = 2; + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + } + +/* Test the input arguments */ + + if (! notrna && ! lsame_(trana, "C")) { + *info = -1; + } else if (! notrnb && ! 
lsame_(tranb, "C")) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < f2cmax(1,*m)) { + *info = -7; + } else if (*ldb < f2cmax(1,*n)) { + *info = -9; + } else if (*ldc < f2cmax(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CTRSYL3", &i__1, 7); + return; + } else if (lquery) { + return; + } + +/* Quick return if possible */ + + *scale = 1.f; + if (*m == 0 || *n == 0) { + return; + } + + wnrm = (real*)malloc(f2cmax(*m,*n)*sizeof(real)); +/* Use unblocked code for small problems or if insufficient */ +/* workspace is provided */ + + if (f2cmin(nba,nbb) == 1 || *ldswork < f2cmax(nba,nbb)) { + ctrsyl_(trana, tranb, isgn, m, n, &a[a_offset], lda, &b[b_offset], + ldb, &c__[c_offset], ldc, scale, info); + return; + } + +/* Set constants to control overflow */ + + smlnum = slamch_("S"); + bignum = 1.f / smlnum; + +/* Set local scaling factors. */ + + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + i__2 = nba; + for (k = 1; k <= i__2; ++k) { + swork[k + l * swork_dim1] = 1.f; + } + } + +/* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. */ +/* This scaling is to ensure compatibility with TRSYL and may get flushed. 
*/ + + buf = 1.f; + +/* Compute upper bounds of blocks of A and B */ + + awrk = nbb; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nba; + for (l = k; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*m) + 1; + if (notrna) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (awrk + l) * swork_dim1] = clange_("I", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (awrk + k) * swork_dim1] = clange_("1", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } + } + } + bwrk = nbb + nba; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*n) + 1; + i__2 = nbb; + for (l = k; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + if (notrnb) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (bwrk + l) * swork_dim1] = clange_("I", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (bwrk + k) * swork_dim1] = clange_("1", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } + } + } + + sgn = (real) (*isgn); + q__1.r = sgn, q__1.i = 0.f; + csgn.r = q__1.r, csgn.i = q__1.i; + + if (notrna && notrnb) { + +/* Solve A*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-left corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* M L-1 */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. 
*/ +/* I=K+1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__1 = k * nb; + k2 = f2cmin(i__1,*m) + 1; + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__2 = l * nb; + l2 = f2cmin(i__2,*n) + 1; + + i__2 = k2 - k1; + i__3 = l2 - l1; + ctrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = clange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + for (i__ = k - 1; i__ >= 1; --i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__2 = i__ * nb; + i2 = f2cmin(i__2,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = clange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + csscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + csscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("N", "N", &i__2, &i__3, &i__4, &q__1, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + + } + + i__2 = nbb; + for (j = l + 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__3 = j * nb; + j2 = f2cmin(i__3,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = clange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + q__1.r = -csgn.r, q__1.i = -csgn.i; + cgemm_("N", "N", &i__3, &i__4, &i__5, &q__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (! notrna && notrnb) { + +/* Solve A**H *X + ISGN*X*B = scale*C. 
*/ + +/* The (K,L)th block of X is determined starting from */ +/* upper-left corner column by column by */ + +/* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 L-1 */ +/* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] */ +/* I=1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + + i__3 = k2 - k1; + i__4 = l2 - l1; + ctrsyl_(trana, tranb, isgn, &i__3, &i__4, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__3); + } + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__3 = k2 - k1; + i__4 = l2 - l1; + xnrm = clange_("I", &i__3, &i__4, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__3 = nba; + for (i__ = k + 1; i__ <= i__3; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__4 = i__ * nb; + i2 = f2cmin(i__4,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = i2 - i1; + i__5 = l2 - l1; + cnrm = clange_("I", &i__4, &i__5, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( I, L ) and C( K, L). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + csscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = i2 - i1; + csscal_(&i__5, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__4 = i2 - i1; + i__5 = l2 - l1; + i__6 = k2 - k1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("C", "N", &i__4, &i__5, &i__6, &q__1, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + } + + i__3 = nbb; + for (j = l + 1; j <= i__3; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = k2 - k1; + i__5 = j2 - j1; + cnrm = clange_("I", &i__4, &i__5, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__4 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + csscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = j2 - 1; + for (jj = j1; jj <= i__4; ++jj) { + i__5 = k2 - k1; + csscal_(&i__5, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__4 = k2 - k1; + i__5 = j2 - j1; + i__6 = l2 - l1; + q__1.r = -csgn.r, q__1.i = -csgn.i; + cgemm_("N", "N", &i__4, &i__5, &i__6, &q__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (! notrna && ! notrnb) { + +/* Solve A**H *X + ISGN*X*B**H = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* top-right corner column by column by */ + +/* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 N */ +/* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. 
*/ +/* I=1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__2 = l * nb; + l2 = f2cmin(i__2,*n) + 1; + + i__2 = k2 - k1; + i__3 = l2 - l1; + ctrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = clange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__2 = nba; + for (i__ = k + 1; i__ <= i__2; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = i2 - i1; + i__4 = l2 - l1; + cnrm = clange_("I", &i__3, &i__4, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = i2 - i1; + csscal_(&i__4, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__3 = i2 - i1; + i__4 = l2 - l1; + i__5 = k2 - k1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("C", "N", &i__3, &i__4, &i__5, &q__1, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + } + + i__2 = l - 1; + for (j = 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__3 = j * nb; + j2 = f2cmin(i__3,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = clange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + q__1.r = -csgn.r, q__1.i = -csgn.i; + cgemm_("N", "C", &i__3, &i__4, &i__5, &q__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (notrna && ! notrnb) { + +/* Solve A*X + ISGN*X*B**H = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-right corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) */ + +/* Where */ +/* M N */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. 
*/ +/* I=K+1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__1 = k * nb; + k2 = f2cmin(i__1,*m) + 1; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__1 = l * nb; + l2 = f2cmin(i__1,*n) + 1; + + i__1 = k2 - k1; + i__2 = l2 - l1; + ctrsyl_(trana, tranb, isgn, &i__1, &i__2, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__1 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__1); + } + i__1 = nbb; + for (jj = 1; jj <= i__1; ++jj) { + i__2 = nba; + for (ll = 1; ll <= i__2; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__3 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b18, &i__3); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__1 = k2 - k1; + i__2 = l2 - l1; + xnrm = clange_("I", &i__1, &i__2, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__1 = k - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__2 = i__ * nb; + i2 = f2cmin(i__2,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = clange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = k2 - k1; + csscal_(&i__3, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + csscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("N", "N", &i__2, &i__3, &i__4, &q__1, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + + } + + i__1 = l - 1; + for (j = 1; j <= i__1; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = k2 - k1; + i__3 = j2 - j1; + cnrm = clange_("I", &i__2, &i__3, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + csscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = j2 - 1; + for (jj = j1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + csscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__2 = k2 - k1; + i__3 = j2 - j1; + i__4 = l2 - l1; + q__1.r = -csgn.r, q__1.i = -csgn.i; + cgemm_("N", "C", &i__2, &i__3, &i__4, &q__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + + } + + free(wnrm); + +/* Reduce local scaling factors */ + + *scale = swork[swork_dim1 + 1]; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { +/* Computing MIN */ + r__1 = *scale, r__2 = swork[k + l * swork_dim1]; + *scale = f2cmin(r__1,r__2); + } + } + if (*scale == 0.f) { + +/* The magnitude of the largest entry of the solution is larger */ +/* than the product of BIGNUM**2 and cannot be represented in the */ +/* form (1/SCALE)*X if SCALE is REAL. Set SCALE to */ +/* zero and give up. 
*/ + + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + return; + } + +/* Realize consistent scaling */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + scal = *scale / swork[k + l * swork_dim1]; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], &c__1); + } + } + } + } + + if (buf != 1.f && buf > 0.f) { + +/* Decrease SCALE as much as possible. */ + +/* Computing MIN */ + r__1 = *scale / smlnum, r__2 = 1.f / buf; + scaloc = f2cmin(r__1,r__2); + buf *= scaloc; + *scale /= scaloc; + } + + if (buf != 1.f && buf > 0.f) { + +/* In case of overly aggressive scaling during the computation, */ +/* flushing of the global scale factor may be prevented by */ +/* undoing some of the scaling. This step is to ensure that */ +/* this routine flushes only scale factors that TRSYL also */ +/* flushes and be usable as a drop-in replacement. */ + +/* How much can the normwise largest entry be upscaled? */ + +/* Computing MAX */ + i__1 = c_dim1 + 1; + r__3 = (r__1 = c__[i__1].r, abs(r__1)), r__4 = (r__2 = r_imag(&c__[ + c_dim1 + 1]), abs(r__2)); + scal = f2cmax(r__3,r__4); + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = *n; + for (l = 1; l <= i__2; ++l) { +/* Computing MAX */ + i__3 = k + l * c_dim1; + r__3 = scal, r__4 = (r__1 = c__[i__3].r, abs(r__1)), r__3 = + f2cmax(r__3,r__4), r__4 = (r__2 = r_imag(&c__[k + l * + c_dim1]), abs(r__2)); + scal = f2cmax(r__3,r__4); + } + } + +/* Increase BUF as close to 1 as possible and apply scaling. 
*/ + +/* Computing MIN */ + r__1 = bignum / scal, r__2 = 1.f / buf; + scaloc = f2cmin(r__1,r__2); + buf *= scaloc; + clascl_("G", &c_n1, &c_n1, &c_b106, &scaloc, m, n, &c__[c_offset], + ldc, &iinfo); + } + +/* Combine with buffer scaling factor. SCALE will be flushed if */ +/* BUF is less than one here. */ + + *scale *= buf; + +/* Restore workspace dimensions */ + + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + + return; + +/* End of CTRSYL3 */ + +} /* ctrsyl3_ */ + diff --git a/lapack-netlib/SRC/ctrsyl3.f b/lapack-netlib/SRC/ctrsyl3.f new file mode 100644 index 0000000000..586dc0207f --- /dev/null +++ b/lapack-netlib/SRC/ctrsyl3.f @@ -0,0 +1,1142 @@ +*> \brief \b CTRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> CTRSYL3 solves the complex Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**H, and A and B are both upper triangular. A is +*> M-by-M and B is N-by-N; the right hand side C and the solution X are +*> M-by-N; and scale is an output scale factor, set <= 1 to avoid +*> overflow in X. +*> +*> This is the block version of the algorithm. 
+*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'C': op(A) = A**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'C': op(B) = B**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,M) +*> The upper triangular matrix A. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is COMPLEX array, dimension (LDB,N) +*> The upper triangular matrix B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL +*> The scale factor, scale, set <= 1 to avoid overflow in X. 
+*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is REAL array, dimension (MAX(2, ROWS), MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. +*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +*> \ingroup complexSYcomputational +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE CTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, SWORK, LDSWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, LDSWORK, M, N + REAL SCALE +* .. +* .. Array Arguments .. 
+ COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) + REAL SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB + REAL ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM + COMPLEX CSGN +* .. +* .. Local Arrays .. + REAL WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL CLANGE, SLAMCH, SLARMM + EXTERNAL CLANGE, ILAENV, LSAME, SLAMCH, SLARMM +* .. +* .. External Subroutines .. + EXTERNAL CSSCAL, CGEMM, CLASCL, CTRSYL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, AIMAG, EXPONENT, MAX, MIN, REAL +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX( 8, ILAENV( 1, 'CTRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LDSWORK.EQ.-1 ) + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT. LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT. LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. 
ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CTRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + SCALE = ONE + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems or if insufficient +* workspace is provided +* + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + CALL CTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = SLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Set local scaling factors. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = K, NBA + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, M ) + 1 + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = CLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = CLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, N ) + 1 + DO L = K, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = CLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = CLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = REAL( ISGN ) + CSGN = CMPLX( SGN, ZERO ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. 
+* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. 
+ SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). 
+* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL.NE.ONE ) THEN + DO JJ = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL.NE.ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. 
The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. 
.NOT.NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. 
+ SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. 
.NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. 
+ SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is REAL. Set SCALE to +* zero and give up. +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF +* + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? 
+* + SCAL = MAX( ABS( REAL( C( 1, 1 ) ) ), + $ ABS( AIMAG( C ( 1, 1 ) ) ) ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( REAL ( C( K, L ) ) ), + $ ABS( AIMAG ( C( K, L ) ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. +* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL CLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IINFO ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of CTRSYL3 +* + END diff --git a/lapack-netlib/SRC/ctrti2.c b/lapack-netlib/SRC/ctrti2.c index e15206f640..6622f870f7 100644 --- a/lapack-netlib/SRC/ctrti2.c +++ b/lapack-netlib/SRC/ctrti2.c @@ -624,7 +624,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ctrti2_(char *uplo, char *diag, integer *n, complex *a, +/* Subroutine */ void ctrti2_(char *uplo, char *diag, integer *n, complex *a, integer *lda, integer *info) { /* System generated locals */ @@ -633,12 +633,13 @@ f"> */ /* Local variables */ integer j; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int ctrmv_(char *, char *, char *, integer *, - complex *, integer *, complex *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void ctrmv_(char *, char *, char *, integer *, + complex *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical nounit; complex ajj; @@ -675,7 +676,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTRTI2", &i__1, (ftnlen)6); - return 0; + return; } if (upper) { @@ -735,7 +736,7 @@ f"> */ } } - return 
0; + return; /* End of CTRTI2 */ diff --git a/lapack-netlib/SRC/ctrtri.c b/lapack-netlib/SRC/ctrtri.c index f58ea37070..7b4b9191c2 100644 --- a/lapack-netlib/SRC/ctrtri.c +++ b/lapack-netlib/SRC/ctrtri.c @@ -625,7 +625,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ctrtri_(char *uplo, char *diag, integer *n, complex *a, +/* Subroutine */ void ctrtri_(char *uplo, char *diag, integer *n, complex *a, integer *lda, integer *info) { /* System generated locals */ @@ -637,13 +637,13 @@ f"> */ /* Local variables */ integer j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrmm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *), ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); logical upper; - extern /* Subroutine */ int ctrti2_(char *, char *, integer *, complex *, + extern /* Subroutine */ void ctrti2_(char *, char *, integer *, complex *, integer *, integer *); integer jb, nb, nn; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -684,13 +684,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTRTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check for singularity if non-unit. 
*/ @@ -700,7 +700,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info + *info * a_dim1; if (a[i__2].r == 0.f && a[i__2].i == 0.f) { - return 0; + return; } /* L10: */ } @@ -784,7 +784,7 @@ f"> */ } } - return 0; + return; /* End of CTRTRI */ diff --git a/lapack-netlib/SRC/ctrtrs.c b/lapack-netlib/SRC/ctrtrs.c index cab63a1871..024369220f 100644 --- a/lapack-netlib/SRC/ctrtrs.c +++ b/lapack-netlib/SRC/ctrtrs.c @@ -652,7 +652,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ctrtrs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void ctrtrs_(char *uplo, char *trans, char *diag, integer *n, integer *nrhs, complex *a, integer *lda, complex *b, integer *ldb, integer *info) { @@ -661,10 +661,10 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, - integer *), xerbla_(char *, - integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); logical nounit; @@ -709,13 +709,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTRTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check for singularity. 
*/ @@ -725,7 +725,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info + *info * a_dim1; if (a[i__2].r == 0.f && a[i__2].i == 0.f) { - return 0; + return; } /* L10: */ } @@ -737,7 +737,7 @@ f"> */ ctrsm_("Left", uplo, trans, diag, n, nrhs, &c_b2, &a[a_offset], lda, &b[ b_offset], ldb); - return 0; + return; /* End of CTRTRS */ diff --git a/lapack-netlib/SRC/ctrttf.c b/lapack-netlib/SRC/ctrttf.c index 7ca85eabc5..0f735e92e0 100644 --- a/lapack-netlib/SRC/ctrttf.c +++ b/lapack-netlib/SRC/ctrttf.c @@ -726,7 +726,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctrttf_(char *transr, char *uplo, integer *n, complex *a, +/* Subroutine */ void ctrttf_(char *transr, char *uplo, integer *n, complex *a, integer *lda, complex *arf, integer *info) { /* System generated locals */ @@ -776,7 +776,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTRTTF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -790,7 +790,7 @@ f"> */ arf[0].r = q__1.r, arf[0].i = q__1.i; } } - return 0; + return; } /* Size of array ARF(1:2,0:nt-1) */ @@ -1122,7 +1122,7 @@ f"> */ } - return 0; + return; /* End of CTRTTF */ diff --git a/lapack-netlib/SRC/ctrttp.c b/lapack-netlib/SRC/ctrttp.c index ea2453593d..b6487a9438 100644 --- a/lapack-netlib/SRC/ctrttp.c +++ b/lapack-netlib/SRC/ctrttp.c @@ -614,7 +614,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ctrttp_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void ctrttp_(char *uplo, integer *n, complex *a, integer *lda, complex *ap, integer *info) { /* System generated locals */ @@ -657,7 +657,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTRTTP", &i__1, (ftnlen)6); - return 0; + return; } if (lower) { @@ -687,7 +687,7 @@ f"> */ } - return 0; + return; /* End of CTRTTP */ diff --git 
a/lapack-netlib/SRC/ctzrzf.c b/lapack-netlib/SRC/ctzrzf.c index 4be059dc4f..89e13e78ce 100644 --- a/lapack-netlib/SRC/ctzrzf.c +++ b/lapack-netlib/SRC/ctzrzf.c @@ -667,7 +667,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ctzrzf_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void ctzrzf_(integer *m, integer *n, complex *a, integer *lda, complex *tau, complex *work, integer *lwork, integer *info) { /* System generated locals */ @@ -675,13 +675,14 @@ f"> */ /* Local variables */ integer i__, nbmin, m1, ib, nb, ki, kk, mu, nx; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), clarzb_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void clarzb_( char *, char *, char *, char *, integer *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int clarzt_(char *, char *, integer *, integer *, + extern /* Subroutine */ void clarzt_(char *, char *, integer *, integer *, complex *, integer *, complex *, complex *, integer *), clatrz_(integer *, integer *, integer *, complex *, integer *, complex *, complex *); integer lwkmin, ldwork, lwkopt; @@ -741,15 +742,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CTZRZF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0) { - return 0; + return; } else if (*m == *n) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { @@ -757,7 +758,7 @@ f"> */ tau[i__2].r = 0.f, tau[i__2].i = 0.f; /* L10: */ } - return 0; + return; } nbmin = 2; @@ -855,7 +856,7 @@ f"> */ work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CTZRZF */ diff --git a/lapack-netlib/SRC/cunbdb.c 
b/lapack-netlib/SRC/cunbdb.c index 853ffc0dab..0e4473348a 100644 --- a/lapack-netlib/SRC/cunbdb.c +++ b/lapack-netlib/SRC/cunbdb.c @@ -798,7 +798,7 @@ f"> */ /* > Algorithms, 50(1):33-65, 2009. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cunbdb_(char *trans, char *signs, integer *m, integer *p, +/* Subroutine */ void cunbdb_(char *trans, char *signs, integer *m, integer *p, integer *q, complex *x11, integer *ldx11, complex *x12, integer * ldx12, complex *x21, integer *ldx21, complex *x22, integer *ldx22, real *theta, real *phi, complex *taup1, complex *taup2, complex * @@ -813,18 +813,18 @@ f"> */ /* Local variables */ logical colmajor; integer lworkmin, lworkopt, i__; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), clarf_(char *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, + extern /* Subroutine */ void caxpy_(integer *, complex *, complex *, integer *, complex *, integer *); real z1, z2, z3, z4; extern real scnrm2_(integer *, complex *, integer *); - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical lquery; - extern /* Subroutine */ int clarfgp_(integer *, complex *, complex *, + extern /* Subroutine */ void clarfgp_(integer *, complex *, complex *, integer *, complex *); @@ -930,9 +930,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("xORBDB", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Handle column-major and row-major separately */ @@ -1421,7 +1421,7 @@ f"> */ } - return 0; + return; /* End of CUNBDB */ diff 
--git a/lapack-netlib/SRC/cunbdb1.c b/lapack-netlib/SRC/cunbdb1.c index c294022f37..124443921a 100644 --- a/lapack-netlib/SRC/cunbdb1.c +++ b/lapack-netlib/SRC/cunbdb1.c @@ -713,7 +713,7 @@ static integer c__1 = 1; /* > Algorithms, 50(1):33-65, 2009. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cunbdb1_(integer *m, integer *p, integer *q, complex * +/* Subroutine */ void cunbdb1_(integer *m, integer *p, integer *q, complex * x11, integer *ldx11, complex *x21, integer *ldx21, real *theta, real * phi, complex *taup1, complex *taup2, complex *tauq1, complex *work, integer *lwork, integer *info) @@ -729,20 +729,20 @@ static integer c__1 = 1; real c__; integer i__; real s; - extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex * + extern /* Subroutine */ void clarf_(char *, integer *, integer *, complex * , integer *, complex *, complex *, integer *, complex *); integer ilarf, llarf, childinfo; - extern /* Subroutine */ int csrot_(integer *, complex *, integer *, + extern /* Subroutine */ void csrot_(integer *, complex *, integer *, complex *, integer *, real *, real *); extern real scnrm2_(integer *, complex *, integer *); - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical lquery; - extern /* Subroutine */ int cunbdb5_(integer *, integer *, integer *, + extern /* Subroutine */ void cunbdb5_(integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); integer iorbdb5, lorbdb5; - extern /* Subroutine */ int clarfgp_(integer *, complex *, complex *, + extern /* Subroutine */ void clarfgp_(integer *, complex *, complex *, integer *, complex *); @@ -813,9 +813,9 @@ static integer c__1 = 1; if (*info != 0) { i__1 = 
-(*info); xerbla_("CUNBDB1", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Reduce columns 1, ..., Q of X11 and X21 */ @@ -895,7 +895,7 @@ static integer c__1 = 1; } - return 0; + return; /* End of CUNBDB1 */ diff --git a/lapack-netlib/SRC/cunbdb2.c b/lapack-netlib/SRC/cunbdb2.c index 878b1a624b..982e05efac 100644 --- a/lapack-netlib/SRC/cunbdb2.c +++ b/lapack-netlib/SRC/cunbdb2.c @@ -715,7 +715,7 @@ static integer c__1 = 1; /* > Algorithms, 50(1):33-65, 2009. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cunbdb2_(integer *m, integer *p, integer *q, complex * +/* Subroutine */ void cunbdb2_(integer *m, integer *p, integer *q, complex * x11, integer *ldx11, complex *x21, integer *ldx21, real *theta, real * phi, complex *taup1, complex *taup2, complex *tauq1, complex *work, integer *lwork, integer *info) @@ -731,21 +731,21 @@ static integer c__1 = 1; real c__; integer i__; real s; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), clarf_(char *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *); integer ilarf, llarf, childinfo; - extern /* Subroutine */ int csrot_(integer *, complex *, integer *, + extern /* Subroutine */ void csrot_(integer *, complex *, integer *, complex *, integer *, real *, real *); extern real scnrm2_(integer *, complex *, integer *); - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical lquery; - extern /* Subroutine */ int cunbdb5_(integer *, integer *, integer *, + extern /* Subroutine */ void cunbdb5_(integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer 
*, complex *, integer *, integer *); integer iorbdb5, lorbdb5; - extern /* Subroutine */ int clarfgp_(integer *, complex *, complex *, + extern /* Subroutine */ void clarfgp_(integer *, complex *, complex *, integer *, complex *); @@ -815,9 +815,9 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("CUNBDB2", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Reduce rows 1, ..., P of X11 and X21 */ @@ -913,7 +913,7 @@ static integer c__1 = 1; x21[i__ + (i__ + 1) * x21_dim1], ldx21, &work[ilarf]); } - return 0; + return; /* End of CUNBDB2 */ diff --git a/lapack-netlib/SRC/cunbdb2.f b/lapack-netlib/SRC/cunbdb2.f index db238f9256..b45db61003 100644 --- a/lapack-netlib/SRC/cunbdb2.f +++ b/lapack-netlib/SRC/cunbdb2.f @@ -122,14 +122,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is COMPLEX array, dimension (P) +*> TAUP1 is COMPLEX array, dimension (P-1) *> The scalar factors of the elementary reflectors that define *> P1. *> \endverbatim *> *> \param[out] TAUP2 *> \verbatim -*> TAUP2 is COMPLEX array, dimension (M-P) +*> TAUP2 is COMPLEX array, dimension (Q) *> The scalar factors of the elementary reflectors that define *> P2. *> \endverbatim diff --git a/lapack-netlib/SRC/cunbdb3.c b/lapack-netlib/SRC/cunbdb3.c index 0718d7c0cd..ebff11d710 100644 --- a/lapack-netlib/SRC/cunbdb3.c +++ b/lapack-netlib/SRC/cunbdb3.c @@ -713,7 +713,7 @@ static integer c__1 = 1; /* > Algorithms, 50(1):33-65, 2009. 
*/ /* > */ /* ===================================================================== */ -/* Subroutine */ int cunbdb3_(integer *m, integer *p, integer *q, complex * +/* Subroutine */ void cunbdb3_(integer *m, integer *p, integer *q, complex * x11, integer *ldx11, complex *x21, integer *ldx21, real *theta, real * phi, complex *taup1, complex *taup2, complex *tauq1, complex *work, integer *lwork, integer *info) @@ -729,20 +729,20 @@ static integer c__1 = 1; real c__; integer i__; real s; - extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex * + extern /* Subroutine */ void clarf_(char *, integer *, integer *, complex * , integer *, complex *, complex *, integer *, complex *); integer ilarf, llarf, childinfo; - extern /* Subroutine */ int csrot_(integer *, complex *, integer *, + extern /* Subroutine */ void csrot_(integer *, complex *, integer *, complex *, integer *, real *, real *); extern real scnrm2_(integer *, complex *, integer *); - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical lquery; - extern /* Subroutine */ int cunbdb5_(integer *, integer *, integer *, + extern /* Subroutine */ void cunbdb5_(integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); integer iorbdb5, lorbdb5; - extern /* Subroutine */ int clarfgp_(integer *, complex *, complex *, + extern /* Subroutine */ void clarfgp_(integer *, complex *, complex *, integer *, complex *); @@ -812,9 +812,9 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("CUNBDB3", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Reduce rows 1, ..., M-P of X11 and X21 */ @@ -909,7 +909,7 @@ static integer c__1 = 1; x11[i__ + (i__ + 1) * x11_dim1], ldx11, 
&work[ilarf]); } - return 0; + return; /* End of CUNBDB3 */ diff --git a/lapack-netlib/SRC/cunbdb4.c b/lapack-netlib/SRC/cunbdb4.c index 045e3e6262..d9e1e3df95 100644 --- a/lapack-netlib/SRC/cunbdb4.c +++ b/lapack-netlib/SRC/cunbdb4.c @@ -724,7 +724,7 @@ static integer c__1 = 1; /* > Algorithms, 50(1):33-65, 2009. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cunbdb4_(integer *m, integer *p, integer *q, complex * +/* Subroutine */ void cunbdb4_(integer *m, integer *p, integer *q, complex * x11, integer *ldx11, complex *x21, integer *ldx21, real *theta, real * phi, complex *taup1, complex *taup2, complex *tauq1, complex *phantom, complex *work, integer *lwork, integer *info) @@ -740,21 +740,21 @@ static integer c__1 = 1; real c__; integer i__, j; real s; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), clarf_(char *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *); integer ilarf, llarf, childinfo; - extern /* Subroutine */ int csrot_(integer *, complex *, integer *, + extern /* Subroutine */ void csrot_(integer *, complex *, integer *, complex *, integer *, real *, real *); extern real scnrm2_(integer *, complex *, integer *); - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical lquery; - extern /* Subroutine */ int cunbdb5_(integer *, integer *, integer *, + extern /* Subroutine */ void cunbdb5_(integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); integer iorbdb5, lorbdb5; - extern /* Subroutine */ int clarfgp_(integer *, complex *, complex *, + extern /* Subroutine */ void 
clarfgp_(integer *, complex *, complex *, integer *, complex *); @@ -827,9 +827,9 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("CUNBDB4", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Reduce columns 1, ..., M-Q of X11 and X21 */ @@ -980,7 +980,7 @@ static integer c__1 = 1; clacgv_(&i__2, &x21[*m - *q + i__ - *p + i__ * x21_dim1], ldx21); } - return 0; + return; /* End of CUNBDB4 */ diff --git a/lapack-netlib/SRC/cunbdb4.f b/lapack-netlib/SRC/cunbdb4.f index e6afd89c30..117f23d08d 100644 --- a/lapack-netlib/SRC/cunbdb4.f +++ b/lapack-netlib/SRC/cunbdb4.f @@ -124,14 +124,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is COMPLEX array, dimension (P) +*> TAUP1 is COMPLEX array, dimension (M-Q) *> The scalar factors of the elementary reflectors that define *> P1. *> \endverbatim *> *> \param[out] TAUP2 *> \verbatim -*> TAUP2 is COMPLEX array, dimension (M-P) +*> TAUP2 is COMPLEX array, dimension (M-Q) *> The scalar factors of the elementary reflectors that define *> P2. 
*> \endverbatim diff --git a/lapack-netlib/SRC/cunbdb5.c b/lapack-netlib/SRC/cunbdb5.c index a6cb9b4e02..2ab0644ef3 100644 --- a/lapack-netlib/SRC/cunbdb5.c +++ b/lapack-netlib/SRC/cunbdb5.c @@ -664,7 +664,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cunbdb5_(integer *m1, integer *m2, integer *n, complex * +/* Subroutine */ void cunbdb5_(integer *m1, integer *m2, integer *n, complex * x1, integer *incx1, complex *x2, integer *incx2, complex *q1, integer *ldq1, complex *q2, integer *ldq2, complex *work, integer *lwork, integer *info) @@ -676,7 +676,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ integer i__, j, childinfo; extern real scnrm2_(integer *, complex *, integer *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), cunbdb6_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void cunbdb6_( integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, integer *); @@ -727,7 +728,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("CUNBDB5", &i__1, (ftnlen)7); - return 0; + return; } /* Project X onto the orthogonal complement of Q */ @@ -740,7 +741,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ r__1 = scnrm2_(m1, &x1[1], incx1); r__2 = scnrm2_(m2, &x2[1], incx2); if (r__1 != 0.f || r__2 != 0.f) { - return 0; + return; } /* Project each standard basis vector e_1,...,e_M1 in turn, stopping */ @@ -765,7 +766,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ r__1 = scnrm2_(m1, &x1[1], incx1); r__2 = scnrm2_(m2, &x2[1], incx2); if (r__1 != 0.f || r__2 != 0.f) 
{ - return 0; + return; } } @@ -791,11 +792,11 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ r__1 = scnrm2_(m1, &x1[1], incx1); r__2 = scnrm2_(m2, &x2[1], incx2); if (r__1 != 0.f || r__2 != 0.f) { - return 0; + return; } } - return 0; + return; /* End of CUNBDB5 */ diff --git a/lapack-netlib/SRC/cunbdb6.c b/lapack-netlib/SRC/cunbdb6.c index 06d76579d6..6023d70d49 100644 --- a/lapack-netlib/SRC/cunbdb6.c +++ b/lapack-netlib/SRC/cunbdb6.c @@ -669,7 +669,7 @@ static integer c__1 = 1; /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cunbdb6_(integer *m1, integer *m2, integer *n, complex * +/* Subroutine */ void cunbdb6_(integer *m1, integer *m2, integer *n, complex * x1, integer *incx1, complex *x2, integer *incx2, complex *q1, integer *ldq1, complex *q2, integer *ldq2, complex *work, integer *lwork, integer *info) @@ -680,9 +680,11 @@ static integer c__1 = 1; /* Local variables */ integer i__; - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * - , integer *), xerbla_(char *, integer *, ftnlen), classq_( + , integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void classq_( integer *, complex *, integer *, real *, real *); real normsq1, normsq2, scl1, scl2, ssq1, ssq2; @@ -732,7 +734,7 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("CUNBDB6", &i__1, (ftnlen)7); - return 0; + return; } /* First, project X onto the orthogonal complement of Q's column */ @@ -786,11 +788,11 @@ static integer c__1 = 1; /* Otherwise, project again. 
*/ if (normsq2 >= normsq1 * .01f) { - return 0; + return; } if (normsq2 == 0.f) { - return 0; + return; } normsq1 = normsq2; @@ -849,7 +851,7 @@ static integer c__1 = 1; } } - return 0; + return; /* End of CUNBDB6 */ diff --git a/lapack-netlib/SRC/cunbdb6.f b/lapack-netlib/SRC/cunbdb6.f index 7acc99cb8b..b93a389d6b 100644 --- a/lapack-netlib/SRC/cunbdb6.f +++ b/lapack-netlib/SRC/cunbdb6.f @@ -41,10 +41,16 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The columns of Q must be orthonormal. +*> The Euclidean norm of X must be one and the columns of Q must be +*> orthonormal. The orthogonalized vector will be zero if and only if it +*> lies entirely in the range of Q. *> -*> If the projection is zero according to Kahan's "twice is enough" -*> criterion, then the zero vector is returned. +*> The projection is computed with at most two iterations of the +*> classical Gram-Schmidt algorithm, see +*> * L. Giraud, J. Langou, M. Rozložník. "On the round-off error +*> analysis of the Gram-Schmidt algorithm with reorthogonalization." +*> 2002. CERFACS Technical Report No. TR/PA/02/33. URL: +*> https://www.cerfacs.fr/algor/reports/2002/TR_PA_02_33.pdf *> *>\endverbatim * @@ -167,16 +173,19 @@ SUBROUTINE CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, * ===================================================================== * * .. Parameters .. - REAL ALPHASQ, REALONE, REALZERO - PARAMETER ( ALPHASQ = 0.01E0, REALONE = 1.0E0, + REAL ALPHA, REALONE, REALZERO + PARAMETER ( ALPHA = 0.01E0, REALONE = 1.0E0, $ REALZERO = 0.0E0 ) COMPLEX NEGONE, ONE, ZERO PARAMETER ( NEGONE = (-1.0E0,0.0E0), ONE = (1.0E0,0.0E0), $ ZERO = (0.0E0,0.0E0) ) * .. * .. Local Scalars .. - INTEGER I - REAL NORMSQ1, NORMSQ2, SCL1, SCL2, SSQ1, SSQ2 + INTEGER I, IX + REAL EPS, NORM, NORM_NEW, SCL, SSQ +* .. +* .. External Functions .. + REAL SLAMCH * .. * .. External Subroutines .. 
EXTERNAL CGEMV, CLASSQ, XERBLA @@ -211,17 +220,17 @@ SUBROUTINE CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, CALL XERBLA( 'CUNBDB6', -INFO ) RETURN END IF +* + EPS = SLAMCH( 'Precision' ) * * First, project X onto the orthogonal complement of Q's column * space * - SCL1 = REALZERO - SSQ1 = REALONE - CALL CLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL CLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ1 = SCL1**2*SSQ1 + SCL2**2*SSQ2 +* Christoph Conrads: In debugging mode the norm should be computed +* and an assertion added comparing the norm with one. Alas, Fortran +* never made it into 1989 when assert() was introduced into the C +* programming language. + NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N @@ -239,27 +248,31 @@ SUBROUTINE CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, CALL CGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL CLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL CLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL CLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL CLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If projection is sufficiently large in norm, then stop. * If projection is zero, then stop. * Otherwise, project again. * - IF( NORMSQ2 .GE. ALPHASQ*NORMSQ1 ) THEN + IF( NORM_NEW .GE. ALPHA * NORM ) THEN RETURN END IF * - IF( NORMSQ2 .EQ. ZERO ) THEN + IF( NORM_NEW .LE. 
N * EPS * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1( IX ) = ZERO + END DO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2( IX ) = ZERO + END DO RETURN END IF * - NORMSQ1 = NORMSQ2 + NORM = NORM_NEW * DO I = 1, N WORK(I) = ZERO @@ -281,24 +294,22 @@ SUBROUTINE CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, CALL CGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL CLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL CLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL CLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL CLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If second projection is sufficiently large in norm, then do * nothing more. Alternatively, if it shrunk significantly, then * truncate it to zero. * - IF( NORMSQ2 .LT. ALPHASQ*NORMSQ1 ) THEN - DO I = 1, M1 - X1(I) = ZERO + IF( NORM_NEW .LT. ALPHA * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1(IX) = ZERO END DO - DO I = 1, M2 - X2(I) = ZERO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2(IX) = ZERO END DO END IF * @@ -307,4 +318,3 @@ SUBROUTINE CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, * End of CUNBDB6 * END - diff --git a/lapack-netlib/SRC/cuncsd.c b/lapack-netlib/SRC/cuncsd.c index d031815938..30b37d506a 100644 --- a/lapack-netlib/SRC/cuncsd.c +++ b/lapack-netlib/SRC/cuncsd.c @@ -829,7 +829,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cuncsd_(char *jobu1, char *jobu2, char *jobv1t, char * +/* Subroutine */ void cuncsd_(char *jobu1, char *jobu2, char *jobv1t, char * jobv2t, char *trans, char *signs, integer *m, integer *p, integer *q, complex *x11, integer *ldx11, complex *x12, integer *ldx12, complex * x21, integer *ldx21, complex *x22, integer *ldx22, real *theta, @@ -853,27 +853,28 @@ f"> */ integer childinfo, p1, 
q1, lbbcsdworkmin, itaup1, itaup2, itauq1, itauq2, lorbdbworkmin, lrworkmin, lbbcsdworkopt; logical wantu1, wantu2; - extern /* Subroutine */ int cbbcsd_(char *, char *, char *, char *, char * + extern /* Subroutine */ void cbbcsd_(char *, char *, char *, char *, char * , integer *, integer *, integer *, real *, real *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, real *, real *, real *, real *, real *, real *, real *, real *, real *, integer *, integer *); integer lrworkopt, ibbcsd, lorbdbworkopt; - extern /* Subroutine */ int cunbdb_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunbdb_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, real *, real *, complex *, complex *, complex *, complex *, complex *, integer *, integer *); integer iorbdb, lorglqworkmin, lorgqrworkmin; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex - *, integer *, complex *, integer *), xerbla_(char *, - integer *, ftnlen), clapmr_(logical *, integer *, integer *, + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex + *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void clapmr_(logical *, integer *, integer *, complex *, integer *, integer *), clapmt_(logical *, integer *, integer *, complex *, integer *, integer *); integer lorglqworkopt; - extern /* Subroutine */ int cunglq_(integer *, integer *, integer *, + extern /* Subroutine */ void cunglq_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); integer lorgqrworkopt, iorglq; - extern /* Subroutine */ int cungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void cungqr_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); integer iorgqr; char signst[1], transt[1]; @@ -1005,7 +1006,7 
@@ f"> */ ldv1t, &v2t[v2t_offset], ldv2t, &u1[u1_offset], ldu1, &u2[ u2_offset], ldu2, &work[1], lwork, &rwork[1], lrwork, &iwork[ 1], info); - return 0; + return; } /* Work with permutation [ 0 I; I 0 ] * X * [ 0 I; I 0 ] if */ @@ -1025,7 +1026,7 @@ f"> */ u2_offset], ldu2, &u1[u1_offset], ldu1, &v2t[v2t_offset], ldv2t, &v1t[v1t_offset], ldv1t, &work[1], lwork, &rwork[1], lrwork, &iwork[1], info); - return 0; + return; } /* Compute workspace */ @@ -1141,9 +1142,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNCSD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery || lrquery) { - return 0; + return; } /* Transform to bidiagonal block form */ @@ -1322,7 +1323,7 @@ f"> */ } } - return 0; + return; /* End CUNCSD */ diff --git a/lapack-netlib/SRC/cuncsd2by1.c b/lapack-netlib/SRC/cuncsd2by1.c index a4dbb3cc11..8d4dec5215 100644 --- a/lapack-netlib/SRC/cuncsd2by1.c +++ b/lapack-netlib/SRC/cuncsd2by1.c @@ -767,7 +767,7 @@ by1.f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cuncsd2by1_(char *jobu1, char *jobu2, char *jobv1t, +/* Subroutine */ void cuncsd2by1_(char *jobu1, char *jobu2, char *jobv1t, integer *m, integer *p, integer *q, complex *x11, integer *ldx11, complex *x21, integer *ldx21, real *theta, complex *u1, integer *ldu1, complex *u2, integer *ldu2, complex *v1t, integer *ldv1t, complex * @@ -783,36 +783,37 @@ by1.f"> */ complex cdum[1] /* was [1][1] */; integer iphi, lworkmin, lworkopt, i__, j, r__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); integer childinfo, lorglqmin, lorgqrmin, lorglqopt, lrworkmin, itaup1, itaup2, itauq1, lorgqropt; logical wantu1, wantu2; - extern /* Subroutine */ int cbbcsd_(char *, char *, char *, char *, char * + extern /* Subroutine */ void cbbcsd_(char *, 
char *, char *, char *, char * , integer *, integer *, integer *, real *, real *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, real *, real *, real *, real *, real *, real *, real *, real *, real *, integer *, integer *); integer lrworkopt, ibbcsd, lbbcsd, iorbdb, lorbdb; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex - *, integer *, complex *, integer *), xerbla_(char *, - integer *, ftnlen), clapmr_(logical *, integer *, integer *, + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex + *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void clapmr_(logical *, integer *, integer *, complex *, integer *, integer *), clapmt_(logical *, integer *, integer *, complex *, integer *, integer *), cunglq_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); integer iorglq; - extern /* Subroutine */ int cungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void cungqr_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); integer lorglq, iorgqr, lorgqr; - extern /* Subroutine */ int cunbdb1_(integer *, integer *, integer *, + extern /* Subroutine */ void cunbdb1_(integer *, integer *, integer *, complex *, integer *, complex *, integer *, real *, real *, complex *, complex *, complex *, complex *, integer *, integer *), cunbdb2_(integer *, integer *, integer *, complex *, integer *, complex *, integer *, real *, real *, complex *, complex *, complex *, complex *, integer *, integer *); logical lquery; - extern /* Subroutine */ int cunbdb3_(integer *, integer *, integer *, + extern /* Subroutine */ void cunbdb3_(integer *, integer *, integer *, complex *, integer *, complex *, integer *, real *, real *, complex *, complex *, complex *, complex *, integer *, integer *), cunbdb4_(integer *, integer *, integer *, complex *, integer *, 
@@ -1147,9 +1148,9 @@ by1.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNCSD2BY1", &i__1, (ftnlen)10); - return 0; + return; } else if (lquery) { - return 0; + return; } lorgqr = *lwork - iorgqr + 1; lorglq = *lwork - iorglq + 1; @@ -1473,7 +1474,7 @@ by1.f"> */ } } - return 0; + return; /* End of CUNCSD2BY1 */ diff --git a/lapack-netlib/SRC/cung2l.c b/lapack-netlib/SRC/cung2l.c index 1e03d57859..ab80fd3f81 100644 --- a/lapack-netlib/SRC/cung2l.c +++ b/lapack-netlib/SRC/cung2l.c @@ -628,7 +628,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cung2l_(integer *m, integer *n, integer *k, complex *a, +/* Subroutine */ void cung2l_(integer *m, integer *n, integer *k, complex *a, integer *lda, complex *tau, complex *work, integer *info) { /* System generated locals */ @@ -637,7 +637,7 @@ f"> */ /* Local variables */ integer i__, j, l; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), clarf_(char *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *); integer ii; @@ -676,13 +676,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNG2L", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } /* Initialise columns 1:n-k to columns of the unit matrix */ @@ -731,7 +731,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of CUNG2L */ diff --git a/lapack-netlib/SRC/cung2r.c b/lapack-netlib/SRC/cung2r.c index 20dd519547..d88e9285bc 100644 --- a/lapack-netlib/SRC/cung2r.c +++ b/lapack-netlib/SRC/cung2r.c @@ -627,7 +627,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cung2r_(integer *m, integer *n, integer *k, complex *a, +/* Subroutine */ void cung2r_(integer *m, 
integer *n, integer *k, complex *a, integer *lda, complex *tau, complex *work, integer *info) { /* System generated locals */ @@ -636,10 +636,10 @@ f"> */ /* Local variables */ integer i__, j, l; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), clarf_(char *, integer *, integer *, complex *, - integer *, complex *, complex *, integer *, complex *), - xerbla_(char *, integer *, ftnlen); + integer *, complex *, complex *, integer *, complex *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -674,13 +674,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNG2R", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } /* Initialise columns k+1:n to columns of the unit matrix */ @@ -731,7 +731,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of CUNG2R */ diff --git a/lapack-netlib/SRC/cungbr.c b/lapack-netlib/SRC/cungbr.c index f0044e76c7..f25d67f48c 100644 --- a/lapack-netlib/SRC/cungbr.c +++ b/lapack-netlib/SRC/cungbr.c @@ -670,7 +670,7 @@ f"> */ /* > \ingroup complexGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int cungbr_(char *vect, integer *m, integer *n, integer *k, +/* Subroutine */ void cungbr_(char *vect, integer *m, integer *n, integer *k, complex *a, integer *lda, complex *tau, complex *work, integer *lwork, integer *info) { @@ -683,7 +683,8 @@ f"> */ integer iinfo; logical wantq; integer mn; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), cunglq_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void cunglq_( integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), cungqr_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, @@ -766,17 
+767,17 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNGBR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } if (wantq) { @@ -883,7 +884,7 @@ f"> */ } } work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CUNGBR */ diff --git a/lapack-netlib/SRC/cungbr.f b/lapack-netlib/SRC/cungbr.f index c973d0b0a7..a31a53d790 100644 --- a/lapack-netlib/SRC/cungbr.f +++ b/lapack-netlib/SRC/cungbr.f @@ -233,7 +233,7 @@ SUBROUTINE CUNGBR( VECT, M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF END IF END IF - LWKOPT = REAL( WORK( 1 ) ) + LWKOPT = INT( WORK( 1 ) ) LWKOPT = MAX (LWKOPT, MN) END IF * diff --git a/lapack-netlib/SRC/cunghr.c b/lapack-netlib/SRC/cunghr.c index c99c4b0538..e4c398d94b 100644 --- a/lapack-netlib/SRC/cunghr.c +++ b/lapack-netlib/SRC/cunghr.c @@ -640,7 +640,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cunghr_(integer *n, integer *ilo, integer *ihi, complex * +/* Subroutine */ void cunghr_(integer *n, integer *ilo, integer *ihi, complex * a, integer *lda, complex *tau, complex *work, integer *lwork, integer *info) { @@ -652,7 +652,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int cungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void cungqr_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); integer lwkopt; logical lquery; @@ -702,16 +702,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNGHR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick 
return if possible */ if (*n == 0) { work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } /* Shift the vectors which define the elementary reflectors one */ @@ -774,7 +774,7 @@ f"> */ ilo], &work[1], lwork, &iinfo); } work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CUNGHR */ diff --git a/lapack-netlib/SRC/cungl2.c b/lapack-netlib/SRC/cungl2.c index 1fb8e3164b..310866871e 100644 --- a/lapack-netlib/SRC/cungl2.c +++ b/lapack-netlib/SRC/cungl2.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cungl2_(integer *m, integer *n, integer *k, complex *a, +/* Subroutine */ void cungl2_(integer *m, integer *n, integer *k, complex *a, integer *lda, complex *tau, complex *work, integer *info) { /* System generated locals */ @@ -632,11 +632,11 @@ f"> */ /* Local variables */ integer i__, j, l; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), clarf_(char *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *), - clacgv_(integer *, complex *, integer *), xerbla_(char *, integer - *, ftnlen); + clacgv_(integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -671,13 +671,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNGL2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m <= 0) { - return 0; + return; } if (*k < *m) { @@ -738,7 +738,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of CUNGL2 */ diff --git a/lapack-netlib/SRC/cunglq.c b/lapack-netlib/SRC/cunglq.c index 89f296c51d..d51e1e5f97 100644 --- a/lapack-netlib/SRC/cunglq.c +++ b/lapack-netlib/SRC/cunglq.c @@ -643,7 +643,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* 
===================================================================== */ -/* Subroutine */ int cunglq_(integer *m, integer *n, integer *k, complex *a, +/* Subroutine */ void cunglq_(integer *m, integer *n, integer *k, complex *a, integer *lda, complex *tau, complex *work, integer *lwork, integer * info) { @@ -652,15 +652,16 @@ f"> */ /* Local variables */ integer i__, j, l, nbmin, iinfo; - extern /* Subroutine */ int cungl2_(integer *, integer *, integer *, + extern /* Subroutine */ void cungl2_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *); integer ib, nb, ki, kk; - extern /* Subroutine */ int clarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void clarfb_(char *, char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); integer nx; - extern /* Subroutine */ int clarft_(char *, char *, integer *, integer *, - complex *, integer *, complex *, complex *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clarft_(char *, char *, integer *, integer *, + complex *, integer *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; @@ -706,16 +707,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNGLQ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m <= 0) { work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } nbmin = 2; @@ -836,7 +837,7 @@ f"> */ } work[1].r = (real) iws, work[1].i = 0.f; - return 0; + return; /* End of CUNGLQ */ diff --git a/lapack-netlib/SRC/cungql.c b/lapack-netlib/SRC/cungql.c index 335f3576eb..ab09d8ccfe 100644 --- a/lapack-netlib/SRC/cungql.c +++ b/lapack-netlib/SRC/cungql.c @@ -644,7 +644,7 @@ f"> */ /* > \ingroup 
complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cungql_(integer *m, integer *n, integer *k, complex *a, +/* Subroutine */ void cungql_(integer *m, integer *n, integer *k, complex *a, integer *lda, complex *tau, complex *work, integer *lwork, integer * info) { @@ -653,15 +653,16 @@ f"> */ /* Local variables */ integer i__, j, l, nbmin, iinfo; - extern /* Subroutine */ int cung2l_(integer *, integer *, integer *, + extern /* Subroutine */ void cung2l_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *); integer ib, nb, kk; - extern /* Subroutine */ int clarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void clarfb_(char *, char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); integer nx; - extern /* Subroutine */ int clarft_(char *, char *, integer *, integer *, - complex *, integer *, complex *, complex *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clarft_(char *, char *, integer *, integer *, + complex *, integer *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; @@ -718,15 +719,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNGQL", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } nbmin = 2; @@ -846,7 +847,7 @@ f"> */ } work[1].r = (real) iws, work[1].i = 0.f; - return 0; + return; /* End of CUNGQL */ diff --git a/lapack-netlib/SRC/cungqr.c b/lapack-netlib/SRC/cungqr.c index 01205569e2..7da1ebb448 100644 --- a/lapack-netlib/SRC/cungqr.c +++ b/lapack-netlib/SRC/cungqr.c @@ -644,7 +644,7 @@ f"> */ /* > \ingroup 
complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cungqr_(integer *m, integer *n, integer *k, complex *a, +/* Subroutine */ void cungqr_(integer *m, integer *n, integer *k, complex *a, integer *lda, complex *tau, complex *work, integer *lwork, integer * info) { @@ -653,15 +653,16 @@ f"> */ /* Local variables */ integer i__, j, l, nbmin, iinfo; - extern /* Subroutine */ int cung2r_(integer *, integer *, integer *, + extern /* Subroutine */ void cung2r_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *); integer ib, nb, ki, kk; - extern /* Subroutine */ int clarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void clarfb_(char *, char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); integer nx; - extern /* Subroutine */ int clarft_(char *, char *, integer *, integer *, - complex *, integer *, complex *, complex *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clarft_(char *, char *, integer *, integer *, + complex *, integer *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; @@ -707,16 +708,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNGQR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } nbmin = 2; @@ -837,7 +838,7 @@ f"> */ } work[1].r = (real) iws, work[1].i = 0.f; - return 0; + return; /* End of CUNGQR */ diff --git a/lapack-netlib/SRC/cungr2.c b/lapack-netlib/SRC/cungr2.c index 6cc3481286..63d99e8547 100644 --- a/lapack-netlib/SRC/cungr2.c +++ b/lapack-netlib/SRC/cungr2.c @@ -624,7 +624,7 @@ f"> 
*/ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cungr2_(integer *m, integer *n, integer *k, complex *a, +/* Subroutine */ void cungr2_(integer *m, integer *n, integer *k, complex *a, integer *lda, complex *tau, complex *work, integer *info) { /* System generated locals */ @@ -633,12 +633,12 @@ f"> */ /* Local variables */ integer i__, j, l; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *), clarf_(char *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *); integer ii; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -673,13 +673,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNGR2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m <= 0) { - return 0; + return; } if (*k < *m) { @@ -738,7 +738,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of CUNGR2 */ diff --git a/lapack-netlib/SRC/cungrq.c b/lapack-netlib/SRC/cungrq.c index 3c12b02eb8..c257eeb8e8 100644 --- a/lapack-netlib/SRC/cungrq.c +++ b/lapack-netlib/SRC/cungrq.c @@ -644,7 +644,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cungrq_(integer *m, integer *n, integer *k, complex *a, +/* Subroutine */ void cungrq_(integer *m, integer *n, integer *k, complex *a, integer *lda, complex *tau, complex *work, integer *lwork, integer * info) { @@ -653,15 +653,16 @@ f"> */ /* Local variables */ integer i__, j, l, nbmin, iinfo; - extern /* Subroutine */ int cungr2_(integer *, integer *, integer *, + 
extern /* Subroutine */ void cungr2_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *); integer ib, nb, ii, kk; - extern /* Subroutine */ int clarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void clarfb_(char *, char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); integer nx; - extern /* Subroutine */ int clarft_(char *, char *, integer *, integer *, - complex *, integer *, complex *, complex *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clarft_(char *, char *, integer *, integer *, + complex *, integer *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; @@ -718,15 +719,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNGRQ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m <= 0) { - return 0; + return; } nbmin = 2; @@ -846,7 +847,7 @@ f"> */ } work[1].r = (real) iws, work[1].i = 0.f; - return 0; + return; /* End of CUNGRQ */ diff --git a/lapack-netlib/SRC/cungtr.c b/lapack-netlib/SRC/cungtr.c index a2d1c83a1a..f640f9fed5 100644 --- a/lapack-netlib/SRC/cungtr.c +++ b/lapack-netlib/SRC/cungtr.c @@ -637,7 +637,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cungtr_(char *uplo, integer *n, complex *a, integer *lda, +/* Subroutine */ void cungtr_(char *uplo, integer *n, complex *a, integer *lda, complex *tau, complex *work, integer *lwork, integer *info) { /* System generated locals */ @@ -652,7 +652,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, 
integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int cungql_(integer *, integer *, integer *, + extern /* Subroutine */ void cungql_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *), cungqr_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, integer *); @@ -719,16 +719,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNGTR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } if (upper) { @@ -809,7 +809,7 @@ f"> */ } } work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CUNGTR */ diff --git a/lapack-netlib/SRC/cungtsqr.c b/lapack-netlib/SRC/cungtsqr.c index c26ac117a6..7b8c1f674a 100644 --- a/lapack-netlib/SRC/cungtsqr.c +++ b/lapack-netlib/SRC/cungtsqr.c @@ -688,7 +688,7 @@ r.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int cungtsqr_(integer *m, integer *n, integer *mb, integer * +/* Subroutine */ void cungtsqr_(integer *m, integer *n, integer *mb, integer * nb, complex *a, integer *lda, complex *t, integer *ldt, complex *work, integer *lwork, integer *info) { @@ -697,17 +697,17 @@ r.f"> */ complex q__1; /* Local variables */ - extern /* Subroutine */ int clamtsqr_(char *, char *, integer *, integer * + extern /* Subroutine */ void clamtsqr_(char *, char *, integer *, integer * , integer *, integer *, integer *, complex *, integer *, complex * , integer *, complex *, integer *, complex *, integer *, integer * ); integer lworkopt, j, iinfo; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); integer lc, lw; - extern /* Subroutine */ int claset_(char *, integer *, integer *, complex - *, complex *, 
complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void claset_(char *, integer *, integer *, complex + *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical lquery; integer ldc, nblocal; @@ -787,11 +787,11 @@ r.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNGTSQR", &i__1, (ftnlen)8); - return 0; + return; } else if (lquery) { q__1.r = (real) lworkopt, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; } /* Quick return if possible */ @@ -799,7 +799,7 @@ r.f"> */ if (f2cmin(*m,*n) == 0) { q__1.r = (real) lworkopt, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; } /* (1) Form explicitly the tall-skinny M-by-N left submatrix Q1_in */ @@ -834,7 +834,7 @@ r.f"> */ q__1.r = (real) lworkopt, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; /* End of CUNGTSQR */ diff --git a/lapack-netlib/SRC/cungtsqr_row.c b/lapack-netlib/SRC/cungtsqr_row.c index 9c644a67f2..02221c7c49 100644 --- a/lapack-netlib/SRC/cungtsqr_row.c +++ b/lapack-netlib/SRC/cungtsqr_row.c @@ -701,7 +701,7 @@ qr_row.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cungtsqr_row_(integer *m, integer *n, integer *mb, +/* Subroutine */ void cungtsqr_row_(integer *m, integer *n, integer *mb, integer *nb, complex *a, integer *lda, complex *t, integer *ldt, complex *work, integer *lwork, integer *info) { @@ -713,13 +713,13 @@ qr_row.f"> */ integer jb_t__, itmp, lworkopt; complex dummy[1] /* was [1][1] */; integer ib_bottom__, ib, kb; - extern /* Subroutine */ int claset_(char *, integer *, integer *, complex - *, complex *, complex *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void claset_(char *, integer *, integer *, complex + *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer mb1, mb2, 
m_plus_one__; logical lquery; integer num_all_row_blocks__, imb, knb, nblocal, kb_last__; - extern /* Subroutine */ int clarfb_gett_(char *, integer *, integer *, + extern /* Subroutine */ void clarfb_gett_(char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); @@ -781,11 +781,11 @@ qr_row.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNGTSQR_ROW", &i__1, (ftnlen)12); - return 0; + return; } else if (lquery) { q__1.r = (real) lworkopt, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; } /* Quick return if possible */ @@ -793,7 +793,7 @@ qr_row.f"> */ if (f2cmin(*m,*n) == 0) { q__1.r = (real) lworkopt, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; } /* (0) Set the upper-triangular part of the matrix A to zero and */ @@ -916,7 +916,7 @@ qr_row.f"> */ q__1.r = (real) lworkopt, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; /* End of CUNGTSQR_ROW */ diff --git a/lapack-netlib/SRC/cunhr_col.c b/lapack-netlib/SRC/cunhr_col.c index d0a54a33c7..f067559532 100644 --- a/lapack-netlib/SRC/cunhr_col.c +++ b/lapack-netlib/SRC/cunhr_col.c @@ -772,7 +772,7 @@ ol.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int cunhr_col_(integer *m, integer *n, integer *nb, complex +/* Subroutine */ void cunhr_col_(integer *m, integer *n, integer *nb, complex *a, integer *lda, complex *t, integer *ldt, complex *d__, integer * info) { @@ -781,13 +781,13 @@ ol.f"> */ complex q__1; /* Local variables */ - extern /* Subroutine */ int claunhr_col_getrfnp_(integer *, integer *, + extern /* Subroutine */ void claunhr_col_getrfnp_(integer *, integer *, complex *, integer *, complex *, integer *); integer nplusone, i__, j; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, 
integer *); integer iinfo; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *), ctrsm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); @@ -839,13 +839,13 @@ ol.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNHR_COL", &i__1, (ftnlen)9); - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* On input, the M-by-N matrix A contains the unitary */ @@ -977,7 +977,7 @@ ol.f"> */ } - return 0; + return; /* End of CUNHR_COL */ diff --git a/lapack-netlib/SRC/cunm22.c b/lapack-netlib/SRC/cunm22.c index 79f509f703..91ae37fb7e 100644 --- a/lapack-netlib/SRC/cunm22.c +++ b/lapack-netlib/SRC/cunm22.c @@ -674,7 +674,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cunm22_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void cunm22_(char *side, char *trans, integer *m, integer *n, integer *n1, integer *n2, complex *q, integer *ldq, complex *c__, integer *ldc, complex *work, integer *lwork, integer *info) { @@ -685,17 +685,17 @@ f"> */ /* Local variables */ logical left; integer i__; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int ctrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ctrmm_(char *, char *, char *, char *, integer *, integer *, complex *, complex *, integer *, complex *, integer *); integer nb, nq, nw; - extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex - *, integer *, complex *, integer *), xerbla_(char *, - 
integer *, ftnlen); + extern /* Subroutine */ void clacpy_(char *, integer *, integer *, complex + *, integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical notran; integer ldwork, lwkopt; logical lquery; @@ -772,16 +772,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNM22", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } /* Degenerate cases (N1 = 0 or N2 = 0) are handled using CTRMM. */ @@ -790,12 +790,12 @@ f"> */ ctrmm_(side, "Upper", trans, "Non-Unit", m, n, &c_b1, &q[q_offset], ldq, &c__[c_offset], ldc); work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } else if (*n2 == 0) { ctrmm_(side, "Lower", trans, "Non-Unit", m, n, &c_b1, &q[q_offset], ldq, &c__[c_offset], ldc); work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } /* Compute the largest chunk size available from the workspace. 
*/ @@ -980,7 +980,7 @@ f"> */ q__1.r = (real) lwkopt, q__1.i = 0.f; work[1].r = q__1.r, work[1].i = q__1.i; - return 0; + return; /* End of CUNM22 */ diff --git a/lapack-netlib/SRC/cunm2l.c b/lapack-netlib/SRC/cunm2l.c index efeb95ed98..004757b924 100644 --- a/lapack-netlib/SRC/cunm2l.c +++ b/lapack-netlib/SRC/cunm2l.c @@ -672,7 +672,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cunm2l_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void cunm2l_(char *side, char *trans, integer *m, integer *n, integer *k, complex *a, integer *lda, complex *tau, complex *c__, integer *ldc, complex *work, integer *info) { @@ -684,7 +684,7 @@ f"> */ logical left; complex taui; integer i__; - extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex * + extern /* Subroutine */ void clarf_(char *, integer *, integer *, complex * , integer *, complex *, complex *, integer *, complex *); extern logical lsame_(char *, char *); integer i1, i2, i3, mi, ni, nq; @@ -744,13 +744,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNM2L", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && notran || ! left && ! 
notran) { @@ -803,7 +803,7 @@ f"> */ a[i__3].r = aii.r, a[i__3].i = aii.i; /* L10: */ } - return 0; + return; /* End of CUNM2L */ diff --git a/lapack-netlib/SRC/cunm2r.c b/lapack-netlib/SRC/cunm2r.c index 8c7bcbe66c..0118b2da14 100644 --- a/lapack-netlib/SRC/cunm2r.c +++ b/lapack-netlib/SRC/cunm2r.c @@ -672,7 +672,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cunm2r_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void cunm2r_(char *side, char *trans, integer *m, integer *n, integer *k, complex *a, integer *lda, complex *tau, complex *c__, integer *ldc, complex *work, integer *info) { @@ -684,7 +684,7 @@ f"> */ logical left; complex taui; integer i__; - extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex * + extern /* Subroutine */ void clarf_(char *, integer *, integer *, complex * , integer *, complex *, complex *, integer *, complex *); extern logical lsame_(char *, char *); integer i1, i2, i3, ic, jc, mi, ni, nq; @@ -744,13 +744,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNM2R", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && ! notran || ! 
left && notran) { @@ -807,7 +807,7 @@ f"> */ a[i__3].r = aii.r, a[i__3].i = aii.i; /* L10: */ } - return 0; + return; /* End of CUNM2R */ diff --git a/lapack-netlib/SRC/cunmbr.c b/lapack-netlib/SRC/cunmbr.c index 92179fe055..34c478e457 100644 --- a/lapack-netlib/SRC/cunmbr.c +++ b/lapack-netlib/SRC/cunmbr.c @@ -711,7 +711,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cunmbr_(char *vect, char *side, char *trans, integer *m, +/* Subroutine */ void cunmbr_(char *vect, char *side, char *trans, integer *m, integer *n, integer *k, complex *a, integer *lda, complex *tau, complex *c__, integer *ldc, complex *work, integer *lwork, integer * info) @@ -728,11 +728,11 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int cunmlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmlq_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); logical notran; - extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); logical applyq; @@ -860,15 +860,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNMBR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } if (applyq) { @@ -937,7 +937,7 @@ f"> */ } } work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CUNMBR */ diff --git a/lapack-netlib/SRC/cunmhr.c b/lapack-netlib/SRC/cunmhr.c index 740e3e93e4..c106711142 100644 --- 
a/lapack-netlib/SRC/cunmhr.c +++ b/lapack-netlib/SRC/cunmhr.c @@ -693,7 +693,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cunmhr_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void cunmhr_(char *side, char *trans, integer *m, integer *n, integer *ilo, integer *ihi, complex *a, integer *lda, complex *tau, complex *c__, integer *ldc, complex *work, integer *lwork, integer * info) @@ -710,7 +710,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *); integer lwkopt; @@ -797,16 +797,16 @@ f"> */ if (*info != 0) { i__2 = -(*info); xerbla_("CUNMHR", &i__2, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || nh == 0) { work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } if (left) { @@ -825,7 +825,7 @@ f"> */ tau[*ilo], &c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &iinfo); work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CUNMHR */ diff --git a/lapack-netlib/SRC/cunml2.c b/lapack-netlib/SRC/cunml2.c index 816b466b74..9e7c39b336 100644 --- a/lapack-netlib/SRC/cunml2.c +++ b/lapack-netlib/SRC/cunml2.c @@ -668,7 +668,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cunml2_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void cunml2_(char *side, char *trans, integer *m, integer *n, integer *k, complex *a, integer *lda, complex *tau, 
complex *c__, integer *ldc, complex *work, integer *info) { @@ -680,12 +680,12 @@ f"> */ logical left; complex taui; integer i__; - extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex * + extern /* Subroutine */ void clarf_(char *, integer *, integer *, complex * , integer *, complex *, complex *, integer *, complex *); extern logical lsame_(char *, char *); integer i1, i2, i3, ic, jc, mi, ni, nq; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical notran; complex aii; @@ -741,13 +741,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNML2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && notran || ! left && ! notran) { @@ -812,7 +812,7 @@ f"> */ } /* L10: */ } - return 0; + return; /* End of CUNML2 */ diff --git a/lapack-netlib/SRC/cunmlq.c b/lapack-netlib/SRC/cunmlq.c index d67b20ea64..573d946281 100644 --- a/lapack-netlib/SRC/cunmlq.c +++ b/lapack-netlib/SRC/cunmlq.c @@ -683,7 +683,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cunmlq_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void cunmlq_(char *side, char *trans, integer *m, integer *n, integer *k, complex *a, integer *lda, complex *tau, complex *c__, integer *ldc, complex *work, integer *lwork, integer *info) { @@ -698,16 +698,17 @@ f"> */ integer i__; extern logical lsame_(char *, char *); integer nbmin, iinfo, i1, i2, i3; - extern /* Subroutine */ int cunml2_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunml2_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *); integer 
ib, ic, jc, nb, mi, ni; - extern /* Subroutine */ int clarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void clarfb_(char *, char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); integer nq, nw; - extern /* Subroutine */ int clarft_(char *, char *, integer *, integer *, - complex *, integer *, complex *, complex *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clarft_(char *, char *, integer *, integer *, + complex *, integer *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); logical notran; @@ -795,15 +796,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNMLQ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } /* Determine the block size */ @@ -895,7 +896,7 @@ f"> */ } } work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CUNMLQ */ diff --git a/lapack-netlib/SRC/cunmql.c b/lapack-netlib/SRC/cunmql.c index cb4df22254..1b6e0b2940 100644 --- a/lapack-netlib/SRC/cunmql.c +++ b/lapack-netlib/SRC/cunmql.c @@ -683,7 +683,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cunmql_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void cunmql_(char *side, char *trans, integer *m, integer *n, integer *k, complex *a, integer *lda, complex *tau, complex *c__, integer *ldc, complex *work, integer *lwork, integer *info) { @@ -698,16 +698,17 @@ f"> */ integer i__; extern logical lsame_(char *, char *); integer nbmin, iinfo, i1, i2, i3; - extern /* Subroutine */ int cunm2l_(char *, char *, integer *, integer *, + extern /* 
Subroutine */ void cunm2l_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *); integer ib, nb, mi, ni; - extern /* Subroutine */ int clarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void clarfb_(char *, char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); integer nq, nw; - extern /* Subroutine */ int clarft_(char *, char *, integer *, integer *, - complex *, integer *, complex *, complex *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clarft_(char *, char *, integer *, integer *, + complex *, integer *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); logical notran; @@ -793,15 +794,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNMQL", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Determine the block size */ @@ -883,7 +884,7 @@ f"> */ } } work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CUNMQL */ diff --git a/lapack-netlib/SRC/cunmqr.c b/lapack-netlib/SRC/cunmqr.c index 500ca0d7ae..c823aec11a 100644 --- a/lapack-netlib/SRC/cunmqr.c +++ b/lapack-netlib/SRC/cunmqr.c @@ -683,7 +683,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cunmqr_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void cunmqr_(char *side, char *trans, integer *m, integer *n, integer *k, complex *a, integer *lda, complex *tau, complex *c__, integer *ldc, complex *work, integer *lwork, integer *info) { @@ -698,16 +698,17 @@ f"> */ integer i__; extern logical 
lsame_(char *, char *); integer nbmin, iinfo, i1, i2, i3; - extern /* Subroutine */ int cunm2r_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunm2r_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *); integer ib, ic, jc, nb, mi, ni; - extern /* Subroutine */ int clarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void clarfb_(char *, char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); integer nq, nw; - extern /* Subroutine */ int clarft_(char *, char *, integer *, integer *, - complex *, integer *, complex *, complex *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clarft_(char *, char *, integer *, integer *, + complex *, integer *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); logical notran; @@ -789,16 +790,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNMQR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { work[1].r = 1.f, work[1].i = 0.f; - return 0; + return; } nbmin = 2; @@ -882,7 +883,7 @@ f"> */ } } work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CUNMQR */ diff --git a/lapack-netlib/SRC/cunmr2.c b/lapack-netlib/SRC/cunmr2.c index 6e0f8e84a5..dc9afb06ae 100644 --- a/lapack-netlib/SRC/cunmr2.c +++ b/lapack-netlib/SRC/cunmr2.c @@ -668,7 +668,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cunmr2_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void cunmr2_(char *side, char *trans, integer *m, integer *n, integer 
*k, complex *a, integer *lda, complex *tau, complex *c__, integer *ldc, complex *work, integer *info) { @@ -680,12 +680,12 @@ f"> */ logical left; complex taui; integer i__; - extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex * + extern /* Subroutine */ void clarf_(char *, integer *, integer *, complex * , integer *, complex *, complex *, integer *, complex *); extern logical lsame_(char *, char *); integer i1, i2, i3, mi, ni, nq; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical notran; complex aii; @@ -741,13 +741,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNMR2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && ! notran || ! left && notran) { @@ -804,7 +804,7 @@ f"> */ clacgv_(&i__3, &a[i__ + a_dim1], lda); /* L10: */ } - return 0; + return; /* End of CUNMR2 */ diff --git a/lapack-netlib/SRC/cunmr3.c b/lapack-netlib/SRC/cunmr3.c index a0e38ad76f..a6170360c9 100644 --- a/lapack-netlib/SRC/cunmr3.c +++ b/lapack-netlib/SRC/cunmr3.c @@ -687,7 +687,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cunmr3_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void cunmr3_(char *side, char *trans, integer *m, integer *n, integer *k, integer *l, complex *a, integer *lda, complex *tau, complex *c__, integer *ldc, complex *work, integer *info) { @@ -700,7 +700,7 @@ f"> */ complex taui; integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int clarz_(char *, integer *, integer *, integer * + extern /* Subroutine */ void clarz_(char *, integer *, integer *, integer * , complex *, integer *, complex *, complex *, integer *, complex 
* ); integer i1, i2, i3, ja, ic, jc, mi, ni, nq; @@ -761,13 +761,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNMR3", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && ! notran || ! left && notran) { @@ -822,7 +822,7 @@ f"> */ /* L10: */ } - return 0; + return; /* End of CUNMR3 */ diff --git a/lapack-netlib/SRC/cunmrq.c b/lapack-netlib/SRC/cunmrq.c index 40fa5baaf2..9722422321 100644 --- a/lapack-netlib/SRC/cunmrq.c +++ b/lapack-netlib/SRC/cunmrq.c @@ -683,7 +683,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cunmrq_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void cunmrq_(char *side, char *trans, integer *m, integer *n, integer *k, complex *a, integer *lda, complex *tau, complex *c__, integer *ldc, complex *work, integer *lwork, integer *info) { @@ -698,16 +698,17 @@ f"> */ integer i__; extern logical lsame_(char *, char *); integer nbmin, iinfo, i1, i2, i3; - extern /* Subroutine */ int cunmr2_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmr2_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *); integer ib, nb, mi, ni; - extern /* Subroutine */ int clarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void clarfb_(char *, char *, char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); integer nq, nw; - extern /* Subroutine */ int clarft_(char *, char *, integer *, integer *, - complex *, integer *, complex *, complex *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void clarft_(char *, char *, integer *, integer *, + complex *, integer *, complex *, complex *, integer *); + extern int xerbla_(char *, integer 
*, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); logical notran; @@ -795,15 +796,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNMRQ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } nbmin = 2; @@ -889,7 +890,7 @@ f"> */ } } work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CUNMRQ */ diff --git a/lapack-netlib/SRC/cunmrz.c b/lapack-netlib/SRC/cunmrz.c index 2e8c13a984..fd800099b8 100644 --- a/lapack-netlib/SRC/cunmrz.c +++ b/lapack-netlib/SRC/cunmrz.c @@ -702,7 +702,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int cunmrz_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void cunmrz_(char *side, char *trans, integer *m, integer *n, integer *k, integer *l, complex *a, integer *lda, complex *tau, complex *c__, integer *ldc, complex *work, integer *lwork, integer * info) @@ -718,16 +718,17 @@ f"> */ integer i__; extern logical lsame_(char *, char *); integer nbmin, iinfo, i1, i2, i3; - extern /* Subroutine */ int cunmr3_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmr3_(char *, char *, integer *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *); integer ib, ic, ja, jc, nb, mi, ni, nq, nw; - extern /* Subroutine */ int clarzb_(char *, char *, char *, char *, + extern /* Subroutine */ void clarzb_(char *, char *, char *, char *, integer *, integer *, integer *, integer *, complex *, integer *, complex *, integer *, complex *, integer *, complex *, integer *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), clarzt_( + extern 
/* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void clarzt_( char *, char *, integer *, integer *, complex *, integer *, complex *, complex *, integer *); logical notran; @@ -817,15 +818,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUNMRZ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Determine the block size. */ @@ -930,7 +931,7 @@ f"> */ work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CUNMRZ */ diff --git a/lapack-netlib/SRC/cunmtr.c b/lapack-netlib/SRC/cunmtr.c index 2fe181f6c9..d741e1d244 100644 --- a/lapack-netlib/SRC/cunmtr.c +++ b/lapack-netlib/SRC/cunmtr.c @@ -686,7 +686,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cunmtr_(char *side, char *uplo, char *trans, integer *m, +/* Subroutine */ void cunmtr_(char *side, char *uplo, char *trans, integer *m, integer *n, complex *a, integer *lda, complex *tau, complex *c__, integer *ldc, complex *work, integer *lwork, integer *info) { @@ -704,7 +704,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int cunmql_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cunmql_(char *, char *, integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *, complex *, integer *, integer *), cunmqr_(char *, char *, integer *, integer *, integer *, complex *, integer *, @@ -817,16 +817,16 @@ f"> */ if (*info != 0) { i__2 = -(*info); xerbla_("CUNMTR", &i__2, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || nq == 1) { work[1].r = 1.f, work[1].i = 0.f; - 
return 0; + return; } if (left) { @@ -860,7 +860,7 @@ f"> */ c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &iinfo); } work[1].r = (real) lwkopt, work[1].i = 0.f; - return 0; + return; /* End of CUNMTR */ diff --git a/lapack-netlib/SRC/cupgtr.c b/lapack-netlib/SRC/cupgtr.c index e61c2dec0f..208d309915 100644 --- a/lapack-netlib/SRC/cupgtr.c +++ b/lapack-netlib/SRC/cupgtr.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cupgtr_(char *uplo, integer *n, complex *ap, complex * +/* Subroutine */ void cupgtr_(char *uplo, integer *n, complex *ap, complex * tau, complex *q, integer *ldq, complex *work, integer *info) { /* System generated locals */ @@ -634,7 +634,7 @@ f"> */ extern logical lsame_(char *, char *); integer iinfo; logical upper; - extern /* Subroutine */ int cung2l_(integer *, integer *, integer *, + extern /* Subroutine */ void cung2l_(integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *), cung2r_( integer *, integer *, integer *, complex *, integer *, complex *, complex *, integer *); @@ -674,13 +674,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUPGTR", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -767,7 +767,7 @@ f"> */ &work[1], &iinfo); } } - return 0; + return; /* End of CUPGTR */ diff --git a/lapack-netlib/SRC/cupmtr.c b/lapack-netlib/SRC/cupmtr.c index a66b4fff59..6fc539768e 100644 --- a/lapack-netlib/SRC/cupmtr.c +++ b/lapack-netlib/SRC/cupmtr.c @@ -662,7 +662,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int cupmtr_(char *side, char *uplo, char *trans, integer *m, +/* Subroutine */ void cupmtr_(char *side, char *uplo, char *trans, integer *m, integer *n, complex *ap, complex *tau, complex *c__, integer 
*ldc, complex *work, integer *info) { @@ -674,7 +674,7 @@ f"> */ logical left; complex taui; integer i__; - extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex * + extern /* Subroutine */ void clarf_(char *, integer *, integer *, complex * , integer *, complex *, complex *, integer *, complex *); extern logical lsame_(char *, char *); integer i1; @@ -733,13 +733,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("CUPMTR", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } if (upper) { @@ -875,7 +875,7 @@ f"> */ /* L20: */ } } - return 0; + return; /* End of CUPMTR */ diff --git a/lapack-netlib/SRC/dbbcsd.c b/lapack-netlib/SRC/dbbcsd.c index c9e56314eb..efc6969250 100644 --- a/lapack-netlib/SRC/dbbcsd.c +++ b/lapack-netlib/SRC/dbbcsd.c @@ -844,7 +844,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dbbcsd_(char *jobu1, char *jobu2, char *jobv1t, char * +/* Subroutine */ void dbbcsd_(char *jobu1, char *jobu2, char *jobv1t, char * jobv2t, char *trans, integer *m, integer *p, integer *q, doublereal * theta, doublereal *phi, doublereal *u1, integer *ldu1, doublereal *u2, integer *ldu2, doublereal *v1t, integer *ldv1t, doublereal *v2t, @@ -864,14 +864,14 @@ f"> */ logical colmajor; doublereal thetamin, thetamax; logical restart11, restart12, restart21, restart22; - extern /* Subroutine */ int dlas2_(doublereal *, doublereal *, doublereal + extern /* Subroutine */ void dlas2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); integer lworkmin, iu1cs, iu2cs, iu1sn, iu2sn, lworkopt, i__, j; doublereal r__; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dlasr_(char *, char 
*, char *, integer *, + extern /* Subroutine */ void dlasr_(char *, char *, char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *), dswap_(integer *, doublereal *, integer * , doublereal *, integer *); integer maxit; @@ -883,12 +883,12 @@ f"> */ doublereal mu, nu, sigma11, sigma21; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal thresh, tolmul; - extern /* Subroutine */ int mecago_(); + extern /* Subroutine */ void mecago_(); logical lquery; doublereal b11bulge; logical wantv1t, wantv2t; doublereal b12bulge, b21bulge, b22bulge, eps, tol; - extern /* Subroutine */ int dlartgp_(doublereal *, doublereal *, + extern /* Subroutine */ void dlartgp_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *), dlartgs_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); @@ -962,7 +962,7 @@ f"> */ if (*info == 0 && *q == 0) { lworkmin = 1; work[1] = (doublereal) lworkmin; - return 0; + return; } /* Compute workspace */ @@ -987,9 +987,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DBBCSD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Get machine constants */ @@ -1081,7 +1081,7 @@ f"> */ ++(*info); } } - return 0; + return; } iter = iter + imax - imin; @@ -1792,7 +1792,7 @@ f"> */ } - return 0; + return; /* End of DBBCSD */ diff --git a/lapack-netlib/SRC/dbdsdc.c b/lapack-netlib/SRC/dbdsdc.c index ac79c8e656..5c361289a6 100644 --- a/lapack-netlib/SRC/dbdsdc.c +++ b/lapack-netlib/SRC/dbdsdc.c @@ -722,7 +722,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dbdsdc_(char *uplo, char *compq, integer *n, doublereal * +/* Subroutine */ void dbdsdc_(char *uplo, char *compq, integer *n, doublereal * d__, doublereal *e, doublereal *u, integer *ldu, doublereal *vt, integer *ldvt, doublereal *q, integer *iq, doublereal *work, integer * iwork, integer 
*info) @@ -736,18 +736,18 @@ f"> */ doublereal p, r__; integer z__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dlasr_(char *, char *, char *, integer *, + extern /* Subroutine */ void dlasr_(char *, char *, char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *), dcopy_(integer *, doublereal *, integer * , doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); integer poles, iuplo, nsize, start; - extern /* Subroutine */ int dlasd0_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dlasd0_(integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, doublereal *, integer *); integer ic, ii, kk; doublereal cs; extern doublereal dlamch_(char *); - extern /* Subroutine */ int dlasda_(integer *, integer *, integer *, + extern /* Subroutine */ void dlasda_(integer *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, integer *, integer *, @@ -755,7 +755,7 @@ f"> */ integer *); integer is, iu; doublereal sn; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dlasdq_(char *, integer *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, @@ -837,13 +837,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DBDSDC", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } smlsiz = ilaenv_(&c__9, "DBDSDC", " ", &c__0, &c__0, &c__0, &c__0, ( ftnlen)6, (ftnlen)1); @@ -856,7 +856,7 @@ f"> */ vt[vt_dim1 + 1] = 1.; } d__[1] = abs(d__[1]); - return 0; + return; } nm1 = *n - 1; @@ -935,7 +935,7 @@ f"> */ orgnrm = dlanst_("M", n, 
&d__[1], &e[1]); if (orgnrm == 0.) { - return 0; + return; } dlascl_("G", &c__0, &c__0, &orgnrm, &c_b15, n, &c__1, &d__[1], n, &ierr); dlascl_("G", &c__0, &c__0, &orgnrm, &c_b15, &nm1, &c__1, &e[1], &nm1, & @@ -1026,7 +1026,7 @@ f"> */ iwork[1], info); } if (*info != 0) { - return 0; + return; } start = i__ + 1; } @@ -1086,7 +1086,7 @@ f"> */ dlasr_("L", "V", "B", n, n, &work[1], &work[*n], &u[u_offset], ldu); } - return 0; + return; /* End of DBDSDC */ diff --git a/lapack-netlib/SRC/dbdsqr.c b/lapack-netlib/SRC/dbdsqr.c index 61b8200f68..a79cf8585a 100644 --- a/lapack-netlib/SRC/dbdsqr.c +++ b/lapack-netlib/SRC/dbdsqr.c @@ -756,7 +756,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dbdsqr_(char *uplo, integer *n, integer *ncvt, integer * +/* Subroutine */ void dbdsqr_(char *uplo, integer *n, integer *ncvt, integer * nru, integer *ncc, doublereal *d__, doublereal *e, doublereal *vt, integer *ldvt, doublereal *u, integer *ldu, doublereal *c__, integer * ldc, doublereal *work, integer *info) @@ -774,28 +774,28 @@ f"> */ doublereal cosl; integer isub, iter; doublereal unfl, sinl, cosr, smin, smax, sinr; - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer iterdivn; - extern /* Subroutine */ int dlas2_(doublereal *, doublereal *, doublereal + extern /* Subroutine */ void dlas2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal f, g, h__; integer i__, j, m; doublereal r__; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); doublereal oldcs; - extern /* Subroutine */ int dlasr_(char *, char *, char *, integer *, + extern /* Subroutine */ 
void dlasr_(char *, char *, char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *); integer oldll; doublereal shift, sigmn, oldsn; - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); doublereal sminl, sigmx; logical lower; integer maxitdivn; - extern /* Subroutine */ int dlasq1_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlasq1_(integer *, doublereal *, doublereal *, doublereal *, integer *), dlasv2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); @@ -803,9 +803,9 @@ f"> */ integer ll; extern doublereal dlamch_(char *); doublereal sn, mu; - extern /* Subroutine */ int dlartg_(doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void dlartg_(doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *); + extern int xerbla_(char *, integer *, ftnlen); doublereal sminoa, thresh; logical rotate; integer nm1; @@ -862,10 +862,10 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DBDSQR", &i__1, (ftnlen)6); - return 0; + return; } if (*n == 0) { - return 0; + return; } if (*n == 1) { goto L160; @@ -883,7 +883,7 @@ f"> */ /* If INFO equals 2, dqds didn't finish, try to finish */ if (*info != 2) { - return 0; + return; } *info = 0; } @@ -1513,7 +1513,7 @@ f"> */ /* L210: */ } L220: - return 0; + return; /* End of DBDSQR */ diff --git a/lapack-netlib/SRC/dbdsvdx.c b/lapack-netlib/SRC/dbdsvdx.c index d4c00d510e..5ed8353bad 100644 --- a/lapack-netlib/SRC/dbdsvdx.c +++ b/lapack-netlib/SRC/dbdsvdx.c @@ -742,7 +742,7 @@ static integer c__2 = 2; /* > \ingroup doubleOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int dbdsvdx_(char *uplo, char *jobz, char *range, integer *n, 
+/* Subroutine */ void dbdsvdx_(char *uplo, char *jobz, char *range, integer *n, doublereal *d__, doublereal *e, doublereal *vl, doublereal *vu, integer *il, integer *iu, integer *ns, doublereal *s, doublereal *z__, integer *ldz, doublereal *work, integer *iwork, integer *info) @@ -762,23 +762,23 @@ static integer c__2 = 2; integer i__, idbeg, j, k; doublereal sqrt2; integer idend; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); integer isbeg; extern logical lsame_(char *, char *); integer idtgk, ietgk, iltgk, itemp; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer icolz; logical allsv; integer idptr; logical indsv; integer ieptr, iutgk; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); doublereal vltgk; logical lower; - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); doublereal zjtji; logical split, valsv; @@ -791,12 +791,12 @@ static integer c__2 = 2; integer iifail; doublereal mu; extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, - doublereal *, doublereal *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, + doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal abstol, thresh; integer iiwork; - extern /* Subroutine */ int dstevx_(char *, char *, integer *, doublereal + extern /* Subroutine */ void dstevx_(char *, char *, integer *, 
doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, integer *), @@ -868,14 +868,14 @@ static integer c__2 = 2; if (*info != 0) { i__1 = -(*info); xerbla_("DBDSVDX", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible (N.LE.1) */ *ns = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -892,7 +892,7 @@ static integer c__2 = 2; z__[z_dim1 + 1] = d_sign(&c_b10, &d__[1]); z__[z_dim1 + 2] = 1.; } - return 0; + return; } abstol = dlamch_("Safe Minimum") * 2; @@ -1007,7 +1007,7 @@ static integer c__2 = 2; iltgk, &iltgk, &abstol, ns, &s[1], &z__[z_offset], ldz, &work[ itemp], &iwork[iiwork], &iwork[iifail], info); if (*ns == 0) { - return 0; + return; } else { if (wantz) { i__1 = *n << 1; @@ -1208,7 +1208,7 @@ static integer c__2 = 2; , &iwork[iifail], info); if (*info != 0) { /* Exit with the error code from DSTEVX. */ - return 0; + return; } emin = (d__1 = s[isbeg], abs(d__1)); i__3 = isbeg + nsl - 1; @@ -1262,7 +1262,7 @@ static integer c__2 = 2; z_dim1], &c__2); if (nrmu == 0.) { *info = (*n << 1) + 1; - return 0; + return; } d__1 = 1. / nrmu; dscal_(&nru, &d__1, &z__[irowu + (icolz + i__) * @@ -1293,7 +1293,7 @@ static integer c__2 = 2; z_dim1], &c__2); if (nrmv == 0.) { *info = (*n << 1) + 1; - return 0; + return; } d__1 = -1. 
/ nrmv; dscal_(&nrv, &d__1, &z__[irowv + (icolz + i__) * @@ -1465,7 +1465,7 @@ static integer c__2 = 2; } } - return 0; + return; /* End of DBDSVDX */ diff --git a/lapack-netlib/SRC/dcombssq.c b/lapack-netlib/SRC/dcombssq.c index b07c400027..d3bb7ac7f7 100644 --- a/lapack-netlib/SRC/dcombssq.c +++ b/lapack-netlib/SRC/dcombssq.c @@ -568,7 +568,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dcombssq_(doublereal *v1, doublereal *v2) +/* Subroutine */ void dcombssq_(doublereal *v1, doublereal *v2) { /* System generated locals */ doublereal d__1; @@ -602,7 +602,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ v1[2] = v2[2] + d__1 * d__1 * v1[2]; v1[1] = v2[1]; } - return 0; + return; /* End of DCOMBSSQ */ diff --git a/lapack-netlib/SRC/ddisna.c b/lapack-netlib/SRC/ddisna.c index ce62c69cbd..c656acc67c 100644 --- a/lapack-netlib/SRC/ddisna.c +++ b/lapack-netlib/SRC/ddisna.c @@ -626,7 +626,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ddisna_(char *job, integer *m, integer *n, doublereal * +/* Subroutine */ void ddisna_(char *job, integer *m, integer *n, doublereal * d__, doublereal *sep, integer *info) { /* System generated locals */ @@ -706,13 +706,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DDISNA", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (k == 0) { - return 0; + return; } /* Compute reciprocal condition numbers */ @@ -767,7 +767,7 @@ f"> */ /* L30: */ } - return 0; + return; /* End of DDISNA */ diff --git a/lapack-netlib/SRC/dgbbrd.c b/lapack-netlib/SRC/dgbbrd.c index 2f8268639f..e4375607f6 100644 --- a/lapack-netlib/SRC/dgbbrd.c +++ b/lapack-netlib/SRC/dgbbrd.c @@ -701,7 +701,7 @@ f"> */ /* > \ingroup 
doubleGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgbbrd_(char *vect, integer *m, integer *n, integer *ncc, +/* Subroutine */ void dgbbrd_(char *vect, integer *m, integer *n, integer *ncc, integer *kl, integer *ku, doublereal *ab, integer *ldab, doublereal * d__, doublereal *e, doublereal *q, integer *ldq, doublereal *pt, integer *ldpt, doublereal *c__, integer *ldc, doublereal *work, @@ -713,7 +713,7 @@ f"> */ /* Local variables */ integer inca; - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer i__, j, l; extern logical lsame_(char *, char *); @@ -726,10 +726,12 @@ f"> */ doublereal rc; integer ml, mn, nr, mu; doublereal rs; - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *), dlartg_(doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *), xerbla_(char *, integer *, ftnlen), dlargv_( + doublereal *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dlargv_( integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *), dlartv_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, @@ -798,7 +800,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGBBRD", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize Q and P**T to the unit matrix, if needed */ @@ -813,7 +815,7 @@ f"> */ /* Quick return if possible. 
*/ if (*m == 0 || *n == 0) { - return 0; + return; } minmn = f2cmin(*m,*n); @@ -1149,7 +1151,7 @@ f"> */ /* L150: */ } } - return 0; + return; /* End of DGBBRD */ diff --git a/lapack-netlib/SRC/dgbcon.c b/lapack-netlib/SRC/dgbcon.c index 7c86f1376b..19d0c3938f 100644 --- a/lapack-netlib/SRC/dgbcon.c +++ b/lapack-netlib/SRC/dgbcon.c @@ -658,7 +658,7 @@ f"> */ /* > \ingroup doubleGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgbcon_(char *norm, integer *n, integer *kl, integer *ku, +/* Subroutine */ void dgbcon_(char *norm, integer *n, integer *kl, integer *ku, doublereal *ab, integer *ldab, integer *ipiv, doublereal *anorm, doublereal *rcond, doublereal *work, integer *iwork, integer *info) { @@ -674,10 +674,10 @@ f"> */ doublereal t, scale; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int drscl_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void drscl_(integer *, doublereal *, doublereal *, integer *); logical lnoti; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); @@ -685,9 +685,10 @@ f"> */ extern doublereal dlamch_(char *); integer lm, jp, ix; extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int dlatbs_(char *, char *, char *, char *, + extern /* Subroutine */ void dlatbs_(char *, char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, - doublereal *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; logical onenrm; char normin[1]; @@ -732,7 +733,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGBCON", &i__1, 
(ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -740,9 +741,9 @@ f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm == 0.) { - return 0; + return; } smlnum = dlamch_("Safe minimum"); @@ -841,7 +842,7 @@ f"> */ } L40: - return 0; + return; /* End of DGBCON */ diff --git a/lapack-netlib/SRC/dgbequ.c b/lapack-netlib/SRC/dgbequ.c index 4dc01b2715..5801abd8a1 100644 --- a/lapack-netlib/SRC/dgbequ.c +++ b/lapack-netlib/SRC/dgbequ.c @@ -661,7 +661,7 @@ f"> */ /* > \ingroup doubleGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgbequ_(integer *m, integer *n, integer *kl, integer *ku, +/* Subroutine */ void dgbequ_(integer *m, integer *n, integer *kl, integer *ku, doublereal *ab, integer *ldab, doublereal *r__, doublereal *c__, doublereal *rowcnd, doublereal *colcnd, doublereal *amax, integer * info) @@ -713,7 +713,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGBEQU", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -722,7 +722,7 @@ f"> */ *rowcnd = 1.; *colcnd = 1.; *amax = 0.; - return 0; + return; } /* Get machine constants. */ @@ -782,7 +782,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { if (r__[i__] == 0.) { *info = i__; - return 0; + return; } /* L50: */ } @@ -857,7 +857,7 @@ f"> */ for (j = 1; j <= i__1; ++j) { if (c__[j] == 0.) 
{ *info = *m + j; - return 0; + return; } /* L110: */ } @@ -880,7 +880,7 @@ f"> */ *colcnd = f2cmax(rcmin,smlnum) / f2cmin(rcmax,bignum); } - return 0; + return; /* End of DGBEQU */ diff --git a/lapack-netlib/SRC/dgbequb.c b/lapack-netlib/SRC/dgbequb.c index 9273800988..f7f8adca55 100644 --- a/lapack-netlib/SRC/dgbequb.c +++ b/lapack-netlib/SRC/dgbequb.c @@ -668,7 +668,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doubleGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgbequb_(integer *m, integer *n, integer *kl, integer * +/* Subroutine */ void dgbequb_(integer *m, integer *n, integer *kl, integer * ku, doublereal *ab, integer *ldab, doublereal *r__, doublereal *c__, doublereal *rowcnd, doublereal *colcnd, doublereal *amax, integer * info) @@ -720,7 +720,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("DGBEQUB", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible. */ @@ -729,7 +729,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rowcnd = 1.; *colcnd = 1.; *amax = 0.; - return 0; + return; } /* Get machine constants. Assume SMLNUM is a power of the radix. */ @@ -798,7 +798,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (i__ = 1; i__ <= i__1; ++i__) { if (r__[i__] == 0.) { *info = i__; - return 0; + return; } /* L50: */ } @@ -876,7 +876,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (j = 1; j <= i__1; ++j) { if (c__[j] == 0.) 
{ *info = *m + j; - return 0; + return; } /* L110: */ } @@ -899,7 +899,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *colcnd = f2cmax(rcmin,smlnum) / f2cmin(rcmax,bignum); } - return 0; + return; /* End of DGBEQUB */ diff --git a/lapack-netlib/SRC/dgbrfs.c b/lapack-netlib/SRC/dgbrfs.c index 716991e4b9..6adfe7428b 100644 --- a/lapack-netlib/SRC/dgbrfs.c +++ b/lapack-netlib/SRC/dgbrfs.c @@ -718,7 +718,7 @@ f"> */ /* > \ingroup doubleGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgbrfs_(char *trans, integer *n, integer *kl, integer * +/* Subroutine */ void dgbrfs_(char *trans, integer *n, integer *kl, integer * ku, integer *nrhs, doublereal *ab, integer *ldab, doublereal *afb, integer *ldafb, integer *ipiv, doublereal *b, integer *ldb, doublereal *x, integer *ldx, doublereal *ferr, doublereal *berr, @@ -734,23 +734,24 @@ f"> */ doublereal safe1, safe2; integer i__, j, k; doublereal s; - extern /* Subroutine */ int dgbmv_(char *, integer *, integer *, integer * + extern /* Subroutine */ void dgbmv_(char *, integer *, integer *, integer * , integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer count; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer kk; extern doublereal dlamch_(char *); doublereal xk; integer nz; doublereal safmin; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dgbtrs_( + extern /* Subroutine 
*/ int xerbla_(char *, integer *, ftnlen); + extern void dgbtrs_( char *, integer *, integer *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); logical notran; @@ -814,7 +815,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGBRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -826,7 +827,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } if (notran) { @@ -1035,7 +1036,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of DGBRFS */ diff --git a/lapack-netlib/SRC/dgbrfsx.c b/lapack-netlib/SRC/dgbrfsx.c index f4aa619d45..e088a7892a 100644 --- a/lapack-netlib/SRC/dgbrfsx.c +++ b/lapack-netlib/SRC/dgbrfsx.c @@ -848,7 +848,7 @@ static integer c__1 = 1; /* > \ingroup doubleGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgbrfsx_(char *trans, char *equed, integer *n, integer * +/* Subroutine */ void dgbrfsx_(char *trans, char *equed, integer *n, integer * kl, integer *ku, integer *nrhs, doublereal *ab, integer *ldab, doublereal *afb, integer *ldafb, integer *ipiv, doublereal *r__, doublereal *c__, doublereal *b, integer *ldb, doublereal *x, integer * @@ -875,10 +875,10 @@ static integer c__1 = 1; integer prec_type__; extern doublereal dlamch_(char *), dlangb_(char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dgbcon_(char *, integer *, integer *, integer + extern /* Subroutine */ void dgbcon_(char *, integer *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, doublereal *, - doublereal *, integer *, integer *), xerbla_(char *, - integer *, ftnlen); + doublereal *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical colequ, notran, rowequ; integer trans_type__; extern doublereal dla_gbrcond_(char *, integer *, integer *, integer *, @@ -887,7 +887,7 @@ static integer c__1 = 1; extern integer 
ilaprec_(char *); integer ithresh, n_norms__; doublereal rthresh, cwise_wrong__; - extern /* Subroutine */ int dla_gbrfsx_extended_(integer *, integer *, + extern /* Subroutine */ void dla_gbrfsx_extended_(integer *, integer *, integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, logical *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, @@ -1010,7 +1010,7 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("DGBRFSX", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible. */ @@ -1033,7 +1033,7 @@ static integer c__1 = 1; err_bnds_comp__[j + err_bnds_comp_dim1 * 3] = 1.; } } - return 0; + return; } /* Default to failure. */ @@ -1194,7 +1194,7 @@ static integer c__1 = 1; } } - return 0; + return; /* End of DGBRFSX */ diff --git a/lapack-netlib/SRC/dgbsv.c b/lapack-netlib/SRC/dgbsv.c index 3ee2d6d822..80d3abdc7e 100644 --- a/lapack-netlib/SRC/dgbsv.c +++ b/lapack-netlib/SRC/dgbsv.c @@ -672,7 +672,7 @@ e driver) */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgbsv_(integer *n, integer *kl, integer *ku, integer * +/* Subroutine */ void dgbsv_(integer *n, integer *kl, integer *ku, integer * nrhs, doublereal *ab, integer *ldab, integer *ipiv, doublereal *b, integer *ldb, integer *info) { @@ -680,9 +680,10 @@ e driver) */ integer ab_dim1, ab_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int dgbtrf_(integer *, integer *, integer *, - integer *, doublereal *, integer *, integer *, integer *), - xerbla_(char *, integer *, ftnlen), dgbtrs_(char *, integer *, + extern /* Subroutine */ void dgbtrf_(integer *, integer *, integer *, + integer *, doublereal *, integer *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dgbtrs_(char *, integer *, integer *, integer *, integer *, doublereal *, integer *, integer *, 
doublereal *, integer *, integer *); @@ -725,7 +726,7 @@ e driver) */ if (*info != 0) { i__1 = -(*info); xerbla_("DGBSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the LU factorization of the band matrix A. */ @@ -738,7 +739,7 @@ e driver) */ dgbtrs_("No transpose", n, kl, ku, nrhs, &ab[ab_offset], ldab, &ipiv[ 1], &b[b_offset], ldb, info); } - return 0; + return; /* End of DGBSV */ diff --git a/lapack-netlib/SRC/dgbsvx.c b/lapack-netlib/SRC/dgbsvx.c index a9e598ffd1..d3bcb47376 100644 --- a/lapack-netlib/SRC/dgbsvx.c +++ b/lapack-netlib/SRC/dgbsvx.c @@ -880,7 +880,7 @@ f"> */ /* > \ingroup doubleGBsolve */ /* ===================================================================== */ -/* Subroutine */ int dgbsvx_(char *fact, char *trans, integer *n, integer *kl, +/* Subroutine */ void dgbsvx_(char *fact, char *trans, integer *n, integer *kl, integer *ku, integer *nrhs, doublereal *ab, integer *ldab, doublereal *afb, integer *ldafb, integer *ipiv, char *equed, doublereal *r__, doublereal *c__, doublereal *b, integer *ldb, @@ -898,13 +898,13 @@ f"> */ integer i__, j; extern logical lsame_(char *, char *); doublereal rcmin, rcmax, anorm; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); logical equil; integer j1, j2; extern doublereal dlamch_(char *), dlangb_(char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dlaqgb_(integer *, integer *, integer *, + extern /* Subroutine */ void dlaqgb_(integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, char *), dgbcon_(char *, integer *, integer *, integer *, doublereal *, @@ -913,7 +913,7 @@ f"> */ doublereal colcnd; extern doublereal dlantb_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ 
int dgbequ_(integer *, integer *, integer *, + extern /* Subroutine */ void dgbequ_(integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *), dgbrfs_( char *, integer *, integer *, integer *, integer *, doublereal *, @@ -923,11 +923,11 @@ f"> */ integer *, integer *, integer *, doublereal *, integer *, integer *, integer *); logical nofact; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal bignum; - extern /* Subroutine */ int dgbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void dgbtrs_(char *, integer *, integer *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); integer infequ; @@ -1064,7 +1064,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGBSVX", &i__1, (ftnlen)6); - return 0; + return; } if (equil) { @@ -1170,7 +1170,7 @@ f"> */ } work[1] = rpvgrw; *rcond = 0.; - return 0; + return; } } @@ -1254,7 +1254,7 @@ f"> */ } work[1] = rpvgrw; - return 0; + return; /* End of DGBSVX */ diff --git a/lapack-netlib/SRC/dgbsvxx.c b/lapack-netlib/SRC/dgbsvxx.c index 2f1ae1505b..2e130bf2cd 100644 --- a/lapack-netlib/SRC/dgbsvxx.c +++ b/lapack-netlib/SRC/dgbsvxx.c @@ -1065,7 +1065,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doubleGBsolve */ /* ===================================================================== */ -/* Subroutine */ int dgbsvxx_(char *fact, char *trans, integer *n, integer * +/* Subroutine */ void dgbsvxx_(char *fact, char *trans, integer *n, integer * kl, integer *ku, integer *nrhs, doublereal *ab, integer *ldab, doublereal *afb, integer *ldafb, 
integer *ipiv, char *equed, doublereal *r__, doublereal *c__, doublereal *b, integer *ldb, @@ -1089,18 +1089,18 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ doublereal rcmin, rcmax; logical equil; extern doublereal dlamch_(char *); - extern /* Subroutine */ int dlaqgb_(integer *, integer *, integer *, + extern /* Subroutine */ void dlaqgb_(integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, char *); doublereal colcnd; - extern /* Subroutine */ int dgbtrf_(integer *, integer *, integer *, + extern /* Subroutine */ void dgbtrf_(integer *, integer *, integer *, integer *, doublereal *, integer *, integer *, integer *); logical nofact; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal bignum; - extern /* Subroutine */ int dgbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void dgbtrs_(char *, integer *, integer *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); integer infequ; @@ -1109,7 +1109,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ logical notran; doublereal smlnum; logical rowequ; - extern /* Subroutine */ int dlascl2_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dlascl2_(integer *, integer *, doublereal *, doublereal *, integer *), dgbequb_(integer *, integer *, integer * , integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *), dgbrfsx_( @@ -1258,7 +1258,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 
= -(*info); xerbla_("DGBSVXX", &i__1, (ftnlen)7); - return 0; + return; } if (equil) { @@ -1332,7 +1332,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rpvgrw = dla_gbrpvgrw_(n, kl, ku, info, &ab[ab_offset], ldab, & afb[afb_offset], ldafb); - return 0; + return; } } @@ -1365,7 +1365,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ dlascl2_(n, nrhs, &r__[1], &x[x_offset], ldx); } - return 0; + return; /* End of DGBSVXX */ diff --git a/lapack-netlib/SRC/dgbtf2.c b/lapack-netlib/SRC/dgbtf2.c index e779512324..e171be8b37 100644 --- a/lapack-netlib/SRC/dgbtf2.c +++ b/lapack-netlib/SRC/dgbtf2.c @@ -660,7 +660,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgbtf2_(integer *m, integer *n, integer *kl, integer *ku, +/* Subroutine */ void dgbtf2_(integer *m, integer *n, integer *kl, integer *ku, doublereal *ab, integer *ldab, integer *ipiv, integer *info) { /* System generated locals */ @@ -668,11 +668,11 @@ f"> */ doublereal d__1; /* Local variables */ - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); integer km, jp, ju, kv; @@ -718,13 +718,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGBTF2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Gaussian elimination with partial pivoting */ @@ -814,7 +814,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of DGBTF2 */ diff --git a/lapack-netlib/SRC/dgbtrf.c 
b/lapack-netlib/SRC/dgbtrf.c index 600819436d..47b5a03656 100644 --- a/lapack-netlib/SRC/dgbtrf.c +++ b/lapack-netlib/SRC/dgbtrf.c @@ -660,7 +660,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgbtrf_(integer *m, integer *n, integer *kl, integer *ku, +/* Subroutine */ void dgbtrf_(integer *m, integer *n, integer *kl, integer *ku, doublereal *ab, integer *ldab, integer *ipiv, integer *info) { /* System generated locals */ @@ -668,12 +668,12 @@ f"> */ doublereal d__1; /* Local variables */ - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); doublereal temp; integer i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dgemm_(char *, char *, integer *, integer *, integer * , doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dcopy_( @@ -682,11 +682,11 @@ f"> */ ); doublereal work13[4160] /* was [65][64] */, work31[4160] /* was [65][64] */; - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer i2, i3, j2, j3, k2; - extern /* Subroutine */ int dgbtf2_(integer *, integer *, integer *, + extern /* Subroutine */ void dgbtf2_(integer *, integer *, integer *, integer *, doublereal *, integer *, integer *, integer *); integer jb, nb, ii, jj, jm, ip, jp, km, ju, kv; extern integer idamax_(integer *, doublereal *, integer *); @@ -736,13 +736,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGBTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible 
*/ if (*m == 0 || *n == 0) { - return 0; + return; } /* Determine the block size for this environment */ @@ -1137,7 +1137,7 @@ f"> */ } } - return 0; + return; /* End of DGBTRF */ diff --git a/lapack-netlib/SRC/dgbtrs.c b/lapack-netlib/SRC/dgbtrs.c index 305b19c0e9..57468ed013 100644 --- a/lapack-netlib/SRC/dgbtrs.c +++ b/lapack-netlib/SRC/dgbtrs.c @@ -652,7 +652,7 @@ f"> */ /* > \ingroup doubleGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgbtrs_(char *trans, integer *n, integer *kl, integer * +/* Subroutine */ void dgbtrs_(char *trans, integer *n, integer *kl, integer * ku, integer *nrhs, doublereal *ab, integer *ldab, integer *ipiv, doublereal *b, integer *ldb, integer *info) { @@ -660,12 +660,12 @@ f"> */ integer ab_dim1, ab_offset, b_dim1, b_offset, i__1, i__2, i__3; /* Local variables */ - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer i__, j, l; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *), dtbsv_(char *, @@ -719,13 +719,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGBTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } kd = *ku + *kl + 1; @@ -802,7 +802,7 @@ f"> */ } } } - return 0; + return; /* End of DGBTRS */ diff --git a/lapack-netlib/SRC/dgebak.c b/lapack-netlib/SRC/dgebak.c index d5e07e2076..f412bfe920 100644 --- a/lapack-netlib/SRC/dgebak.c +++ b/lapack-netlib/SRC/dgebak.c @@ -638,7 +638,7 @@ f"> */ /* > \ingroup 
doubleGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgebak_(char *job, char *side, integer *n, integer *ilo, +/* Subroutine */ void dgebak_(char *job, char *side, integer *n, integer *ilo, integer *ihi, doublereal *scale, integer *m, doublereal *v, integer * ldv, integer *info) { @@ -648,10 +648,10 @@ f"> */ /* Local variables */ integer i__, k; doublereal s; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); logical leftv; integer ii; @@ -700,19 +700,19 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGEBAK", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*m == 0) { - return 0; + return; } if (lsame_(job, "N")) { - return 0; + return; } if (*ilo == *ihi) { @@ -791,7 +791,7 @@ f"> */ } } - return 0; + return; /* End of DGEBAK */ diff --git a/lapack-netlib/SRC/dgebak.f b/lapack-netlib/SRC/dgebak.f index e978d7af29..9c086794a4 100644 --- a/lapack-netlib/SRC/dgebak.f +++ b/lapack-netlib/SRC/dgebak.f @@ -236,7 +236,7 @@ SUBROUTINE DGEBAK( JOB, SIDE, N, ILO, IHI, SCALE, M, V, LDV, $ GO TO 40 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL DSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -250,7 +250,7 @@ SUBROUTINE DGEBAK( JOB, SIDE, N, ILO, IHI, SCALE, M, V, LDV, $ GO TO 50 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 50 CALL DSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/dgebal.c b/lapack-netlib/SRC/dgebal.c index 370144c31d..869c3e7de2 100644 --- a/lapack-netlib/SRC/dgebal.c +++ 
b/lapack-netlib/SRC/dgebal.c @@ -673,7 +673,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgebal_(char *job, integer *n, doublereal *a, integer * +/* Subroutine */ void dgebal_(char *job, integer *n, doublereal *a, integer * lda, integer *ilo, integer *ihi, doublereal *scale, integer *info) { /* System generated locals */ @@ -686,10 +686,10 @@ f"> */ doublereal c__, f, g; integer i__, j, k, l, m; doublereal r__, s; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); doublereal sfmin1, sfmin2, sfmax1, sfmax2, ca, ra; extern doublereal dlamch_(char *); @@ -729,7 +729,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGEBAL", &i__1, (ftnlen)6); - return 0; + return; } k = 1; @@ -894,7 +894,7 @@ f"> */ *info = -3; i__2 = -(*info); xerbla_("DGEBAL", &i__2, (ftnlen)6); - return 0; + return; } f *= 2.; c__ *= 2.; @@ -956,7 +956,7 @@ f"> */ *ilo = k; *ihi = l; - return 0; + return; /* End of DGEBAL */ diff --git a/lapack-netlib/SRC/dgebd2.c b/lapack-netlib/SRC/dgebd2.c index 88dfd9a2fa..94496ba240 100644 --- a/lapack-netlib/SRC/dgebd2.c +++ b/lapack-netlib/SRC/dgebd2.c @@ -702,7 +702,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgebd2_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgebd2_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *d__, doublereal *e, doublereal *tauq, doublereal * taup, doublereal *work, integer *info) { @@ -711,10 +711,11 @@ f"> */ /* Local variables */ integer i__; - extern /* Subroutine */ int 
dlarf_(char *, integer *, integer *, + extern /* Subroutine */ void dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *), dlarfg_(integer *, doublereal *, - doublereal *, integer *, doublereal *), xerbla_(char *, integer *, ftnlen); + doublereal *, integer *, doublereal *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.1) -- */ @@ -750,7 +751,7 @@ f"> */ if (*info < 0) { i__1 = -(*info); xerbla_("DGEBD2", &i__1, (ftnlen)6); - return 0; + return; } if (*m >= *n) { @@ -861,7 +862,7 @@ f"> */ /* L20: */ } } - return 0; + return; /* End of DGEBD2 */ diff --git a/lapack-netlib/SRC/dgebrd.c b/lapack-netlib/SRC/dgebrd.c index 2ca2899598..1589355e4f 100644 --- a/lapack-netlib/SRC/dgebrd.c +++ b/lapack-netlib/SRC/dgebrd.c @@ -722,7 +722,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgebrd_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgebrd_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *d__, doublereal *e, doublereal *tauq, doublereal * taup, doublereal *work, integer *lwork, integer *info) { @@ -731,15 +731,15 @@ f"> */ /* Local variables */ integer i__, j; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer nbmin, iinfo, minmn; - extern /* Subroutine */ int dgebd2_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgebd2_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *); integer nb; - extern /* Subroutine */ int dlabrd_(integer *, integer *, integer *, + extern /* Subroutine */ void dlabrd_(integer *, integer *, 
integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer nx, ws; @@ -796,9 +796,9 @@ f"> */ if (*info < 0) { i__1 = -(*info); xerbla_("DGEBRD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -806,7 +806,7 @@ f"> */ minmn = f2cmin(*m,*n); if (minmn == 0) { work[1] = 1.; - return 0; + return; } ws = f2cmax(*m,*n); @@ -900,7 +900,7 @@ f"> */ dgebd2_(&i__2, &i__1, &a[i__ + i__ * a_dim1], lda, &d__[i__], &e[i__], & tauq[i__], &taup[i__], &work[1], &iinfo); work[1] = (doublereal) ws; - return 0; + return; /* End of DGEBRD */ diff --git a/lapack-netlib/SRC/dgecon.c b/lapack-netlib/SRC/dgecon.c index 77a991a067..f769191758 100644 --- a/lapack-netlib/SRC/dgecon.c +++ b/lapack-netlib/SRC/dgecon.c @@ -636,7 +636,7 @@ f"> */ /* > \ingroup doubleGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgecon_(char *norm, integer *n, doublereal *a, integer * +/* Subroutine */ void dgecon_(char *norm, integer *n, doublereal *a, integer * lda, doublereal *anorm, doublereal *rcond, doublereal *work, integer * iwork, integer *info) { @@ -649,7 +649,7 @@ f"> */ doublereal scale; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int drscl_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void drscl_(integer *, doublereal *, doublereal *, integer *), dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *); @@ -659,7 +659,7 @@ f"> */ doublereal su; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; - extern /* Subroutine */ int dlatrs_(char *, char *, char *, char *, + extern /* Subroutine */ void dlatrs_(char *, char *, char *, char *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, 
integer *); logical onenrm; @@ -700,7 +700,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGECON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -708,9 +708,9 @@ f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm == 0.) { - return 0; + return; } smlnum = dlamch_("Safe minimum"); @@ -774,7 +774,7 @@ f"> */ } L20: - return 0; + return; /* End of DGECON */ diff --git a/lapack-netlib/SRC/dgecon.f b/lapack-netlib/SRC/dgecon.f index aa10dee9a2..1ad302ae3f 100644 --- a/lapack-netlib/SRC/dgecon.f +++ b/lapack-netlib/SRC/dgecon.f @@ -106,6 +106,7 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value +*> =-5: if ANORM is NAN or negative. *> \endverbatim * * Authors: @@ -152,10 +153,10 @@ SUBROUTINE DGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, IWORK, INTEGER ISAVE( 3 ) * .. * .. External Functions .. - LOGICAL LSAME + LOGICAL LSAME, DISNAN INTEGER IDAMAX DOUBLE PRECISION DLAMCH - EXTERNAL LSAME, IDAMAX, DLAMCH + EXTERNAL LSAME, IDAMAX, DLAMCH, DISNAN * .. * .. External Subroutines .. EXTERNAL DLACN2, DLATRS, DRSCL, XERBLA @@ -175,7 +176,7 @@ SUBROUTINE DGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, IWORK, INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( ANORM.LT.ZERO ) THEN + ELSE IF( ANORM.LT.ZERO .OR. 
DISNAN( ANORM ) ) THEN INFO = -5 END IF IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/dgeequ.c b/lapack-netlib/SRC/dgeequ.c index 0561cdab14..90e98664f1 100644 --- a/lapack-netlib/SRC/dgeequ.c +++ b/lapack-netlib/SRC/dgeequ.c @@ -647,7 +647,7 @@ f"> */ /* > \ingroup doubleGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgeequ_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgeequ_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *r__, doublereal *c__, doublereal *rowcnd, doublereal *colcnd, doublereal *amax, integer *info) { @@ -693,7 +693,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGEEQU", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -702,7 +702,7 @@ f"> */ *rowcnd = 1.; *colcnd = 1.; *amax = 0.; - return 0; + return; } /* Get machine constants. */ @@ -756,7 +756,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { if (r__[i__] == 0.) { *info = i__; - return 0; + return; } /* L50: */ } @@ -826,7 +826,7 @@ f"> */ for (j = 1; j <= i__1; ++j) { if (c__[j] == 0.) 
{ *info = *m + j; - return 0; + return; } /* L110: */ } @@ -849,7 +849,7 @@ f"> */ *colcnd = f2cmax(rcmin,smlnum) / f2cmin(rcmax,bignum); } - return 0; + return; /* End of DGEEQU */ diff --git a/lapack-netlib/SRC/dgeequb.c b/lapack-netlib/SRC/dgeequb.c index d76d611f20..e798545406 100644 --- a/lapack-netlib/SRC/dgeequb.c +++ b/lapack-netlib/SRC/dgeequb.c @@ -654,7 +654,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doubleGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgeequb_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgeequb_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *r__, doublereal *c__, doublereal *rowcnd, doublereal *colcnd, doublereal *amax, integer *info) { @@ -700,7 +700,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("DGEEQUB", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible. */ @@ -709,7 +709,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rowcnd = 1.; *colcnd = 1.; *amax = 0.; - return 0; + return; } /* Get machine constants. Assume SMLNUM is a power of the radix. */ @@ -772,7 +772,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (i__ = 1; i__ <= i__1; ++i__) { if (r__[i__] == 0.) { *info = i__; - return 0; + return; } /* L50: */ } @@ -846,7 +846,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (j = 1; j <= i__1; ++j) { if (c__[j] == 0.) 
{ *info = *m + j; - return 0; + return; } /* L110: */ } @@ -869,7 +869,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *colcnd = f2cmax(rcmin,smlnum) / f2cmin(rcmax,bignum); } - return 0; + return; /* End of DGEEQUB */ diff --git a/lapack-netlib/SRC/dgees.c b/lapack-netlib/SRC/dgees.c index 636b450a5e..4c99462276 100644 --- a/lapack-netlib/SRC/dgees.c +++ b/lapack-netlib/SRC/dgees.c @@ -729,7 +729,7 @@ or GE matrices */ /* > \ingroup doubleGEeigen */ /* ===================================================================== */ -/* Subroutine */ int dgees_(char *jobvs, char *sort, L_fp select, integer *n, +/* Subroutine */ void dgees_(char *jobvs, char *sort, L_fp select, integer *n, doublereal *a, integer *lda, integer *sdim, doublereal *wr, doublereal *wi, doublereal *vs, integer *ldvs, doublereal *work, integer *lwork, logical *bwork, integer *info) @@ -744,12 +744,12 @@ or GE matrices */ doublereal s; integer icond, ieval; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); logical cursl; integer i1, i2; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *), dgebak_( + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *), dgebak_( char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *), dgebal_(char *, integer *, doublereal *, integer *, integer *, @@ -759,17 +759,17 @@ or GE matrices */ doublereal cscale; extern doublereal dlamch_(char *), dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dgehrd_(integer *, integer *, integer *, + extern /* Subroutine */ void dgehrd_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, 
integer *), dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); doublereal bignum; - extern /* Subroutine */ int dorghr_(integer *, integer *, integer *, + extern /* Subroutine */ void dorghr_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dhseqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, @@ -874,16 +874,16 @@ or GE matrices */ if (*info != 0) { i__1 = -(*info); xerbla_("DGEES ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *sdim = 0; - return 0; + return; } /* Get machine constants */ @@ -1118,7 +1118,7 @@ or GE matrices */ } work[1] = (doublereal) maxwrk; - return 0; + return; /* End of DGEES */ diff --git a/lapack-netlib/SRC/dgees.f b/lapack-netlib/SRC/dgees.f index 82b9d6ee44..24739b1cf7 100644 --- a/lapack-netlib/SRC/dgees.f +++ b/lapack-netlib/SRC/dgees.f @@ -302,7 +302,7 @@ SUBROUTINE DGEES( JOBVS, SORT, SELECT, N, A, LDA, SDIM, WR, WI, * CALL DHSEQR( 'S', JOBVS, N, 1, N, A, LDA, WR, WI, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = WORK( 1 ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, N + HSWORK ) diff --git a/lapack-netlib/SRC/dgeesx.c b/lapack-netlib/SRC/dgeesx.c index 3c6b2792c3..1394a79530 100644 --- a/lapack-netlib/SRC/dgeesx.c +++ b/lapack-netlib/SRC/dgeesx.c @@ -793,7 +793,7 @@ f"> */ /* > \ingroup doubleGEeigen */ /* ===================================================================== */ -/* Subroutine */ int 
dgeesx_(char *jobvs, char *sort, L_fp select, char * +/* Subroutine */ void dgeesx_(char *jobvs, char *sort, L_fp select, char * sense, integer *n, doublereal *a, integer *lda, integer *sdim, doublereal *wr, doublereal *wi, doublereal *vs, integer *ldvs, doublereal *rconde, doublereal *rcondv, doublereal *work, integer * @@ -807,12 +807,12 @@ f"> */ doublereal anrm; integer ierr, itau, iwrk, lwrk, inxt, i__, icond, ieval; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); logical cursl; integer liwrk, i1, i2; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *), dgebak_( + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *), dgebak_( char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *), dgebal_(char *, integer *, doublereal *, integer *, integer *, @@ -822,23 +822,23 @@ f"> */ doublereal cscale; extern doublereal dlamch_(char *), dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dgehrd_(integer *, integer *, integer *, + extern /* Subroutine */ void dgehrd_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); doublereal bignum; - extern /* Subroutine */ int dorghr_(integer *, integer *, 
integer *, + extern /* Subroutine */ void dorghr_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dhseqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); logical wantsb; - extern /* Subroutine */ int dtrsen_(char *, char *, logical *, integer *, + extern /* Subroutine */ void dtrsen_(char *, char *, logical *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *, integer *, integer *); @@ -966,16 +966,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGEESX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *sdim = 0; - return 0; + return; } /* Get machine constants */ @@ -1240,7 +1240,7 @@ f"> */ iwork[1] = 1; } - return 0; + return; /* End of DGEESX */ diff --git a/lapack-netlib/SRC/dgeesx.f b/lapack-netlib/SRC/dgeesx.f index 08fbb6468e..f3677fcb30 100644 --- a/lapack-netlib/SRC/dgeesx.f +++ b/lapack-netlib/SRC/dgeesx.f @@ -382,7 +382,7 @@ SUBROUTINE DGEESX( JOBVS, SORT, SELECT, SENSE, N, A, LDA, SDIM, * CALL DHSEQR( 'S', JOBVS, N, 1, N, A, LDA, WR, WI, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = WORK( 1 ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, N + HSWORK ) diff --git a/lapack-netlib/SRC/dgeev.c b/lapack-netlib/SRC/dgeev.c index 33f472a493..5afca3c424 100644 --- a/lapack-netlib/SRC/dgeev.c +++ b/lapack-netlib/SRC/dgeev.c @@ -706,7 +706,7 @@ ices */ /* > \ingroup doubleGEeigen */ /* ===================================================================== */ -/* Subroutine */ int dgeev_(char *jobvl, char *jobvr, integer *n, doublereal * +/* Subroutine */ void dgeev_(char *jobvl, char *jobvr, integer *n, doublereal * a, integer *lda, doublereal *wr, doublereal *wi, 
doublereal *vl, integer *ldvl, doublereal *vr, integer *ldvr, doublereal *work, integer *lwork, integer *info) @@ -721,17 +721,17 @@ ices */ char side[1]; doublereal anrm; integer ierr, itau; - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer iwrk, nout; extern doublereal dnrm2_(integer *, doublereal *, integer *); integer i__, k; doublereal r__; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); extern doublereal dlapy2_(doublereal *, doublereal *); - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *), dgebak_( + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *), dgebak_( char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *), dgebal_(char *, integer *, doublereal *, integer *, integer *, @@ -742,23 +742,24 @@ ices */ doublereal cscale; extern doublereal dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dgehrd_(integer *, integer *, integer *, + extern /* Subroutine */ void dgehrd_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); doublereal sn; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *), dlartg_(doublereal *, doublereal *, doublereal *, 
doublereal *, - doublereal *), xerbla_(char *, integer *, ftnlen); + doublereal *); + extern int xerbla_(char *, integer *, ftnlen); logical select[1]; extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); doublereal bignum; - extern /* Subroutine */ int dorghr_(integer *, integer *, integer *, + extern /* Subroutine */ void dorghr_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dhseqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, @@ -768,7 +769,7 @@ ices */ doublereal smlnum; integer hswork; logical lquery, wantvr; - extern /* Subroutine */ int dtrevc3_(char *, char *, logical *, integer *, + extern /* Subroutine */ void dtrevc3_(char *, char *, logical *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, doublereal *, integer *, integer *); @@ -910,15 +911,15 @@ ices */ if (*info != 0) { i__1 = -(*info); xerbla_("DGEEV ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1155,7 +1156,7 @@ ices */ } work[1] = (doublereal) maxwrk; - return 0; + return; /* End of DGEEV */ diff --git a/lapack-netlib/SRC/dgeevx.c b/lapack-netlib/SRC/dgeevx.c index 38408cd68f..03010fbb6d 100644 --- a/lapack-netlib/SRC/dgeevx.c +++ b/lapack-netlib/SRC/dgeevx.c @@ -819,7 +819,7 @@ f"> */ /* > \ingroup doubleGEeigen */ /* ===================================================================== */ -/* Subroutine */ int dgeevx_(char *balanc, char *jobvl, char *jobvr, char * +/* Subroutine */ void dgeevx_(char *balanc, char *jobvl, char *jobvr, char * sense, integer *n, doublereal *a, integer *lda, doublereal *wr, doublereal *wi, doublereal *vl, integer *ldvl, doublereal *vr, integer *ldvr, integer *ilo, integer *ihi, doublereal 
*scale, @@ -835,18 +835,18 @@ f"> */ char side[1]; doublereal anrm; integer ierr, itau; - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer iwrk, nout; extern doublereal dnrm2_(integer *, doublereal *, integer *); integer i__, k; doublereal r__; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); integer icond; extern logical lsame_(char *, char *); extern doublereal dlapy2_(doublereal *, doublereal *); - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *), dgebak_( + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *), dgebak_( char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *), dgebal_(char *, integer *, doublereal *, integer *, integer *, @@ -857,23 +857,24 @@ f"> */ doublereal cscale; extern doublereal dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dgehrd_(integer *, integer *, integer *, + extern /* Subroutine */ void dgehrd_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); doublereal sn; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *), dlartg_(doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *), xerbla_(char *, integer *, ftnlen); + doublereal *); + extern int 
xerbla_(char *, integer *, ftnlen); logical select[1]; extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); doublereal bignum; - extern /* Subroutine */ int dorghr_(integer *, integer *, integer *, + extern /* Subroutine */ void dorghr_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dhseqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, @@ -887,7 +888,7 @@ f"> */ logical wntsne; doublereal smlnum; logical lquery, wantvr, wntsnn, wntsnv; - extern /* Subroutine */ int dtrevc3_(char *, char *, logical *, integer *, + extern /* Subroutine */ void dtrevc3_(char *, char *, logical *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, doublereal *, integer *, integer *); @@ -1051,15 +1052,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGEEVX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1322,7 +1323,7 @@ f"> */ } work[1] = (doublereal) maxwrk; - return 0; + return; /* End of DGEEVX */ diff --git a/lapack-netlib/SRC/dgehd2.c b/lapack-netlib/SRC/dgehd2.c index 7e1f0cd1b2..9f1f39f56f 100644 --- a/lapack-netlib/SRC/dgehd2.c +++ b/lapack-netlib/SRC/dgehd2.c @@ -663,7 +663,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgehd2_(integer *n, integer *ilo, integer *ihi, +/* Subroutine */ void dgehd2_(integer *n, integer *ilo, integer *ihi, doublereal *a, integer *lda, doublereal *tau, doublereal *work, integer *info) { @@ -672,10 +672,11 @@ f"> */ /* Local variables */ integer i__; - extern /* Subroutine */ int dlarf_(char *, integer *, integer *, + extern /* Subroutine */ void dlarf_(char *, integer *, integer 
*, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *), dlarfg_(integer *, doublereal *, - doublereal *, integer *, doublereal *), xerbla_(char *, integer *, ftnlen); + doublereal *, integer *, doublereal *); + extern int xerbla_(char *, integer *, ftnlen); doublereal aii; @@ -711,7 +712,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGEHD2", &i__1, (ftnlen)6); - return 0; + return; } i__1 = *ihi - 1; @@ -744,7 +745,7 @@ f"> */ /* L10: */ } - return 0; + return; /* End of DGEHD2 */ diff --git a/lapack-netlib/SRC/dgehrd.c b/lapack-netlib/SRC/dgehrd.c index 3408b20b44..9944187fe3 100644 --- a/lapack-netlib/SRC/dgehrd.c +++ b/lapack-netlib/SRC/dgehrd.c @@ -686,7 +686,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgehrd_(integer *n, integer *ilo, integer *ihi, +/* Subroutine */ void dgehrd_(integer *n, integer *ilo, integer *ihi, doublereal *a, integer *lda, doublereal *tau, doublereal *work, integer *lwork, integer *info) { @@ -695,11 +695,11 @@ f"> */ /* Local variables */ integer i__, j; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer nbmin, iinfo; - extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrmm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), daxpy_( integer *, doublereal *, doublereal *, integer *, doublereal *, @@ -710,7 +710,7 @@ f"> */ integer ib; doublereal ei; integer nb, nh; - extern /* Subroutine */ int dlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void dlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, 
integer *, doublereal *, integer *, doublereal *, integer *); @@ -771,9 +771,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGEHRD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Set elements 1:ILO-1 and IHI:N-1 of TAU to zero */ @@ -794,7 +794,7 @@ f"> */ nh = *ihi - *ilo + 1; if (nh <= 1) { work[1] = 1.; - return 0; + return; } /* Determine the block size */ @@ -905,7 +905,7 @@ f"> */ dgehd2_(n, &i__, ihi, &a[a_offset], lda, &tau[1], &work[1], &iinfo); work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DGEHRD */ diff --git a/lapack-netlib/SRC/dgejsv.c b/lapack-netlib/SRC/dgejsv.c index 4508d9003c..73b43e471d 100644 --- a/lapack-netlib/SRC/dgejsv.c +++ b/lapack-netlib/SRC/dgejsv.c @@ -991,7 +991,7 @@ f"> */ /* > drmac@math.hr. Thank you. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgejsv_(char *joba, char *jobu, char *jobv, char *jobr, +/* Subroutine */ void dgejsv_(char *joba, char *jobu, char *jobv, char *jobr, char *jobt, char *jobp, integer *m, integer *n, doublereal *a, integer *lda, doublereal *sva, doublereal *u, integer *ldu, doublereal *v, integer *ldv, doublereal *work, integer *lwork, @@ -1011,63 +1011,65 @@ f"> */ doublereal temp1; integer p, q; logical jracc; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); doublereal small, entra, sfmin; logical lsvec; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); doublereal epsln; logical rsvec; - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, char *, char *, integer *, 
integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer n1; logical l2aber; - extern /* Subroutine */ int dgeqp3_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeqp3_(integer *, integer *, doublereal *, integer *, integer *, doublereal *, doublereal *, integer *, integer *); doublereal condr1, condr2, uscal1, uscal2; logical l2kill, l2rank, l2tran, l2pert; extern doublereal dlamch_(char *); integer nr; - extern /* Subroutine */ int dgelqf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgelqf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); extern integer idamax_(integer *, doublereal *, integer *); doublereal scalem; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); doublereal sconda; logical goscal; doublereal aatmin; - extern /* Subroutine */ int dgeqrf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeqrf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); doublereal aatmax; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *), dlaset_(char *, integer *, integer *, doublereal *, doublereal *, - doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical noscal; - extern /* Subroutine */ int dpocon_(char *, integer *, doublereal *, + extern /* Subroutine */ void dpocon_(char *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *), dgesvj_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal 
*, integer *, doublereal *, integer *, integer *), dlassq_(integer *, doublereal *, integer - *, doublereal *, doublereal *), dlaswp_(integer *, doublereal *, + *, doublereal *, doublereal *); + extern int dlaswp_(integer *, doublereal *, integer *, integer *, integer *, integer *, integer *); doublereal entrat; logical almort; - extern /* Subroutine */ int dorgqr_(integer *, integer *, integer *, + extern /* Subroutine */ void dorgqr_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dormlq_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); doublereal maxprj; logical errest; - extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); logical transp, rowpiv; @@ -1173,7 +1175,7 @@ f"> */ /* #:( */ i__1 = -(*info); xerbla_("DGEJSV", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return for void matrix (Y3K safe) */ @@ -1189,7 +1191,7 @@ f"> */ work[5] = 0.; work[6] = 0.; work[7] = 0.; - return 0; + return; } /* Determine whether the matrix U should be M x N or M x M */ @@ -1229,7 +1231,7 @@ f"> */ *info = -9; i__2 = -(*info); xerbla_("DGEJSV", &i__2, (ftnlen)6); - return 0; + return; } aaqq = sqrt(aaqq); if (aapp < big / aaqq && noscal) { @@ -1291,7 +1293,7 @@ f"> */ iwork[1] = 0; iwork[2] = 0; iwork[3] = 0; - return 0; + return; } /* Issue warning if denormalized column norms detected. 
Override the */ @@ -1356,7 +1358,7 @@ f"> */ work[6] = 0.; work[7] = 0.; } - return 0; + return; } @@ -2801,6 +2803,6 @@ f"> */ iwork[2] = numrank; iwork[3] = warning; - return 0; + return; } /* dgejsv_ */ diff --git a/lapack-netlib/SRC/dgejsv.f b/lapack-netlib/SRC/dgejsv.f index fc13f4a5fb..83d16c30e1 100644 --- a/lapack-netlib/SRC/dgejsv.f +++ b/lapack-netlib/SRC/dgejsv.f @@ -224,7 +224,7 @@ *> *> \param[out] U *> \verbatim -*> U is DOUBLE PRECISION array, dimension ( LDU, N ) +*> U is DOUBLE PRECISION array, dimension ( LDU, N ) or ( LDU, M ) *> If JOBU = 'U', then U contains on exit the M-by-N matrix of *> the left singular vectors. *> If JOBU = 'F', then U contains on exit the M-by-M matrix of @@ -271,7 +271,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is DOUBLE PRECISION array, dimension (LWORK) +*> WORK is DOUBLE PRECISION array, dimension (MAX(7,LWORK)) *> On exit, if N > 0 .AND. M > 0 (else not referenced), *> WORK(1) = SCALE = WORK(2) / WORK(1) is the scaling factor such *> that SCALE*SVA(1:N) are the computed singular values diff --git a/lapack-netlib/SRC/dgelq.c b/lapack-netlib/SRC/dgelq.c index c7b981ada8..013cc57b2b 100644 --- a/lapack-netlib/SRC/dgelq.c +++ b/lapack-netlib/SRC/dgelq.c @@ -681,7 +681,7 @@ static integer c__2 = 2; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgelq_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgelq_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *t, integer *tsize, doublereal *work, integer *lwork, integer *info) { @@ -694,12 +694,12 @@ static integer c__2 = 2; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int dgelqt_(integer *, integer *, integer *, + extern /* Subroutine */ void dgelqt_(integer *, integer *, integer *, doublereal *, 
integer *, doublereal *, integer *, doublereal *, integer *); logical lminws, lquery; integer mintsz; - extern /* Subroutine */ int dlaswlq_(integer *, integer *, integer *, + extern /* Subroutine */ void dlaswlq_(integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); @@ -839,15 +839,15 @@ static integer c__2 = 2; if (*info != 0) { i__1 = -(*info); xerbla_("DGELQ", &i__1, (ftnlen)5); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* The LQ Decomposition */ @@ -861,7 +861,7 @@ static integer c__2 = 2; work[1] = (doublereal) lwreq; - return 0; + return; /* End of DGELQ */ diff --git a/lapack-netlib/SRC/dgelq2.c b/lapack-netlib/SRC/dgelq2.c index c42cef2fb8..9d76a14c0e 100644 --- a/lapack-netlib/SRC/dgelq2.c +++ b/lapack-netlib/SRC/dgelq2.c @@ -639,7 +639,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgelq2_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgelq2_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *tau, doublereal *work, integer *info) { /* System generated locals */ @@ -647,10 +647,11 @@ f"> */ /* Local variables */ integer i__, k; - extern /* Subroutine */ int dlarf_(char *, integer *, integer *, + extern /* Subroutine */ void dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *), dlarfg_(integer *, doublereal *, - doublereal *, integer *, doublereal *), xerbla_(char *, integer *, ftnlen); + doublereal *, integer *, doublereal *); + extern int xerbla_(char *, integer *, ftnlen); doublereal aii; @@ -684,7 +685,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGELQ2", &i__1, (ftnlen)6); - return 0; + return; } k = f2cmin(*m,*n); @@ -713,7 +714,7 @@ f"> */ } /* L10: */ } - return 0; + 
return; /* End of DGELQ2 */ diff --git a/lapack-netlib/SRC/dgelqf.c b/lapack-netlib/SRC/dgelqf.c index a10cfadb70..723008a899 100644 --- a/lapack-netlib/SRC/dgelqf.c +++ b/lapack-netlib/SRC/dgelqf.c @@ -659,7 +659,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgelqf_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgelqf_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *tau, doublereal *work, integer *lwork, integer *info) { /* System generated locals */ @@ -667,16 +667,17 @@ f"> */ /* Local variables */ integer i__, k, nbmin, iinfo; - extern /* Subroutine */ int dgelq2_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgelq2_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer ib, nb; - extern /* Subroutine */ int dlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void dlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer nx; - extern /* Subroutine */ int dlarft_(char *, char *, integer *, integer *, - doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlarft_(char *, char *, integer *, integer *, + doublereal *, integer *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; @@ -721,9 +722,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGELQF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -731,7 +732,7 @@ f"> */ k = f2cmin(*m,*n); if (k == 0) { work[1] = 1.; - return 0; + return; } nbmin = 2; @@ -816,7 
+817,7 @@ f"> */ } work[1] = (doublereal) iws; - return 0; + return; /* End of DGELQF */ diff --git a/lapack-netlib/SRC/dgelqt.c b/lapack-netlib/SRC/dgelqt.c index 7d387f6aaf..358b4dd311 100644 --- a/lapack-netlib/SRC/dgelqt.c +++ b/lapack-netlib/SRC/dgelqt.c @@ -648,7 +648,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgelqt_(integer *m, integer *n, integer *mb, doublereal * +/* Subroutine */ void dgelqt_(integer *m, integer *n, integer *mb, doublereal * a, integer *lda, doublereal *t, integer *ldt, doublereal *work, integer *info) { @@ -657,11 +657,12 @@ f"> */ /* Local variables */ integer i__, k, iinfo, ib; - extern /* Subroutine */ int dlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void dlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, - integer *), xerbla_(char *, - integer *, ftnlen), dgelqt3_(integer *, integer *, doublereal *, + integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dgelqt3_(integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); @@ -701,14 +702,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGELQT", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ k = f2cmin(*m,*n); if (k == 0) { - return 0; + return; } /* Blocked loop of length K */ @@ -737,7 +738,7 @@ f"> */ i__ * a_dim1], lda, &work[1], &i__5); } } - return 0; + return; /* End of DGELQT */ diff --git a/lapack-netlib/SRC/dgelqt3.c b/lapack-netlib/SRC/dgelqt3.c index ce8dec2c9f..a16f2568f1 100644 --- a/lapack-netlib/SRC/dgelqt3.c +++ b/lapack-netlib/SRC/dgelqt3.c @@ -646,7 +646,7 @@ ompact WY representation of Q. 
*/ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgelqt3_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgelqt3_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *t, integer *ldt, integer *info) { /* System generated locals */ @@ -654,16 +654,17 @@ ompact WY representation of Q. */ /* Local variables */ integer i__, j; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer iinfo; - extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrmm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer i1, j1, m1, m2; - extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *, - integer *, doublereal *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlarfg_(integer *, doublereal *, doublereal *, + integer *, doublereal *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.8.0) -- */ @@ -697,7 +698,7 @@ ompact WY representation of Q. */ if (*info != 0) { i__1 = -(*info); xerbla_("DGELQT3", &i__1, (ftnlen)7); - return 0; + return; } if (*m == 1) { @@ -795,7 +796,7 @@ ompact WY representation of Q. 
*/ } - return 0; + return; /* End of DGELQT3 */ diff --git a/lapack-netlib/SRC/dgels.c b/lapack-netlib/SRC/dgels.c index 09efbc4021..4ee0785f95 100644 --- a/lapack-netlib/SRC/dgels.c +++ b/lapack-netlib/SRC/dgels.c @@ -698,7 +698,7 @@ static integer c__0 = 0; /* > \ingroup doubleGEsolve */ /* ===================================================================== */ -/* Subroutine */ int dgels_(char *trans, integer *m, integer *n, integer * +/* Subroutine */ void dgels_(char *trans, integer *m, integer *n, integer * nrhs, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *work, integer *lwork, integer *info) { @@ -713,24 +713,25 @@ static integer c__0 = 0; extern logical lsame_(char *, char *); integer wsize; doublereal rwork[1]; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); integer nb; extern doublereal dlamch_(char *), dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); integer mn; - extern /* Subroutine */ int dgelqf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgelqf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dgeqrf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer scllen; doublereal bignum; - extern /* Subroutine */ int dormlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormlq_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, 
doublereal *, doublereal *, integer *, doublereal *, integer *, integer *), dormqr_(char *, char *, integer *, integer *, integer *, @@ -840,9 +841,9 @@ static integer c__0 = 0; if (*info != 0) { i__1 = -(*info); xerbla_("DGELS ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -852,7 +853,7 @@ static integer c__0 = 0; if (f2cmin(i__1,*nrhs) == 0) { i__1 = f2cmax(*m,*n); dlaset_("Full", &i__1, nrhs, &c_b33, &c_b33, &b[b_offset], ldb); - return 0; + return; } /* Get machine parameters */ @@ -938,7 +939,7 @@ static integer c__0 = 0; , lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } scllen = *n; @@ -953,7 +954,7 @@ static integer c__0 = 0; lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } /* B(N+1:M,1:NRHS) = ZERO */ @@ -1000,7 +1001,7 @@ static integer c__0 = 0; , lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } /* B(M+1:N,1:NRHS) = 0 */ @@ -1043,7 +1044,7 @@ static integer c__0 = 0; lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } scllen = *m; @@ -1072,7 +1073,7 @@ static integer c__0 = 0; L50: work[1] = (doublereal) wsize; - return 0; + return; /* End of DGELS */ diff --git a/lapack-netlib/SRC/dgelsd.c b/lapack-netlib/SRC/dgelsd.c index d9a932f246..5b1694bac5 100644 --- a/lapack-netlib/SRC/dgelsd.c +++ b/lapack-netlib/SRC/dgelsd.c @@ -727,7 +727,7 @@ f"> */ /* > Osni Marques, LBNL/NERSC, USA \n */ /* ===================================================================== */ -/* Subroutine */ int dgelsd_(integer *m, integer *n, integer *nrhs, +/* Subroutine */ void dgelsd_(integer *m, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal * s, doublereal *rcond, integer *rank, doublereal *work, integer *lwork, integer *iwork, integer *info) @@ -740,16 +740,16 @@ f"> */ integer itau, nlvl, iascl, ibscl; doublereal sfmin; integer minmn, maxmn, itaup, itauq, mnthr, 
nwork; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); integer ie, il; - extern /* Subroutine */ int dgebrd_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgebrd_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *); integer mm; extern doublereal dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dgelqf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgelqf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dlalsd_(char *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, @@ -759,20 +759,20 @@ f"> */ integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *), dlaset_(char *, integer *, integer *, doublereal *, - doublereal *, doublereal *, integer *), xerbla_(char *, - integer *, ftnlen); + doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); doublereal bignum; - extern /* Subroutine */ int dormbr_(char *, char *, char *, integer *, + extern /* Subroutine */ void dormbr_(char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer wlalsd; - extern /* Subroutine */ int dormlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormlq_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, 
integer *); integer ldwork; - extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer liwork, minwrk, maxwrk; @@ -972,7 +972,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGELSD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { goto L10; } @@ -981,7 +981,7 @@ f"> */ if (*m == 0 || *n == 0) { *rank = 0; - return 0; + return; } /* Get machine parameters. */ @@ -1269,7 +1269,7 @@ f"> */ L10: work[1] = (doublereal) maxwrk; iwork[1] = liwork; - return 0; + return; /* End of DGELSD */ diff --git a/lapack-netlib/SRC/dgelss.c b/lapack-netlib/SRC/dgelss.c index cfd7bf8abe..e2168b2a66 100644 --- a/lapack-netlib/SRC/dgelss.c +++ b/lapack-netlib/SRC/dgelss.c @@ -689,7 +689,7 @@ f"> */ /* > \ingroup doubleGEsolve */ /* ===================================================================== */ -/* Subroutine */ int dgelss_(integer *m, integer *n, integer *nrhs, +/* Subroutine */ void dgelss_(integer *m, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal * s, doublereal *rcond, integer *rank, doublereal *work, integer *lwork, integer *info) @@ -703,23 +703,23 @@ f"> */ integer itau, lwork_dgebrd__, lwork_dgelqf__, lwork_dgeqrf__, lwork_dorgbr__, lwork_dormbr__, i__, lwork_dormlq__, lwork_dormqr__; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer iascl, ibscl; - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, 
integer *), drscl_(integer *, doublereal *, doublereal *, integer *); integer chunk; doublereal sfmin; integer minmn; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer maxmn, itaup, itauq, mnthr, iwork; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); integer bl, ie, il; - extern /* Subroutine */ int dgebrd_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgebrd_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *); @@ -727,7 +727,7 @@ f"> */ extern doublereal dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); integer bdspac; - extern /* Subroutine */ int dgelqf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgelqf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), @@ -735,8 +735,9 @@ f"> */ doublereal *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *), dlaset_(char *, integer *, integer *, - doublereal *, doublereal *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen), dbdsqr_(char *, integer *, + doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dbdsqr_(char *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *), dorgbr_(char *, @@ -745,13 +746,13 @@ f"> */ doublereal bignum; extern integer ilaenv_(integer *, char *, char *, 
integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int dormbr_(char *, char *, char *, integer *, + extern /* Subroutine */ void dormbr_(char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *), dormlq_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer ldwork; - extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer minwrk, maxwrk; @@ -978,16 +979,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGELSS", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { *rank = 0; - return 0; + return; } /* Get machine parameters */ @@ -1434,7 +1435,7 @@ f"> */ L70: work[1] = (doublereal) maxwrk; - return 0; + return; /* End of DGELSS */ diff --git a/lapack-netlib/SRC/dgelss.f b/lapack-netlib/SRC/dgelss.f index 8ed703fcf2..c4190f2e09 100644 --- a/lapack-netlib/SRC/dgelss.f +++ b/lapack-netlib/SRC/dgelss.f @@ -254,11 +254,11 @@ SUBROUTINE DGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, * * Compute space needed for DGEQRF CALL DGEQRF( M, N, A, LDA, DUM(1), DUM(1), -1, INFO ) - LWORK_DGEQRF=DUM(1) + LWORK_DGEQRF = INT( DUM(1) ) * Compute space needed for DORMQR CALL DORMQR( 'L', 'T', M, NRHS, N, A, LDA, DUM(1), B, $ LDB, DUM(1), -1, INFO ) - LWORK_DORMQR=DUM(1) + LWORK_DORMQR = INT( DUM(1) ) MM = N MAXWRK = MAX( MAXWRK, N + LWORK_DGEQRF ) MAXWRK = MAX( MAXWRK, N + LWORK_DORMQR ) @@ -273,15 +273,15 @@ SUBROUTINE DGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, * Compute space needed for DGEBRD CALL DGEBRD( MM, N, A, LDA, S, 
DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_DGEBRD=DUM(1) + LWORK_DGEBRD = INT( DUM(1) ) * Compute space needed for DORMBR CALL DORMBR( 'Q', 'L', 'T', MM, NRHS, N, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_DORMBR=DUM(1) + LWORK_DORMBR = INT( DUM(1) ) * Compute space needed for DORGBR CALL DORGBR( 'P', N, N, N, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_DORGBR=DUM(1) + LWORK_DORGBR = INT( DUM(1) ) * Compute total workspace needed MAXWRK = MAX( MAXWRK, 3*N + LWORK_DGEBRD ) MAXWRK = MAX( MAXWRK, 3*N + LWORK_DORMBR ) @@ -305,23 +305,23 @@ SUBROUTINE DGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, * Compute space needed for DGELQF CALL DGELQF( M, N, A, LDA, DUM(1), DUM(1), $ -1, INFO ) - LWORK_DGELQF=DUM(1) + LWORK_DGELQF = INT( DUM(1) ) * Compute space needed for DGEBRD CALL DGEBRD( M, M, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_DGEBRD=DUM(1) + LWORK_DGEBRD = INT( DUM(1) ) * Compute space needed for DORMBR CALL DORMBR( 'Q', 'L', 'T', M, NRHS, N, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_DORMBR=DUM(1) + LWORK_DORMBR = INT( DUM(1) ) * Compute space needed for DORGBR CALL DORGBR( 'P', M, M, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_DORGBR=DUM(1) + LWORK_DORGBR = INT( DUM(1) ) * Compute space needed for DORMLQ CALL DORMLQ( 'L', 'T', N, NRHS, M, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_DORMLQ=DUM(1) + LWORK_DORMLQ = INT( DUM(1) ) * Compute total workspace needed MAXWRK = M + LWORK_DGELQF MAXWRK = MAX( MAXWRK, M*M + 4*M + LWORK_DGEBRD ) @@ -341,15 +341,15 @@ SUBROUTINE DGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, * Compute space needed for DGEBRD CALL DGEBRD( M, N, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_DGEBRD=DUM(1) + LWORK_DGEBRD = INT( DUM(1) ) * Compute space needed for DORMBR CALL DORMBR( 'Q', 'L', 'T', M, NRHS, M, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_DORMBR=DUM(1) + LWORK_DORMBR = INT( DUM(1) ) * Compute space needed for DORGBR CALL DORGBR( 'P', 
M, N, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_DORGBR=DUM(1) + LWORK_DORGBR = INT( DUM(1) ) MAXWRK = 3*M + LWORK_DGEBRD MAXWRK = MAX( MAXWRK, 3*M + LWORK_DORMBR ) MAXWRK = MAX( MAXWRK, 3*M + LWORK_DORGBR ) diff --git a/lapack-netlib/SRC/dgelst.c b/lapack-netlib/SRC/dgelst.c new file mode 100644 index 0000000000..9333bd5dd0 --- /dev/null +++ b/lapack-netlib/SRC/dgelst.c @@ -0,0 +1,1105 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define 
FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + 
+static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 
1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief DGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factori +zation with compact WY representation of Q. */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download DGELST + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE DGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* CHARACTER TRANS */ +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* DOUBLE PRECISION A( LDA, * ), B( LDB, * ), WORK( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > DGELST solves overdetermined or underdetermined real linear systems */ +/* > involving an M-by-N matrix A, or its transpose, using a QR or LQ */ +/* > factorization of A with compact WY representation of Q. */ +/* > It is assumed that A has full rank. */ +/* > */ +/* > The following options are provided: */ +/* > */ +/* > 1. 
If TRANS = 'N' and m >= n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A*X ||. */ +/* > */ +/* > 2. If TRANS = 'N' and m < n: find the minimum norm solution of */ +/* > an underdetermined system A * X = B. */ +/* > */ +/* > 3. If TRANS = 'T' and m >= n: find the minimum norm solution of */ +/* > an underdetermined system A**T * X = B. */ +/* > */ +/* > 4. If TRANS = 'T' and m < n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A**T * X ||. */ +/* > */ +/* > Several right hand side vectors b and solution vectors x can be */ +/* > handled in a single call; they are stored as the columns of the */ +/* > M-by-NRHS right hand side matrix B and the N-by-NRHS solution */ +/* > matrix X. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > = 'N': the linear system involves A; */ +/* > = 'T': the linear system involves A**T. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of right hand sides, i.e., the number of */ +/* > columns of the matrices B and X. NRHS >=0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] A */ +/* > \verbatim */ +/* > A is DOUBLE PRECISION array, dimension (LDA,N) */ +/* > On entry, the M-by-N matrix A. 
*/ +/* > On exit, */ +/* > if M >= N, A is overwritten by details of its QR */ +/* > factorization as returned by DGEQRT; */ +/* > if M < N, A is overwritten by details of its LQ */ +/* > factorization as returned by DGELQT. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is DOUBLE PRECISION array, dimension (LDB,NRHS) */ +/* > On entry, the matrix B of right hand side vectors, stored */ +/* > columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS */ +/* > if TRANS = 'T'. */ +/* > On exit, if INFO = 0, B is overwritten by the solution */ +/* > vectors, stored columnwise: */ +/* > if TRANS = 'N' and m >= n, rows 1 to n of B contain the least */ +/* > squares solution vectors; the residual sum of squares for the */ +/* > solution in each column is given by the sum of squares of */ +/* > elements N+1 to M in that column; */ +/* > if TRANS = 'N' and m < n, rows 1 to N of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'T' and m >= n, rows 1 to M of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'T' and m < n, rows 1 to M of B contain the */ +/* > least squares solution vectors; the residual sum of squares */ +/* > for the solution in each column is given by the sum of */ +/* > squares of elements M+1 to N in that column. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= MAX(1,M,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal LWORK. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The dimension of the array WORK. */ +/* > LWORK >= f2cmax( 1, MN + f2cmax( MN, NRHS ) ). */ +/* > For optimal performance, */ +/* > LWORK >= f2cmax( 1, (MN + f2cmax( MN, NRHS ))*NB ). */ +/* > where MN = f2cmin(M,N) and NB is the optimum block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal size of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > > 0: if INFO = i, the i-th diagonal element of the */ +/* > triangular factor of A is zero, so that A does not have */ +/* > full rank; the least squares solution could not be */ +/* > computed. */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. 
*/ + +/* > \ingroup doubleGEsolve */ + +/* > \par Contributors: */ +/* ================== */ +/* > */ +/* > \verbatim */ +/* > */ +/* > November 2022, Igor Kozachenko, */ +/* > Computer Science Division, */ +/* > University of California, Berkeley */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* Subroutine */ void dgelst_(char *trans, integer *m, integer *n, integer * + nrhs, doublereal *a, integer *lda, doublereal *b, integer *ldb, + doublereal *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; + + /* Local variables */ + doublereal anrm, bnrm; + integer brow; + logical tpsd; + integer i__, j, iascl, ibscl; + extern logical lsame_(char *, char *); + integer nbmin; + doublereal rwork[1]; + integer lwopt; + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); + integer nb; + extern doublereal dlamch_(char *), dlange_(char *, integer *, + integer *, doublereal *, integer *, doublereal *); + integer mn; + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, integer *, doublereal *, + integer *, integer *), dlaset_(char *, integer *, integer + *, doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + integer scllen; + doublereal bignum; + extern /* Subroutine */ void dgelqt_(integer *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *, doublereal *, + integer *), dgeqrt_(integer *, integer *, integer *, doublereal *, + integer *, doublereal *, integer *, doublereal *, integer *); + integer mnnrhs; + doublereal smlnum; + logical lquery; + extern /* Subroutine */ int dtrtrs_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, + integer 
*); + extern void dgemlqt_(char *, char *, + integer *, integer *, integer *, integer *, doublereal *, integer + *, doublereal *, integer *, doublereal *, integer *, doublereal *, + integer *), dgemqrt_(char *, char *, integer *, + integer *, integer *, integer *, doublereal *, integer *, + doublereal *, integer *, doublereal *, integer *, doublereal *, + integer *); + + +/* -- LAPACK driver routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + mn = f2cmin(*m,*n); + lquery = *lwork == -1; + if (! (lsame_(trans, "N") || lsame_(trans, "T"))) { + *info = -1; + } else if (*m < 0) { + *info = -2; + } else if (*n < 0) { + *info = -3; + } else if (*nrhs < 0) { + *info = -4; + } else if (*lda < f2cmax(1,*m)) { + *info = -6; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = f2cmax(1,*m); + if (*ldb < f2cmax(i__1,*n)) { + *info = -8; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = 1, i__2 = mn + f2cmax(mn,*nrhs); + if (*lwork < f2cmax(i__1,i__2) && ! 
lquery) { + *info = -10; + } + } + } + +/* Figure out optimal block size and optimal workspace size */ + + if (*info == 0 || *info == -10) { + + tpsd = TRUE_; + if (lsame_(trans, "N")) { + tpsd = FALSE_; + } + + nb = ilaenv_(&c__1, "DGELST", " ", m, n, &c_n1, &c_n1, (ftnlen)6, ( + ftnlen)1); + + mnnrhs = f2cmax(mn,*nrhs); +/* Computing MAX */ + i__1 = 1, i__2 = (mn + mnnrhs) * nb; + lwopt = f2cmax(i__1,i__2); + work[1] = (doublereal) lwopt; + + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("DGELST ", &i__1, 6); + return; + } else if (lquery) { + return; + } + +/* Quick return if possible */ + +/* Computing MIN */ + i__1 = f2cmin(*m,*n); + if (f2cmin(i__1,*nrhs) == 0) { + i__1 = f2cmax(*m,*n); + dlaset_("Full", &i__1, nrhs, &c_b12, &c_b12, &b[b_offset], ldb); + work[1] = (doublereal) lwopt; + return; + } + +/* *GEQRT and *GELQT routines cannot accept NB larger than f2cmin(M,N) */ + + if (nb > mn) { + nb = mn; + } + +/* Determine the block size from the supplied LWORK */ +/* ( at this stage we know that LWORK >= (minimum required workspace, */ +/* but it may be less than optimal) */ + +/* Computing MIN */ + i__1 = nb, i__2 = *lwork / (mn + mnnrhs); + nb = f2cmin(i__1,i__2); + +/* The minimum value of NB, when blocked code is used */ + +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "DGELST", " ", m, n, &c_n1, &c_n1, ( + ftnlen)6, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + if (nb < nbmin) { + nb = 1; + } + +/* Get machine parameters */ + + smlnum = dlamch_("S") / dlamch_("P"); + bignum = 1. / smlnum; + dlabad_(&smlnum, &bignum); + +/* Scale A, B if f2cmax element outside range [SMLNUM,BIGNUM] */ + + anrm = dlange_("M", m, n, &a[a_offset], lda, rwork); + iascl = 0; + if (anrm > 0. 
&& anrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + dlascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, + info); + iascl = 1; + } else if (anrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + dlascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, + info); + iascl = 2; + } else if (anrm == 0.) { + +/* Matrix all zero. Return zero solution. */ + + i__1 = f2cmax(*m,*n); + dlaset_("Full", &i__1, nrhs, &c_b12, &c_b12, &b[b_offset], ldb); + work[1] = (doublereal) lwopt; + return; + } + + brow = *m; + if (tpsd) { + brow = *n; + } + bnrm = dlange_("M", &brow, nrhs, &b[b_offset], ldb, rwork); + ibscl = 0; + if (bnrm > 0. && bnrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + dlascl_("G", &c__0, &c__0, &bnrm, &smlnum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 1; + } else if (bnrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + dlascl_("G", &c__0, &c__0, &bnrm, &bignum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 2; + } + + if (*m >= *n) { + +/* M > N: */ +/* Compute the blocked QR factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least N, optimally N*NB. */ + + dgeqrt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M > N, A is not transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A * X - B ||. */ + +/* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. 
*/ + + dgemqrt_("Left", "Transpose", m, nrhs, n, &nb, &a[a_offset], lda, + &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) */ + + dtrtrs_("Upper", "No transpose", "Non-unit", n, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return; + } + + scllen = *n; + + } else { + +/* M > N, A is transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A**T * X = B. */ + +/* Compute B := inv(R**T) * B in two row blocks of B. */ + +/* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) */ + + dtrtrs_("Upper", "Transpose", "Non-unit", n, nrhs, &a[a_offset], + lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return; + } + +/* Block 2: Zero out all rows below the N-th row in B: */ +/* B(N+1:M,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = *n + 1; i__ <= i__2; ++i__) { + b[i__ + j * b_dim1] = 0.; + } + } + +/* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + dgemqrt_("Left", "No transpose", m, nrhs, n, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *m; + + } + + } else { + +/* M < N: */ +/* Compute the blocked LQ factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least M, optimally M*NB. */ + + dgelqt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M < N, A is not transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A * X = B. */ + +/* Compute B := inv(L) * B in two row blocks of B. 
*/ + +/* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) */ + + dtrtrs_("Lower", "No transpose", "Non-unit", m, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return; + } + +/* Block 2: Zero out all rows below the M-th row in B: */ +/* B(M+1:N,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *n; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + b[i__ + j * b_dim1] = 0.; + } + } + +/* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + dgemlqt_("Left", "Transpose", n, nrhs, m, &nb, &a[a_offset], lda, + &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *n; + + } else { + +/* M < N, A is transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A**T * X - B ||. */ + +/* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. 
*/ + + dgemlqt_("Left", "No transpose", n, nrhs, m, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) */ + + dtrtrs_("Lower", "Transpose", "Non-unit", m, nrhs, &a[a_offset], + lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return; + } + + scllen = *m; + + } + + } + +/* Undo scaling */ + + if (iascl == 1) { + dlascl_("G", &c__0, &c__0, &anrm, &smlnum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (iascl == 2) { + dlascl_("G", &c__0, &c__0, &anrm, &bignum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + if (ibscl == 1) { + dlascl_("G", &c__0, &c__0, &smlnum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (ibscl == 2) { + dlascl_("G", &c__0, &c__0, &bignum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + + work[1] = (doublereal) lwopt; + + return; + +/* End of DGELST */ + +} /* dgelst_ */ + diff --git a/lapack-netlib/SRC/dgelst.f b/lapack-netlib/SRC/dgelst.f new file mode 100644 index 0000000000..ca0e04a9b8 --- /dev/null +++ b/lapack-netlib/SRC/dgelst.f @@ -0,0 +1,531 @@ +*> \brief DGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factorization with compact WY representation of Q. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DGELST + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, +* INFO ) +* +* .. Scalar Arguments .. +* CHARACTER TRANS +* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), B( LDB, * ), WORK( * ) +* .. 
+* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DGELST solves overdetermined or underdetermined real linear systems +*> involving an M-by-N matrix A, or its transpose, using a QR or LQ +*> factorization of A with compact WY representation of Q. +*> It is assumed that A has full rank. +*> +*> The following options are provided: +*> +*> 1. If TRANS = 'N' and m >= n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A*X ||. +*> +*> 2. If TRANS = 'N' and m < n: find the minimum norm solution of +*> an underdetermined system A * X = B. +*> +*> 3. If TRANS = 'T' and m >= n: find the minimum norm solution of +*> an underdetermined system A**T * X = B. +*> +*> 4. If TRANS = 'T' and m < n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A**T * X ||. +*> +*> Several right hand side vectors b and solution vectors x can be +*> handled in a single call; they are stored as the columns of the +*> M-by-NRHS right hand side matrix B and the N-by-NRHS solution +*> matrix X. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': the linear system involves A; +*> = 'T': the linear system involves A**T. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrices B and X. NRHS >=0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. 
+*> On exit, +*> if M >= N, A is overwritten by details of its QR +*> factorization as returned by DGEQRT; +*> if M < N, A is overwritten by details of its LQ +*> factorization as returned by DGELQT. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB,NRHS) +*> On entry, the matrix B of right hand side vectors, stored +*> columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS +*> if TRANS = 'T'. +*> On exit, if INFO = 0, B is overwritten by the solution +*> vectors, stored columnwise: +*> if TRANS = 'N' and m >= n, rows 1 to n of B contain the least +*> squares solution vectors; the residual sum of squares for the +*> solution in each column is given by the sum of squares of +*> elements N+1 to M in that column; +*> if TRANS = 'N' and m < n, rows 1 to N of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'T' and m >= n, rows 1 to M of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'T' and m < n, rows 1 to M of B contain the +*> least squares solution vectors; the residual sum of squares +*> for the solution in each column is given by the sum of +*> squares of elements M+1 to N in that column. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= MAX(1,M,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= max( 1, MN + max( MN, NRHS ) ). +*> For optimal performance, +*> LWORK >= max( 1, (MN + max( MN, NRHS ))*NB ). +*> where MN = min(M,N) and NB is the optimum block size. 
+*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the i-th diagonal element of the +*> triangular factor of A is zero, so that A does not have +*> full rank; the least squares solution could not be +*> computed. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleGEsolve +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2022, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, + $ INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, TPSD + INTEGER BROW, I, IASCL, IBSCL, J, LWOPT, MN, MNNRHS, + $ NB, NBMIN, SCLLEN + DOUBLE PRECISION ANRM, BIGNUM, BNRM, SMLNUM +* .. +* .. Local Arrays .. + DOUBLE PRECISION RWORK( 1 ) +* .. +* .. External Functions .. 
+ LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLANGE + EXTERNAL LSAME, ILAENV, DLAMCH, DLANGE +* .. +* .. External Subroutines .. + EXTERNAL DGELQT, DGEQRT, DGEMLQT, DGEMQRT, DLABAD, + $ DLASCL, DLASET, DTRTRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments. +* + INFO = 0 + MN = MIN( M, N ) + LQUERY = ( LWORK.EQ.-1 ) + IF( .NOT.( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) ) ) THEN + INFO = -1 + ELSE IF( M.LT.0 ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDB.LT.MAX( 1, M, N ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.MAX( 1, MN+MAX( MN, NRHS ) ) .AND. .NOT.LQUERY ) + $ THEN + INFO = -10 + END IF +* +* Figure out optimal block size and optimal workspace size +* + IF( INFO.EQ.0 .OR. INFO.EQ.-10 ) THEN +* + TPSD = .TRUE. + IF( LSAME( TRANS, 'N' ) ) + $ TPSD = .FALSE. +* + NB = ILAENV( 1, 'DGELST', ' ', M, N, -1, -1 ) +* + MNNRHS = MAX( MN, NRHS ) + LWOPT = MAX( 1, (MN+MNNRHS)*NB ) + WORK( 1 ) = DBLE( LWOPT ) +* + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGELST ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + CALL DLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) + WORK( 1 ) = DBLE( LWOPT ) + RETURN + END IF +* +* *GEQRT and *GELQT routines cannot accept NB larger than min(M,N) +* + IF( NB.GT.MN ) NB = MN +* +* Determine the block size from the supplied LWORK +* ( at this stage we know that LWORK >= (minimum required workspace, +* but it may be less than optimal) +* + NB = MIN( NB, LWORK/( MN + MNNRHS ) ) +* +* The minimum value of NB, when blocked code is used +* + NBMIN = MAX( 2, ILAENV( 2, 'DGELST', ' ', M, N, -1, -1 ) ) +* + IF( NB.LT.NBMIN ) THEN + NB = 1 + END IF +* +* Get machine parameters +* + SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) + BIGNUM = ONE / 
SMLNUM + CALL DLABAD( SMLNUM, BIGNUM ) +* +* Scale A, B if max element outside range [SMLNUM,BIGNUM] +* + ANRM = DLANGE( 'M', M, N, A, LDA, RWORK ) + IASCL = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL DLASCL( 'G', 0, 0, ANRM, SMLNUM, M, N, A, LDA, INFO ) + IASCL = 1 + ELSE IF( ANRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL DLASCL( 'G', 0, 0, ANRM, BIGNUM, M, N, A, LDA, INFO ) + IASCL = 2 + ELSE IF( ANRM.EQ.ZERO ) THEN +* +* Matrix all zero. Return zero solution. +* + CALL DLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) + WORK( 1 ) = DBLE( LWOPT ) + RETURN + END IF +* + BROW = M + IF( TPSD ) + $ BROW = N + BNRM = DLANGE( 'M', BROW, NRHS, B, LDB, RWORK ) + IBSCL = 0 + IF( BNRM.GT.ZERO .AND. BNRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL DLASCL( 'G', 0, 0, BNRM, SMLNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 1 + ELSE IF( BNRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL DLASCL( 'G', 0, 0, BNRM, BIGNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 2 + END IF +* + IF( M.GE.N ) THEN +* +* M > N: +* Compute the blocked QR factorization of A, +* using the compact WY representation of Q, +* workspace at least N, optimally N*NB. +* + CALL DGEQRT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M > N, A is not transposed: +* Overdetermined system of equations, +* least-squares problem, min || A * X - B ||. +* +* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. 
+* + CALL DGEMQRT( 'Left', 'Transpose', M, NRHS, N, NB, A, LDA, + $ WORK( 1 ), NB, B, LDB, WORK( MN*NB+1 ), + $ INFO ) +* +* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) +* + CALL DTRTRS( 'Upper', 'No transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = N +* + ELSE +* +* M > N, A is transposed: +* Underdetermined system of equations, +* minimum norm solution of A**T * X = B. +* +* Compute B := inv(R**T) * B in two row blocks of B. +* +* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) +* + CALL DTRTRS( 'Upper', 'Transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the N-th row in B: +* B(N+1:M,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = N + 1, M + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL DGEMQRT( 'Left', 'No transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = M +* + END IF +* + ELSE +* +* M < N: +* Compute the blocked LQ factorization of A, +* using the compact WY representation of Q, +* workspace at least M, optimally M*NB. +* + CALL DGELQT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M < N, A is not transposed: +* Underdetermined system of equations, +* minimum norm solution of A * X = B. +* +* Compute B := inv(L) * B in two row blocks of B. 
+* +* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) +* + CALL DTRTRS( 'Lower', 'No transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the M-th row in B: +* B(M+1:N,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = M + 1, N + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL DGEMLQT( 'Left', 'Transpose', N, NRHS, M, NB, A, LDA, + $ WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = N +* + ELSE +* +* M < N, A is transposed: +* Overdetermined system of equations, +* least-squares problem, min || A**T * X - B ||. +* +* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL DGEMLQT( 'Left', 'No transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1), INFO ) +* +* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) +* + CALL DTRTRS( 'Lower', 'Transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = M +* + END IF +* + END IF +* +* Undo scaling +* + IF( IASCL.EQ.1 ) THEN + CALL DLASCL( 'G', 0, 0, ANRM, SMLNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IASCL.EQ.2 ) THEN + CALL DLASCL( 'G', 0, 0, ANRM, BIGNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF + IF( IBSCL.EQ.1 ) THEN + CALL DLASCL( 'G', 0, 0, SMLNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IBSCL.EQ.2 ) THEN + CALL DLASCL( 'G', 0, 0, BIGNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF +* + WORK( 1 ) = DBLE( LWOPT ) +* + RETURN +* +* End of DGELST +* + END diff --git a/lapack-netlib/SRC/dgelsy.c b/lapack-netlib/SRC/dgelsy.c index 2bfc6d098b..5c625f939c 100644 --- a/lapack-netlib/SRC/dgelsy.c +++ b/lapack-netlib/SRC/dgelsy.c @@ -721,7 +721,7 @@ f"> */ /* > G. Quintana-Orti, Depto. 
de Informatica, Universidad Jaime I, Spain \n */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgelsy_(integer *m, integer *n, integer *nrhs, +/* Subroutine */ void dgelsy_(integer *m, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal *b, integer *ldb, integer * jpvt, doublereal *rcond, integer *rank, doublereal *work, integer * lwork, integer *info) @@ -733,42 +733,42 @@ f"> */ /* Local variables */ doublereal anrm, bnrm, smin, smax; integer i__, j, iascl, ibscl; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer ismin, ismax; doublereal c1, c2; - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), dlaic1_( integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal wsize, s1, s2; - extern /* Subroutine */ int dgeqp3_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeqp3_(integer *, integer *, doublereal *, integer *, integer *, doublereal *, doublereal *, integer *, integer *), dlabad_(doublereal *, doublereal *); integer nb; extern doublereal dlamch_(char *), dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); integer mn; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dlaset_(char *, integer *, integer - *, doublereal *, doublereal *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + *, doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, 
ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); doublereal bignum; integer lwkmin, nb1, nb2, nb3, nb4; - extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); doublereal sminpr, smaxpr, smlnum; - extern /* Subroutine */ int dormrz_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormrz_(char *, char *, integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int dtzrzf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dtzrzf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); @@ -853,16 +853,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGELSY", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (mn == 0 || *nrhs == 0) { *rank = 0; - return 0; + return; } /* Get machine parameters */ @@ -1061,7 +1061,7 @@ f"> */ L70: work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DGELSY */ diff --git a/lapack-netlib/SRC/dgemlq.c b/lapack-netlib/SRC/dgemlq.c index bf62e3117a..acc56173ac 100644 --- a/lapack-netlib/SRC/dgemlq.c +++ b/lapack-netlib/SRC/dgemlq.c @@ -674,7 +674,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgemlq_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void dgemlq_(char *side, char *trans, integer *m, integer *n, integer *k, doublereal *a, integer *lda, doublereal *t, integer * 
tsize, doublereal *c__, integer *ldc, doublereal *work, integer * lwork, integer *info) @@ -684,7 +684,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ logical left, tran; - extern /* Subroutine */ int dlamswlq_(char *, char *, integer *, integer * + extern /* Subroutine */ void dlamswlq_(char *, char *, integer *, integer * , integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); @@ -693,7 +693,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ integer mb, nb, mn, lw, nblcks; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical notran, lquery; - extern /* Subroutine */ int dgemlqt_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemlqt_(char *, char *, integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); @@ -774,9 +774,9 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("DGEMLQ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -784,7 +784,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Computing MIN */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*k) == 0) { - return 0; + return; } /* Computing MAX */ @@ -800,7 +800,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ work[1] = (doublereal) lw; - return 0; + return; /* End of DGEMLQ */ diff --git a/lapack-netlib/SRC/dgemlqt.c b/lapack-netlib/SRC/dgemlqt.c index 6f74701496..e188d3532e 100644 --- a/lapack-netlib/SRC/dgemlqt.c +++ b/lapack-netlib/SRC/dgemlqt.c @@ -676,7 +676,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doubleGEcomputational 
*/ /* ===================================================================== */ -/* Subroutine */ int dgemlqt_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void dgemlqt_(char *side, char *trans, integer *m, integer *n, integer *k, integer *mb, doublereal *v, integer *ldv, doublereal *t, integer *ldt, doublereal *c__, integer *ldc, doublereal *work, integer *info) @@ -691,11 +691,11 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern logical lsame_(char *, char *); logical right; integer ib, kf; - extern /* Subroutine */ int dlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void dlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, - integer *), xerbla_(char *, - integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); logical notran; integer ldwork; @@ -757,12 +757,12 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("DGEMLQT", &i__1, (ftnlen)7); - return 0; + return; } if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && notran) { @@ -823,7 +823,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of DGEMLQT */ diff --git a/lapack-netlib/SRC/dgemqr.c b/lapack-netlib/SRC/dgemqr.c index 84c478c033..a309b328b2 100644 --- a/lapack-netlib/SRC/dgemqr.c +++ b/lapack-netlib/SRC/dgemqr.c @@ -675,7 +675,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgemqr_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void dgemqr_(char *side, char *trans, integer *m, integer *n, integer *k, doublereal *a, integer *lda, doublereal *t, integer 
* tsize, doublereal *c__, integer *ldc, doublereal *work, integer * lwork, integer *info) @@ -685,7 +685,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ logical left, tran; - extern /* Subroutine */ int dlamtsqr_(char *, char *, integer *, integer * + extern /* Subroutine */ void dlamtsqr_(char *, char *, integer *, integer * , integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); @@ -694,7 +694,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ integer mb, nb, mn, lw, nblcks; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical notran, lquery; - extern /* Subroutine */ int dgemqrt_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemqrt_(char *, char *, integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); @@ -775,9 +775,9 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("DGEMQR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -785,7 +785,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Computing MIN */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*k) == 0) { - return 0; + return; } /* Computing MAX */ @@ -801,7 +801,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ work[1] = (doublereal) lw; - return 0; + return; /* End of DGEMQR */ diff --git a/lapack-netlib/SRC/dgemqrt.c b/lapack-netlib/SRC/dgemqrt.c index 47af19b672..f8f9393852 100644 --- a/lapack-netlib/SRC/dgemqrt.c +++ b/lapack-netlib/SRC/dgemqrt.c @@ -676,7 +676,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup 
doubleGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgemqrt_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void dgemqrt_(char *side, char *trans, integer *m, integer *n, integer *k, integer *nb, doublereal *v, integer *ldv, doublereal *t, integer *ldt, doublereal *c__, integer *ldc, doublereal *work, integer *info) @@ -691,10 +691,11 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern logical lsame_(char *, char *); logical right; integer ib, kf; - extern /* Subroutine */ int dlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void dlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); logical notran; integer ldwork; @@ -758,12 +759,12 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("DGEMQRT", &i__1, (ftnlen)7); - return 0; + return; } if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && tran) { @@ -824,7 +825,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of DGEMQRT */ diff --git a/lapack-netlib/SRC/dgeql2.c b/lapack-netlib/SRC/dgeql2.c index aa1747386c..125ede3602 100644 --- a/lapack-netlib/SRC/dgeql2.c +++ b/lapack-netlib/SRC/dgeql2.c @@ -637,7 +637,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgeql2_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgeql2_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *tau, doublereal *work, integer *info) { /* System generated locals */ @@ -645,10 +645,11 @@ f"> */ /* 
Local variables */ integer i__, k; - extern /* Subroutine */ int dlarf_(char *, integer *, integer *, + extern /* Subroutine */ void dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *), dlarfg_(integer *, doublereal *, - doublereal *, integer *, doublereal *), xerbla_(char *, integer *, ftnlen); + doublereal *, integer *, doublereal *); + extern int xerbla_(char *, integer *, ftnlen); doublereal aii; @@ -682,7 +683,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGEQL2", &i__1, (ftnlen)6); - return 0; + return; } k = f2cmin(*m,*n); @@ -707,7 +708,7 @@ f"> */ a[*m - k + i__ + (*n - k + i__) * a_dim1] = aii; /* L10: */ } - return 0; + return; /* End of DGEQL2 */ diff --git a/lapack-netlib/SRC/dgeqlf.c b/lapack-netlib/SRC/dgeqlf.c index b2a00a7521..94853a678c 100644 --- a/lapack-netlib/SRC/dgeqlf.c +++ b/lapack-netlib/SRC/dgeqlf.c @@ -654,7 +654,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgeqlf_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgeqlf_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *tau, doublereal *work, integer *lwork, integer *info) { /* System generated locals */ @@ -662,16 +662,17 @@ f"> */ /* Local variables */ integer i__, k, nbmin, iinfo; - extern /* Subroutine */ int dgeql2_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeql2_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer ib, nb, ki, kk; - extern /* Subroutine */ int dlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void dlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer mu, nu, nx; - extern /* Subroutine */ int dlarft_(char *, char *, integer *, integer *, - doublereal *, 
integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlarft_(char *, char *, integer *, integer *, + doublereal *, integer *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; @@ -727,15 +728,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGEQLF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (k == 0) { - return 0; + return; } nbmin = 2; @@ -827,7 +828,7 @@ f"> */ } work[1] = (doublereal) iws; - return 0; + return; /* End of DGEQLF */ diff --git a/lapack-netlib/SRC/dgeqp3.c b/lapack-netlib/SRC/dgeqp3.c index 37a6c00883..dd966a50e8 100644 --- a/lapack-netlib/SRC/dgeqp3.c +++ b/lapack-netlib/SRC/dgeqp3.c @@ -667,7 +667,7 @@ f"> */ /* > X. Sun, Computer Science Dept., Duke University, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgeqp3_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgeqp3_(integer *m, integer *n, doublereal *a, integer * lda, integer *jpvt, doublereal *tau, doublereal *work, integer *lwork, integer *info) { @@ -678,24 +678,24 @@ f"> */ integer nfxd; extern doublereal dnrm2_(integer *, doublereal *, integer *); integer j, nbmin, minmn; - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); integer minws; - extern /* Subroutine */ int dlaqp2_(integer *, integer *, integer *, + extern /* Subroutine */ void dlaqp2_(integer *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *); integer jb, na, nb, sm, sn, nx; - extern /* Subroutine */ int dgeqrf_(integer *, integer *, 
doublereal *, - integer *, doublereal *, doublereal *, integer *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dgeqrf_(integer *, integer *, doublereal *, + integer *, doublereal *, doublereal *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int dlaqps_(integer *, integer *, integer *, + extern /* Subroutine */ void dlaqps_(integer *, integer *, integer *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *); integer topbmn, sminmn; - extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer lwkopt; @@ -755,9 +755,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGEQP3", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Move initial columns up front. 
*/ @@ -911,7 +911,7 @@ f"> */ } work[1] = (doublereal) iws; - return 0; + return; /* End of DGEQP3 */ diff --git a/lapack-netlib/SRC/dgeqr.c b/lapack-netlib/SRC/dgeqr.c index fe3e1f82f1..dbd0c7e4f8 100644 --- a/lapack-netlib/SRC/dgeqr.c +++ b/lapack-netlib/SRC/dgeqr.c @@ -683,7 +683,7 @@ static integer c__2 = 2; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgeqr_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgeqr_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *t, integer *tsize, doublereal *work, integer *lwork, integer *info) { @@ -696,12 +696,12 @@ static integer c__2 = 2; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int dgeqrt_(integer *, integer *, integer *, + extern /* Subroutine */ void dgeqrt_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); logical lminws, lquery; integer mintsz; - extern /* Subroutine */ int dlatsqr_(integer *, integer *, integer *, + extern /* Subroutine */ void dlatsqr_(integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); @@ -827,15 +827,15 @@ static integer c__2 = 2; if (*info != 0) { i__1 = -(*info); xerbla_("DGEQR", &i__1, (ftnlen)5); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* The QR Decomposition */ @@ -851,7 +851,7 @@ static integer c__2 = 2; i__1 = 1, i__2 = nb * *n; work[1] = (doublereal) f2cmax(i__1,i__2); - return 0; + return; /* End of DGEQR */ diff --git a/lapack-netlib/SRC/dgeqr2.c b/lapack-netlib/SRC/dgeqr2.c index b19eb347ae..f80cd2da83 100644 --- a/lapack-netlib/SRC/dgeqr2.c +++ 
b/lapack-netlib/SRC/dgeqr2.c @@ -644,7 +644,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgeqr2_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgeqr2_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *tau, doublereal *work, integer *info) { /* System generated locals */ @@ -652,10 +652,11 @@ f"> */ /* Local variables */ integer i__, k; - extern /* Subroutine */ int dlarf_(char *, integer *, integer *, + extern /* Subroutine */ void dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *), dlarfg_(integer *, doublereal *, - doublereal *, integer *, doublereal *), xerbla_(char *, integer *, ftnlen); + doublereal *, integer *, doublereal *); + extern int xerbla_(char *, integer *, ftnlen); doublereal aii; @@ -689,7 +690,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGEQR2", &i__1, (ftnlen)6); - return 0; + return; } k = f2cmin(*m,*n); @@ -718,7 +719,7 @@ f"> */ } /* L10: */ } - return 0; + return; /* End of DGEQR2 */ diff --git a/lapack-netlib/SRC/dgeqr2p.c b/lapack-netlib/SRC/dgeqr2p.c index d97cc42dfc..83f21e8789 100644 --- a/lapack-netlib/SRC/dgeqr2p.c +++ b/lapack-netlib/SRC/dgeqr2p.c @@ -648,7 +648,7 @@ l elements using an unblocked algorithm. */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgeqr2p_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgeqr2p_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *tau, doublereal *work, integer *info) { /* System generated locals */ @@ -656,11 +656,12 @@ l elements using an unblocked algorithm. 
*/ /* Local variables */ integer i__, k; - extern /* Subroutine */ int dlarf_(char *, integer *, integer *, + extern /* Subroutine */ void dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, - doublereal *), xerbla_(char *, integer *, ftnlen); + doublereal *); + extern int xerbla_(char *, integer *, ftnlen); doublereal aii; - extern /* Subroutine */ int dlarfgp_(integer *, doublereal *, doublereal * + extern /* Subroutine */ void dlarfgp_(integer *, doublereal *, doublereal * , integer *, doublereal *); @@ -694,7 +695,7 @@ l elements using an unblocked algorithm. */ if (*info != 0) { i__1 = -(*info); xerbla_("DGEQR2P", &i__1, (ftnlen)7); - return 0; + return; } k = f2cmin(*m,*n); @@ -723,7 +724,7 @@ l elements using an unblocked algorithm. */ } /* L10: */ } - return 0; + return; /* End of DGEQR2P */ diff --git a/lapack-netlib/SRC/dgeqrf.c b/lapack-netlib/SRC/dgeqrf.c index ab922cf7fd..fc0bb6989f 100644 --- a/lapack-netlib/SRC/dgeqrf.c +++ b/lapack-netlib/SRC/dgeqrf.c @@ -661,7 +661,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgeqrf_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgeqrf_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *tau, doublereal *work, integer *lwork, integer *info) { /* System generated locals */ @@ -669,16 +669,17 @@ f"> */ /* Local variables */ integer i__, k, nbmin, iinfo; - extern /* Subroutine */ int dgeqr2_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeqr2_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer ib, nb; - extern /* Subroutine */ int dlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void dlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); 
integer nx; - extern /* Subroutine */ int dlarft_(char *, char *, integer *, integer *, - doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlarft_(char *, char *, integer *, integer *, + doublereal *, integer *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; @@ -723,9 +724,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGEQRF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -733,7 +734,7 @@ f"> */ k = f2cmin(*m,*n); if (k == 0) { work[1] = 1.; - return 0; + return; } nbmin = 2; @@ -818,7 +819,7 @@ f"> */ } work[1] = (doublereal) iws; - return 0; + return; /* End of DGEQRF */ diff --git a/lapack-netlib/SRC/dgeqrfp.c b/lapack-netlib/SRC/dgeqrfp.c index 6abbdeb173..5de166b5c9 100644 --- a/lapack-netlib/SRC/dgeqrfp.c +++ b/lapack-netlib/SRC/dgeqrfp.c @@ -665,7 +665,7 @@ static integer c__2 = 2; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgeqrfp_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgeqrfp_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *tau, doublereal *work, integer *lwork, integer *info) { /* System generated locals */ @@ -673,18 +673,19 @@ static integer c__2 = 2; /* Local variables */ integer i__, k, nbmin, iinfo, ib, nb; - extern /* Subroutine */ int dlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void dlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer nx; - extern /* Subroutine */ int dlarft_(char *, char *, integer *, integer *, - doublereal *, 
integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlarft_(char *, char *, integer *, integer *, + doublereal *, integer *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; logical lquery; - extern /* Subroutine */ int dgeqr2p_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeqr2p_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer iws; @@ -726,9 +727,9 @@ static integer c__2 = 2; if (*info != 0) { i__1 = -(*info); xerbla_("DGEQRFP", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -736,7 +737,7 @@ static integer c__2 = 2; k = f2cmin(*m,*n); if (k == 0) { work[1] = 1.; - return 0; + return; } nbmin = 2; @@ -821,7 +822,7 @@ static integer c__2 = 2; } work[1] = (doublereal) iws; - return 0; + return; /* End of DGEQRFP */ diff --git a/lapack-netlib/SRC/dgeqrt.c b/lapack-netlib/SRC/dgeqrt.c index 49b5b4e0d2..2c0d8c12eb 100644 --- a/lapack-netlib/SRC/dgeqrt.c +++ b/lapack-netlib/SRC/dgeqrt.c @@ -650,7 +650,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgeqrt_(integer *m, integer *n, integer *nb, doublereal * +/* Subroutine */ void dgeqrt_(integer *m, integer *n, integer *nb, doublereal * a, integer *lda, doublereal *t, integer *ldt, doublereal *work, integer *info) { @@ -659,11 +659,12 @@ f"> */ /* Local variables */ integer i__, k, iinfo, ib; - extern /* Subroutine */ int dlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void dlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, - 
integer *), xerbla_(char *, - integer *, ftnlen), dgeqrt2_(integer *, integer *, doublereal *, + integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dgeqrt2_(integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *), dgeqrt3_(integer * , integer *, doublereal *, integer *, doublereal *, integer *, integer *); @@ -705,14 +706,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGEQRT", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ k = f2cmin(*m,*n); if (k == 0) { - return 0; + return; } /* Blocked loop of length K */ @@ -747,7 +748,7 @@ f"> */ ib) * a_dim1], lda, &work[1], &i__5); } } - return 0; + return; /* End of DGEQRT */ diff --git a/lapack-netlib/SRC/dgeqrt2.c b/lapack-netlib/SRC/dgeqrt2.c index 1233bc9cb2..84a9873879 100644 --- a/lapack-netlib/SRC/dgeqrt2.c +++ b/lapack-netlib/SRC/dgeqrt2.c @@ -643,25 +643,25 @@ presentation of Q. */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgeqrt2_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgeqrt2_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *t, integer *ldt, integer *info) { /* System generated locals */ integer a_dim1, a_offset, t_dim1, t_offset, i__1, i__2, i__3; /* Local variables */ - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer i__, k; doublereal alpha; - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dtrmv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *), dlarfg_(integer *, doublereal - *, doublereal *, 
integer *, doublereal *), xerbla_(char *, - integer *, ftnlen); + *, doublereal *, integer *, doublereal *); + extern int xerbla_(char *, integer *, ftnlen); doublereal aii; @@ -698,7 +698,7 @@ presentation of Q. */ if (*info != 0) { i__1 = -(*info); xerbla_("DGEQRT2", &i__1, (ftnlen)7); - return 0; + return; } k = f2cmin(*m,*n); @@ -767,6 +767,6 @@ presentation of Q. */ /* End of DGEQRT2 */ - return 0; + return; } /* dgeqrt2_ */ diff --git a/lapack-netlib/SRC/dgeqrt3.c b/lapack-netlib/SRC/dgeqrt3.c index 777fd08555..997a8e1d1c 100644 --- a/lapack-netlib/SRC/dgeqrt3.c +++ b/lapack-netlib/SRC/dgeqrt3.c @@ -648,7 +648,7 @@ ompact WY representation of Q. */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgeqrt3_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgeqrt3_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *t, integer *ldt, integer *info) { /* System generated locals */ @@ -656,16 +656,17 @@ ompact WY representation of Q. 
*/ /* Local variables */ integer i__, j; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer iinfo; - extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrmm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer i1, j1, n1, n2; - extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *, - integer *, doublereal *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlarfg_(integer *, doublereal *, doublereal *, + integer *, doublereal *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -699,7 +700,7 @@ ompact WY representation of Q. */ if (*info != 0) { i__1 = -(*info); xerbla_("DGEQRT3", &i__1, (ftnlen)7); - return 0; + return; } if (*n == 1) { @@ -797,7 +798,7 @@ ompact WY representation of Q. 
*/ } - return 0; + return; /* End of DGEQRT3 */ diff --git a/lapack-netlib/SRC/dgerfs.c b/lapack-netlib/SRC/dgerfs.c index 159da155c6..a332ec33d2 100644 --- a/lapack-netlib/SRC/dgerfs.c +++ b/lapack-netlib/SRC/dgerfs.c @@ -699,7 +699,7 @@ f"> */ /* > \ingroup doubleGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgerfs_(char *trans, integer *n, integer *nrhs, +/* Subroutine */ void dgerfs_(char *trans, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal *af, integer *ldaf, integer * ipiv, doublereal *b, integer *ldb, doublereal *x, integer *ldx, doublereal *ferr, doublereal *berr, doublereal *work, integer *iwork, @@ -716,21 +716,22 @@ f"> */ integer i__, j, k; doublereal s; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer isave[3]; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer count; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *); doublereal xk; integer nz; doublereal safmin; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dgetrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int dgetrs_( char *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); logical notran; @@ -790,7 +791,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGERFS", &i__1, (ftnlen)6); 
- return 0; + return; } /* Quick return if possible */ @@ -802,7 +803,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } if (notran) { @@ -998,7 +999,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of DGERFS */ diff --git a/lapack-netlib/SRC/dgerfsx.c b/lapack-netlib/SRC/dgerfsx.c index 5e7f697a46..e003bfaa88 100644 --- a/lapack-netlib/SRC/dgerfsx.c +++ b/lapack-netlib/SRC/dgerfsx.c @@ -926,7 +926,7 @@ static integer c__1 = 1; /* > \ingroup doubleGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgerfsx_(char *trans, char *equed, integer *n, integer * +/* Subroutine */ void dgerfsx_(char *trans, char *equed, integer *n, integer * nrhs, doublereal *a, integer *lda, doublereal *af, integer *ldaf, integer *ipiv, doublereal *r__, doublereal *c__, doublereal *b, integer *ldb, doublereal *x, integer *ldx, doublereal *rcond, @@ -952,9 +952,10 @@ static integer c__1 = 1; integer prec_type__; extern doublereal dlamch_(char *), dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dgecon_(char *, integer *, doublereal *, + extern /* Subroutine */ void dgecon_(char *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); logical colequ, notran, rowequ; integer trans_type__; extern integer ilaprec_(char *); @@ -963,7 +964,7 @@ static integer c__1 = 1; integer *, doublereal *, integer *); integer ithresh, n_norms__; doublereal rthresh, cwise_wrong__; - extern /* Subroutine */ int dla_gerfsx_extended_(integer *, integer *, + extern /* Subroutine */ void dla_gerfsx_extended_(integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, logical *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, @@ -1082,7 +1083,7 @@ 
static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("DGERFSX", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible. */ @@ -1105,7 +1106,7 @@ static integer c__1 = 1; err_bnds_comp__[j + err_bnds_comp_dim1 * 3] = 1.; } } - return 0; + return; } /* Default to failure. */ @@ -1264,7 +1265,7 @@ static integer c__1 = 1; } } - return 0; + return; /* End of DGERFSX */ diff --git a/lapack-netlib/SRC/dgerq2.c b/lapack-netlib/SRC/dgerq2.c index 2a1a70cefd..7778473846 100644 --- a/lapack-netlib/SRC/dgerq2.c +++ b/lapack-netlib/SRC/dgerq2.c @@ -633,7 +633,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgerq2_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgerq2_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *tau, doublereal *work, integer *info) { /* System generated locals */ @@ -641,10 +641,11 @@ f"> */ /* Local variables */ integer i__, k; - extern /* Subroutine */ int dlarf_(char *, integer *, integer *, + extern /* Subroutine */ void dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *), dlarfg_(integer *, doublereal *, - doublereal *, integer *, doublereal *), xerbla_(char *, integer *, ftnlen); + doublereal *, integer *, doublereal *); + extern int xerbla_(char *, integer *, ftnlen); doublereal aii; @@ -678,7 +679,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGERQ2", &i__1, (ftnlen)6); - return 0; + return; } k = f2cmin(*m,*n); @@ -703,7 +704,7 @@ f"> */ a[*m - k + i__ + (*n - k + i__) * a_dim1] = aii; /* L10: */ } - return 0; + return; /* End of DGERQ2 */ diff --git a/lapack-netlib/SRC/dgerqf.c b/lapack-netlib/SRC/dgerqf.c index 8882ba184b..1eb54dc840 100644 --- a/lapack-netlib/SRC/dgerqf.c +++ b/lapack-netlib/SRC/dgerqf.c @@ -654,7 +654,7 @@ f"> */ /* > \endverbatim */ /* > */ /* 
===================================================================== */ -/* Subroutine */ int dgerqf_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgerqf_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *tau, doublereal *work, integer *lwork, integer *info) { /* System generated locals */ @@ -662,16 +662,17 @@ f"> */ /* Local variables */ integer i__, k, nbmin, iinfo; - extern /* Subroutine */ int dgerq2_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgerq2_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer ib, nb, ki, kk; - extern /* Subroutine */ int dlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void dlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer mu, nu, nx; - extern /* Subroutine */ int dlarft_(char *, char *, integer *, integer *, - doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlarft_(char *, char *, integer *, integer *, + doublereal *, integer *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; @@ -727,15 +728,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGERQF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (k == 0) { - return 0; + return; } nbmin = 2; @@ -826,7 +827,7 @@ f"> */ } work[1] = (doublereal) iws; - return 0; + return; /* End of DGERQF */ diff --git a/lapack-netlib/SRC/dgesc2.c b/lapack-netlib/SRC/dgesc2.c index e53e4f3773..3e7288bc6a 100644 --- a/lapack-netlib/SRC/dgesc2.c +++ b/lapack-netlib/SRC/dgesc2.c @@ -629,7 +629,7 @@ f"> */ /* > Umea 
University, S-901 87 Umea, Sweden. */ /* ===================================================================== */ -/* Subroutine */ int dgesc2_(integer *n, doublereal *a, integer *lda, +/* Subroutine */ void dgesc2_(integer *n, doublereal *a, integer *lda, doublereal *rhs, integer *ipiv, integer *jpiv, doublereal *scale) { /* System generated locals */ @@ -639,7 +639,7 @@ f"> */ /* Local variables */ doublereal temp; integer i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dlabad_(doublereal *, doublereal *); extern doublereal dlamch_(char *); extern integer idamax_(integer *, doublereal *, integer *); @@ -720,7 +720,7 @@ f"> */ i__1 = *n - 1; dlaswp_(&c__1, &rhs[1], lda, &c__1, &i__1, &jpiv[1], &c_n1); - return 0; + return; /* End of DGESC2 */ diff --git a/lapack-netlib/SRC/dgesdd.c b/lapack-netlib/SRC/dgesdd.c index 21c1338628..a84ed5006c 100644 --- a/lapack-netlib/SRC/dgesdd.c +++ b/lapack-netlib/SRC/dgesdd.c @@ -734,7 +734,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgesdd_(char *jobz, integer *m, integer *n, doublereal * +/* Subroutine */ void dgesdd_(char *jobz, integer *m, integer *n, doublereal * a, integer *lda, doublereal *s, doublereal *u, integer *ldu, doublereal *vt, integer *ldvt, doublereal *work, integer *lwork, integer *iwork, integer *info) @@ -750,7 +750,7 @@ f"> */ integer idum[1], ierr, itau, lwork_dormbr_qln_mm__, lwork_dormbr_qln_mn__, lwork_dormbr_qln_nn__, lwork_dormbr_prt_mm__, lwork_dormbr_prt_mn__, lwork_dormbr_prt_nn__, i__; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical 
lsame_(char *, char *); @@ -759,11 +759,11 @@ f"> */ integer nwork; logical wntqn, wntqo, wntqs; integer ie, lwork_dorgbr_p_mm__; - extern /* Subroutine */ int dbdsdc_(char *, char *, integer *, doublereal + extern /* Subroutine */ void dbdsdc_(char *, char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer il, lwork_dorgbr_q_nn__; - extern /* Subroutine */ int dgebrd_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgebrd_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *); @@ -771,7 +771,7 @@ f"> */ extern doublereal dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); integer iu; - extern /* Subroutine */ int dgelqf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgelqf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), @@ -779,13 +779,14 @@ f"> */ doublereal *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *), dlaset_(char *, integer *, integer *, - doublereal *, doublereal *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen), dorgbr_(char *, integer *, + doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dorgbr_(char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); extern logical disnan_(doublereal *); doublereal bignum; - extern /* Subroutine */ int dormbr_(char *, char *, char *, integer *, + extern /* Subroutine */ void dormbr_(char *, char *, char *, integer *, integer *, integer 
*, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *), dorglq_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, @@ -1238,15 +1239,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGESDD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1260,7 +1261,7 @@ f"> */ anrm = dlange_("M", m, n, &a[a_offset], lda, dum); if (disnan_(&anrm)) { *info = -4; - return 0; + return; } iscl = 0; if (anrm > 0. && anrm < smlnum) { @@ -2283,7 +2284,7 @@ f"> */ work[1] = (doublereal) maxwrk; - return 0; + return; /* End of DGESDD */ diff --git a/lapack-netlib/SRC/dgesv.c b/lapack-netlib/SRC/dgesv.c index 8af33e743a..ad0978250f 100644 --- a/lapack-netlib/SRC/dgesv.c +++ b/lapack-netlib/SRC/dgesv.c @@ -638,8 +638,10 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ integer a_dim1, a_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int dgetrf_(integer *, integer *, doublereal *, - integer *, integer *, integer *), xerbla_(char *, integer *, ftnlen), dgetrs_(char *, integer *, integer *, doublereal *, + extern /* Subroutine */ void dgetrf_(integer *, integer *, doublereal *, + integer *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dgetrs_(char *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); diff --git a/lapack-netlib/SRC/dgesvd.c b/lapack-netlib/SRC/dgesvd.c index 98b1e22799..32f136b7dc 100644 --- a/lapack-netlib/SRC/dgesvd.c +++ b/lapack-netlib/SRC/dgesvd.c @@ -729,7 +729,7 @@ f"> */ /* > \ingroup doubleGEsing */ /* ===================================================================== */ -/* Subroutine */ int dgesvd_(char *jobu, char *jobvt, integer *m, integer *n, +/* Subroutine */ 
void dgesvd_(char *jobu, char *jobvt, integer *m, integer *n, doublereal *a, integer *lda, doublereal *s, doublereal *u, integer * ldu, doublereal *vt, integer *ldvt, doublereal *work, integer *lwork, integer *info) @@ -745,20 +745,20 @@ f"> */ doublereal anrm; integer ierr, itau, ncvt, nrvt, lwork_dgebrd__, lwork_dgelqf__, lwork_dgeqrf__, i__; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); integer chunk, minmn, wrkbl, itaup, itauq, mnthr, iwork; logical wntua, wntva, wntun, wntuo, wntvn, wntvo, wntus, wntvs; integer ie; - extern /* Subroutine */ int dgebrd_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgebrd_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *), dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); integer ir, bdspac, iu; - extern /* Subroutine */ int dgelqf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgelqf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), @@ -776,7 +776,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int dormbr_(char *, char *, char *, integer *, + extern /* Subroutine */ void dormbr_(char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *), dorglq_(integer 
*, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, @@ -1382,15 +1382,15 @@ f"> */ if (*info != 0) { i__2 = -(*info); xerbla_("DGESVD", &i__2, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Get machine constants */ @@ -4591,7 +4591,7 @@ f"> */ work[1] = (doublereal) maxwrk; - return 0; + return; /* End of DGESVD */ diff --git a/lapack-netlib/SRC/dgesvdq.c b/lapack-netlib/SRC/dgesvdq.c index b69afddb4d..c7f2001c3a 100644 --- a/lapack-netlib/SRC/dgesvdq.c +++ b/lapack-netlib/SRC/dgesvdq.c @@ -932,7 +932,7 @@ static logical c_false = FALSE_; /* > \ingroup doubleGEsing */ /* ===================================================================== */ -/* Subroutine */ int dgesvdq_(char *joba, char *jobp, char *jobr, char *jobu, +/* Subroutine */ void dgesvdq_(char *joba, char *jobp, char *jobr, char *jobu, char *jobv, integer *m, integer *n, doublereal *a, integer *lda, doublereal *s, doublereal *u, integer *ldu, doublereal *v, integer * ldv, integer *numrank, integer *iwork, integer *liwork, doublereal * @@ -953,7 +953,7 @@ static logical c_false = FALSE_; logical acclh, acclm; integer p, q; logical conda; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); integer iwoff; @@ -965,32 +965,35 @@ static logical c_false = FALSE_; logical dntwu, dntwv, wntua; integer lworq; logical wntuf, wntva, wntur, wntus, wntvr; - extern /* Subroutine */ int dgeqp3_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeqp3_(integer *, integer *, doublereal *, integer *, integer *, doublereal *, doublereal *, integer *, integer *); integer lwsvd2, lworq2; extern doublereal dlamch_(char *), dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); integer 
nr; - extern /* Subroutine */ int dgelqf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgelqf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); extern integer idamax_(integer *, doublereal *, integer *); doublereal sconda; - extern /* Subroutine */ int dgeqrf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeqrf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dgesvd_(char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *), dlaset_(char *, integer *, - integer *, doublereal *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen), dlapmt_(logical *, + integer *, doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dlapmt_(logical *, integer *, integer *, doublereal *, integer *, integer *), dpocon_(char *, integer *, doublereal *, integer *, doublereal *, - doublereal *, doublereal *, integer *, integer *), - dlaswp_(integer *, doublereal *, integer *, integer *, integer *, - integer *, integer *), dormlq_(char *, char *, integer *, integer + doublereal *, doublereal *, integer *, integer *); + extern int dlaswp_(integer *, doublereal *, integer *, integer *, integer *, + integer *, integer *); + extern void dormlq_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *), dormqr_(char *, char *, integer *, integer *, integer *, @@ -1380,7 +1383,7 @@ static logical c_false = FALSE_; if (*info != 0) { i__1 = -(*info); xerbla_("DGESVDQ", 
&i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { /* Return optimal workspace */ @@ -1389,13 +1392,13 @@ static logical c_false = FALSE_; work[1] = (doublereal) optwrk; work[2] = (doublereal) minwrk; rwork[1] = (doublereal) rminwrk; - return 0; + return; } /* Quick return if the matrix is void. */ if (*m == 0 || *n == 0) { - return 0; + return; } big = dlamch_("O"); @@ -1414,7 +1417,7 @@ static logical c_false = FALSE_; *info = -8; i__2 = -(*info); xerbla_("DGESVDQ", &i__2, (ftnlen)7); - return 0; + return; } /* L1904: */ } @@ -1465,7 +1468,7 @@ static logical c_false = FALSE_; rwork[1] = -1.; } rwork[2] = -1.; - return 0; + return; } if (rwork[1] > big / sqrt((doublereal) (*m))) { @@ -1489,7 +1492,7 @@ static logical c_false = FALSE_; *info = -8; i__1 = -(*info); xerbla_("DGESVDQ", &i__1, (ftnlen)7); - return 0; + return; } if (rtmp > big / sqrt((doublereal) (*m))) { /* matrix by 1/sqrt(M) if too large entry detected */ @@ -2244,7 +2247,7 @@ static logical c_false = FALSE_; /* full row rank triangular (trapezoidal) factor of A. 
*/ *numrank = nr; - return 0; + return; /* End of DGESVDQ */ diff --git a/lapack-netlib/SRC/dgesvdx.c b/lapack-netlib/SRC/dgesvdx.c index 5d0d79941d..3f3a8eb4fe 100644 --- a/lapack-netlib/SRC/dgesvdx.c +++ b/lapack-netlib/SRC/dgesvdx.c @@ -779,7 +779,7 @@ static doublereal c_b109 = 0.; /* > \ingroup doubleGEsing */ /* ===================================================================== */ -/* Subroutine */ int dgesvdx_(char *jobu, char *jobvt, char *range, integer * +/* Subroutine */ void dgesvdx_(char *jobu, char *jobvt, char *range, integer * m, integer *n, doublereal *a, integer *lda, doublereal *vl, doublereal *vu, integer *il, integer *iu, integer *ns, doublereal *s, doublereal *u, integer *ldu, doublereal *vt, integer *ldvt, @@ -802,17 +802,17 @@ static doublereal c_b109 = 0.; integer i__, j; extern logical lsame_(char *, char *); integer iltgk, itemp, minmn; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer itaup, itauq, iutgk, itgkz, mnthr; logical wantu; integer id, ie; - extern /* Subroutine */ int dgebrd_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgebrd_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *), dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dgelqf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgelqf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), @@ -820,16 +820,16 @@ static doublereal c_b109 = 0.; doublereal *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal *, integer *, 
doublereal *, integer *), dlaset_(char *, integer *, integer *, - doublereal *, doublereal *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); doublereal bignum, abstol; - extern /* Subroutine */ int dormbr_(char *, char *, char *, integer *, + extern /* Subroutine */ void dormbr_(char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); char rngtgk[1]; - extern /* Subroutine */ int dormlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormlq_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *), dormqr_(char *, char *, integer *, integer *, integer *, @@ -839,7 +839,7 @@ static doublereal c_b109 = 0.; doublereal smlnum; logical lquery, wantvt; doublereal dum[1], eps; - extern /* Subroutine */ int dbdsvdx_(char *, char *, char *, integer *, + extern /* Subroutine */ void dbdsvdx_(char *, char *, char *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); @@ -1068,15 +1068,15 @@ static doublereal c_b109 = 0.; if (*info != 0) { i__2 = -(*info); xerbla_("DGESVDX", &i__2, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Set singular values indices accord to RANGE. 
*/ @@ -1458,7 +1458,7 @@ static doublereal c_b109 = 0.; work[1] = (doublereal) maxwrk; - return 0; + return; /* End of DGESVDX */ diff --git a/lapack-netlib/SRC/dgesvj.c b/lapack-netlib/SRC/dgesvj.c index 8582ded18b..88ee26ce6c 100644 --- a/lapack-netlib/SRC/dgesvj.c +++ b/lapack-netlib/SRC/dgesvj.c @@ -853,7 +853,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgesvj_(char *joba, char *jobu, char *jobv, integer *m, +/* Subroutine */ void dgesvj_(char *joba, char *jobu, char *jobv, integer *m, integer *n, doublereal *a, integer *lda, doublereal *sva, integer *mv, doublereal *v, integer *ldv, doublereal *work, integer *lwork, integer *info) @@ -875,27 +875,27 @@ f"> */ doublereal temp1; integer i__, p, q; doublereal t; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal large, apoaq, aqoap; extern logical lsame_(char *, char *); doublereal theta, small, sfmin; logical lsvec; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); doublereal fastr[5]; - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); doublereal epsln; logical applv, rsvec; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical uctol; - extern /* Subroutine */ int drotm_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drotm_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *); logical lower, upper, rotok; integer n2, n4; - extern /* Subroutine */ int dgsvj0_(char *, integer 
*, integer *, + extern /* Subroutine */ void dgsvj0_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *), dgsvj1_( @@ -908,16 +908,16 @@ f"> */ doublereal cs; extern doublereal dlamch_(char *); doublereal sn; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, - doublereal *, doublereal *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, + doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer ijblsk, swband, blskip; doublereal mxaapq; - extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); doublereal thsign, mxsinj; integer ir1, emptsw, notrot, iswrot, jbc; @@ -1001,13 +1001,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGESVJ", &i__1, (ftnlen)6); - return 0; + return; } /* #:) Quick return for void matrix */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Set numerical parameters */ @@ -1049,7 +1049,7 @@ f"> */ *info = -4; i__1 = -(*info); xerbla_("DGESVJ", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize the right singular vector matrix. 
*/ @@ -1087,7 +1087,7 @@ f"> */ *info = -6; i__2 = -(*info); xerbla_("DGESVJ", &i__2, (ftnlen)6); - return 0; + return; } aaqq = sqrt(aaqq); if (aapp < big / aaqq && noscale) { @@ -1117,7 +1117,7 @@ f"> */ *info = -6; i__2 = -(*info); xerbla_("DGESVJ", &i__2, (ftnlen)6); - return 0; + return; } aaqq = sqrt(aaqq); if (aapp < big / aaqq && noscale) { @@ -1147,7 +1147,7 @@ f"> */ *info = -6; i__2 = -(*info); xerbla_("DGESVJ", &i__2, (ftnlen)6); - return 0; + return; } aaqq = sqrt(aaqq); if (aapp < big / aaqq && noscale) { @@ -1203,7 +1203,7 @@ f"> */ work[4] = 0.; work[5] = 0.; work[6] = 0.; - return 0; + return; } /* #:) Quick return for one-column matrix */ @@ -1223,7 +1223,7 @@ f"> */ work[4] = 0.; work[5] = 0.; work[6] = 0.; - return 0; + return; } /* Protect small singular values from underflow, and try to */ @@ -2352,6 +2352,6 @@ f"> */ /* MXSINJ is the largest absolute value of the sines of Jacobi angles */ /* in the last sweep */ - return 0; + return; } /* dgesvj_ */ diff --git a/lapack-netlib/SRC/dgesvx.c b/lapack-netlib/SRC/dgesvx.c index 46e72aa7e5..646436f891 100644 --- a/lapack-netlib/SRC/dgesvx.c +++ b/lapack-netlib/SRC/dgesvx.c @@ -856,7 +856,7 @@ f"> */ /* > \ingroup doubleGEsolve */ /* ===================================================================== */ -/* Subroutine */ int dgesvx_(char *fact, char *trans, integer *n, integer * +/* Subroutine */ void dgesvx_(char *fact, char *trans, integer *n, integer * nrhs, doublereal *a, integer *lda, doublereal *af, integer *ldaf, integer *ipiv, char *equed, doublereal *r__, doublereal *c__, doublereal *b, integer *ldb, doublereal *x, integer *ldx, doublereal * @@ -877,23 +877,24 @@ f"> */ logical equil; extern doublereal dlamch_(char *), dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dlaqge_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dlaqge_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, 
doublereal *, doublereal *, doublereal *, char *), dgecon_(char *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *); doublereal colcnd; logical nofact; - extern /* Subroutine */ int dgeequ_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeequ_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *), dgerfs_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, - doublereal *, doublereal *, integer *, integer *), - dgetrf_(integer *, integer *, doublereal *, integer *, integer *, - integer *), dlacpy_(char *, integer *, integer *, doublereal *, - integer *, doublereal *, integer *), xerbla_(char *, - integer *, ftnlen); + doublereal *, doublereal *, integer *, integer *); + extern int dgetrf_(integer *, integer *, doublereal *, integer *, integer *, + integer *); + extern void dlacpy_(char *, integer *, integer *, doublereal *, + integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal bignum; extern doublereal dlantr_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *); @@ -1030,7 +1031,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGESVX", &i__1, (ftnlen)6); - return 0; + return; } if (equil) { @@ -1101,7 +1102,7 @@ f"> */ } work[1] = rpvgrw; *rcond = 0.; - return 0; + return; } } @@ -1184,7 +1185,7 @@ f"> */ if (*rcond < dlamch_("Epsilon")) { *info = *n + 1; } - return 0; + return; /* End of DGESVX */ diff --git a/lapack-netlib/SRC/dgesvxx.c b/lapack-netlib/SRC/dgesvxx.c index eea6eab068..6c80d6b6dd 100644 --- a/lapack-netlib/SRC/dgesvxx.c +++ b/lapack-netlib/SRC/dgesvxx.c @@ -1045,7 +1045,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doubleGEsolve */ /* 
===================================================================== */ -/* Subroutine */ int dgesvxx_(char *fact, char *trans, integer *n, integer * +/* Subroutine */ void dgesvxx_(char *fact, char *trans, integer *n, integer * nrhs, doublereal *a, integer *lda, doublereal *af, integer *ldaf, integer *ipiv, char *equed, doublereal *r__, doublereal *c__, doublereal *b, integer *ldb, doublereal *x, integer *ldx, doublereal * @@ -1069,25 +1069,26 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ doublereal rcmin, rcmax; logical equil; extern doublereal dlamch_(char *); - extern /* Subroutine */ int dlaqge_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dlaqge_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, char *); doublereal colcnd; logical nofact; - extern /* Subroutine */ int dgetrf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgetrf_(integer *, integer *, doublereal *, integer *, integer *, integer *), dlacpy_(char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal bignum; integer infequ; logical colequ; - extern /* Subroutine */ int dgetrs_(char *, integer *, integer *, + extern /* Subroutine */ void dgetrs_(char *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); doublereal rowcnd; logical notran; doublereal smlnum; logical rowequ; - extern /* Subroutine */ int dlascl2_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dlascl2_(integer *, integer *, doublereal *, doublereal *, integer *), dgeequb_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *), dgerfsx_(char *, char *, @@ -1231,7 +1232,7 @@ 
static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("DGESVXX", &i__1, (ftnlen)7); - return 0; + return; } if (equil) { @@ -1297,7 +1298,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rpvgrw = dla_gerpvgrw__(n, info, &a[a_offset], lda, &af[ af_offset], ldaf); - return 0; + return; } } @@ -1328,7 +1329,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ dlascl2_(n, nrhs, &r__[1], &x[x_offset], ldx); } - return 0; + return; /* End of DGESVXX */ } /* dgesvxx_ */ diff --git a/lapack-netlib/SRC/dgetc2.c b/lapack-netlib/SRC/dgetc2.c index 7d79601eb7..1c33db5dc5 100644 --- a/lapack-netlib/SRC/dgetc2.c +++ b/lapack-netlib/SRC/dgetc2.c @@ -625,7 +625,7 @@ f"> */ /* > Umea University, S-901 87 Umea, Sweden. */ /* ===================================================================== */ -/* Subroutine */ int dgetc2_(integer *n, doublereal *a, integer *lda, integer +/* Subroutine */ void dgetc2_(integer *n, doublereal *a, integer *lda, integer *ipiv, integer *jpiv, integer *info) { /* System generated locals */ @@ -633,12 +633,12 @@ f"> */ doublereal d__1; /* Local variables */ - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); doublereal smin, xmax; integer i__, j; - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *), dlabad_(doublereal *, doublereal *); extern doublereal dlamch_(char *); integer ip, jp; @@ -668,7 +668,7 @@ f"> */ /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Set constants to control overflow */ @@ -687,7 +687,7 @@ f"> */ *info = 1; a[a_dim1 + 1] = smlnum; } - return 0; + return; } /* Factorize A using 
complete pivoting. */ @@ -762,7 +762,7 @@ f"> */ ipiv[*n] = *n; jpiv[*n] = *n; - return 0; + return; /* End of DGETC2 */ diff --git a/lapack-netlib/SRC/dgetf2.c b/lapack-netlib/SRC/dgetf2.c index c3eef92661..e0f7dd74cb 100644 --- a/lapack-netlib/SRC/dgetf2.c +++ b/lapack-netlib/SRC/dgetf2.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup doubleGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgetf2_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgetf2_(integer *m, integer *n, doublereal *a, integer * lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -631,14 +631,14 @@ f"> */ doublereal d__1; /* Local variables */ - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sfmin; - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); extern doublereal dlamch_(char *); integer jp; @@ -675,13 +675,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGETF2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Compute machine safe minimum */ @@ -736,7 +736,7 @@ f"> */ } /* L10: */ } - return 0; + return; /* End of DGETF2 */ diff --git a/lapack-netlib/SRC/dgetrf.c b/lapack-netlib/SRC/dgetrf.c index 89472f0e1c..6e3faef5b9 100644 --- a/lapack-netlib/SRC/dgetrf.c +++ b/lapack-netlib/SRC/dgetrf.c @@ -624,7 +624,7 @@ f"> */ /* > \ingroup doubleGEcomputational */ /* 
===================================================================== */ -/* Subroutine */ int dgetrf_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgetrf_(integer *m, integer *n, doublereal *a, integer * lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -632,18 +632,18 @@ f"> */ /* Local variables */ integer i__, j; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer iinfo; - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer jb, nb; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int dlaswp_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlaswp_(integer *, doublereal *, integer *, integer *, integer *, integer *, integer *), dgetrf2_(integer *, integer *, doublereal *, integer *, integer *, integer *); @@ -677,13 +677,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGETRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Determine the block size for this environment. 
*/ @@ -761,7 +761,7 @@ f"> */ /* L20: */ } } - return 0; + return; /* End of DGETRF */ diff --git a/lapack-netlib/SRC/dgetrf2.c b/lapack-netlib/SRC/dgetrf2.c index 1aa89f1142..802868e7de 100644 --- a/lapack-netlib/SRC/dgetrf2.c +++ b/lapack-netlib/SRC/dgetrf2.c @@ -625,7 +625,7 @@ static doublereal c_b16 = -1.; /* > \ingroup doubleGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgetrf2_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dgetrf2_(integer *m, integer *n, doublereal *a, integer * lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -635,19 +635,20 @@ static doublereal c_b16 = -1.; /* Local variables */ doublereal temp; integer i__; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dgemm_(char *, char *, integer *, integer *, integer * , doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer iinfo; doublereal sfmin; - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer n1, n2; extern doublereal dlamch_(char *); extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dlaswp_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int dlaswp_( integer *, doublereal *, integer *, integer *, integer *, integer *, integer *); @@ -681,13 +682,13 @@ static doublereal c_b16 = -1.; if (*info != 0) { i__1 = -(*info); xerbla_("DGETRF2", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } if (*m == 1) { @@ -799,7 +800,7 @@ static doublereal c_b16 = -1.; 
dlaswp_(&n1, &a[a_dim1 + 1], lda, &i__1, &i__2, &ipiv[1], &c__1); } - return 0; + return; /* End of DGETRF2 */ diff --git a/lapack-netlib/SRC/dgetri.c b/lapack-netlib/SRC/dgetri.c index e1e0b74ce9..6f5b386f6e 100644 --- a/lapack-netlib/SRC/dgetri.c +++ b/lapack-netlib/SRC/dgetri.c @@ -631,7 +631,7 @@ f"> */ /* > \ingroup doubleGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgetri_(integer *n, doublereal *a, integer *lda, integer +/* Subroutine */ void dgetri_(integer *n, doublereal *a, integer *lda, integer *ipiv, doublereal *work, integer *lwork, integer *info) { /* System generated locals */ @@ -639,14 +639,14 @@ f"> */ /* Local variables */ integer i__, j; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer nbmin; - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *), dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); @@ -697,15 +697,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGETRI", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form inv(U). 
If INFO > 0 from DTRTRI, then U is singular, */ @@ -713,7 +713,7 @@ f"> */ dtrtri_("Upper", "Non-unit", n, &a[a_offset], lda, info); if (*info > 0) { - return 0; + return; } nbmin = 2; @@ -810,7 +810,7 @@ f"> */ } work[1] = (doublereal) iws; - return 0; + return; /* End of DGETRI */ diff --git a/lapack-netlib/SRC/dgetrs.c b/lapack-netlib/SRC/dgetrs.c index e2d312fa15..50b962b61d 100644 --- a/lapack-netlib/SRC/dgetrs.c +++ b/lapack-netlib/SRC/dgetrs.c @@ -636,7 +636,7 @@ f"> */ /* > \ingroup doubleGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgetrs_(char *trans, integer *n, integer *nrhs, +/* Subroutine */ void dgetrs_(char *trans, integer *n, integer *nrhs, doublereal *a, integer *lda, integer *ipiv, doublereal *b, integer * ldb, integer *info) { @@ -645,10 +645,11 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, - doublereal *, integer *), xerbla_( - char *, integer *, ftnlen), dlaswp_(integer *, doublereal *, + doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dlaswp_(integer *, doublereal *, integer *, integer *, integer *, integer *, integer *); logical notran; @@ -691,13 +692,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGETRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (notran) { @@ -736,7 +737,7 @@ f"> */ dlaswp_(nrhs, &b[b_offset], ldb, &c__1, n, &ipiv[1], &c_n1); } - return 0; + return; /* End of DGETRS */ diff --git a/lapack-netlib/SRC/dgetsls.c b/lapack-netlib/SRC/dgetsls.c index b2138e686c..221c4d5e04 100644 --- a/lapack-netlib/SRC/dgetsls.c +++ b/lapack-netlib/SRC/dgetsls.c @@ -674,7 +674,7 @@ static integer c__0 = 0; /* > 
\ingroup doubleGEsolve */ /* ===================================================================== */ -/* Subroutine */ int dgetsls_(char *trans, integer *m, integer *n, integer * +/* Subroutine */ void dgetsls_(char *trans, integer *m, integer *n, integer * nrhs, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *work, integer *lwork, integer *info) { @@ -685,27 +685,28 @@ static integer c__0 = 0; doublereal anrm, bnrm; logical tran; integer brow, tszm, tszo, info2, i__, j, iascl, ibscl; - extern /* Subroutine */ int dgelq_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgelq_(integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgeqr_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeqr_(integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer minmn, maxmn; doublereal workq[1]; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); extern doublereal dlamch_(char *), dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); doublereal tq[5]; - extern /* Subroutine */ int dgemlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemlq_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *), dlaset_(char *, integer *, integer *, doublereal - *, doublereal *, doublereal *, integer *), xerbla_(char *, - integer *, ftnlen), dgemqr_(char *, char *, integer *, integer *, + *, doublereal *, 
doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dgemqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer scllen; @@ -819,7 +820,7 @@ static integer c__0 = 0; i__1 = -(*info); xerbla_("DGETSLS", &i__1, (ftnlen)7); work[1] = (doublereal) wsizeo; - return 0; + return; } if (lquery) { if (*lwork == -1) { @@ -828,7 +829,7 @@ static integer c__0 = 0; if (*lwork == -2) { work[1] = (real) wsizem; } - return 0; + return; } if (*lwork < wsizeo) { lw1 = tszm; @@ -845,7 +846,7 @@ static integer c__0 = 0; if (f2cmin(i__1,*nrhs) == 0) { i__1 = f2cmax(*m,*n); dlaset_("FULL", &i__1, nrhs, &c_b23, &c_b23, &b[b_offset], ldb); - return 0; + return; } /* Get machine parameters */ @@ -922,7 +923,7 @@ static integer c__0 = 0; dtrtrs_("U", "N", "N", n, nrhs, &a[a_offset], lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } scllen = *n; } else { @@ -935,7 +936,7 @@ static integer c__0 = 0; ldb, info); if (*info > 0) { - return 0; + return; } /* B(N+1:M,1:NRHS) = ZERO */ @@ -978,7 +979,7 @@ static integer c__0 = 0; ldb, info); if (*info > 0) { - return 0; + return; } /* B(M+1:N,1:NRHS) = 0 */ @@ -1019,7 +1020,7 @@ static integer c__0 = 0; lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } scllen = *m; @@ -1047,7 +1048,7 @@ static integer c__0 = 0; L50: work[1] = (doublereal) (tszo + lwo); - return 0; + return; /* End of DGETSLS */ diff --git a/lapack-netlib/SRC/dgetsqrhrt.c b/lapack-netlib/SRC/dgetsqrhrt.c index 2194683813..ae9e16bb92 100644 --- a/lapack-netlib/SRC/dgetsqrhrt.c +++ b/lapack-netlib/SRC/dgetsqrhrt.c @@ -689,7 +689,7 @@ hrt.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgetsqrhrt_(integer *m, integer *n, integer *mb1, +/* Subroutine */ void dgetsqrhrt_(integer *m, integer *n, integer *mb1, 
integer *nb1, integer *nb2, doublereal *a, integer *lda, doublereal * t, integer *ldt, doublereal *work, integer *lwork, integer *info) { @@ -699,17 +699,18 @@ hrt.f"> */ /* Local variables */ integer ldwt, lworkopt, i__, j; - extern /* Subroutine */ int dorgtsqr_row_(integer *, integer *, integer * + extern /* Subroutine */ void dorgtsqr_row_(integer *, integer *, integer * , integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer iinfo; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dorhr_col_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, - doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical lquery; integer lw1, lw2, num_all_row_blocks__, lwt; - extern /* Subroutine */ int dlatsqr_(integer *, integer *, integer *, + extern /* Subroutine */ void dlatsqr_(integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer nb1local, nb2local; @@ -812,17 +813,17 @@ hrt.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGETSQRHRT", &i__1, (ftnlen)10); - return 0; + return; } else if (lquery) { work[1] = (doublereal) lworkopt; - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { work[1] = (doublereal) lworkopt; - return 0; + return; } nb2local = f2cmin(*nb2,*n); @@ -884,7 +885,7 @@ hrt.f"> */ } work[1] = (doublereal) lworkopt; - return 0; + return; /* End of DGETSQRHRT */ diff --git a/lapack-netlib/SRC/dggbak.c b/lapack-netlib/SRC/dggbak.c index 2e5d8cebe1..a81fe75e97 100644 --- a/lapack-netlib/SRC/dggbak.c +++ b/lapack-netlib/SRC/dggbak.c @@ -655,7 +655,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ 
-/* Subroutine */ int dggbak_(char *job, char *side, integer *n, integer *ilo, +/* Subroutine */ void dggbak_(char *job, char *side, integer *n, integer *ilo, integer *ihi, doublereal *lscale, doublereal *rscale, integer *m, doublereal *v, integer *ldv, integer *info) { @@ -664,10 +664,10 @@ f"> */ /* Local variables */ integer i__, k; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); logical leftv; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -720,19 +720,19 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGGBAK", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*m == 0) { - return 0; + return; } if (lsame_(job, "N")) { - return 0; + return; } if (*ilo == *ihi) { @@ -838,7 +838,7 @@ f"> */ L110: - return 0; + return; /* End of DGGBAK */ diff --git a/lapack-netlib/SRC/dggbal.c b/lapack-netlib/SRC/dggbal.c index 0eea86f182..422dc2d1eb 100644 --- a/lapack-netlib/SRC/dggbal.c +++ b/lapack-netlib/SRC/dggbal.c @@ -691,7 +691,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dggbal_(char *job, integer *n, doublereal *a, integer * +/* Subroutine */ void dggbal_(char *job, integer *n, doublereal *a, integer * lda, doublereal *b, integer *ldb, integer *ilo, integer *ihi, doublereal *lscale, doublereal *rscale, doublereal *work, integer * info) @@ -710,14 +710,14 @@ f"> */ doublereal coef2, coef5; integer i__, j, k, l, m; doublereal gamma, t, alpha; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, 
doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); doublereal sfmin, sfmax; - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); integer iflow; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer kount, jc; doublereal ta, tb, tc; @@ -770,7 +770,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGGBAL", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -778,7 +778,7 @@ f"> */ if (*n == 0) { *ilo = 1; *ihi = *n; - return 0; + return; } if (*n == 1) { @@ -786,7 +786,7 @@ f"> */ *ihi = *n; lscale[1] = 1.; rscale[1] = 1.; - return 0; + return; } if (lsame_(job, "N")) { @@ -798,7 +798,7 @@ f"> */ rscale[i__] = 1.; /* L10: */ } - return 0; + return; } k = 1; @@ -931,11 +931,11 @@ f"> */ rscale[i__] = 1.; /* L195: */ } - return 0; + return; } if (*ilo == *ihi) { - return 0; + return; } /* Balance the submatrix in rows ILO to IHI. 
*/ @@ -1187,7 +1187,7 @@ f"> */ /* L380: */ } - return 0; + return; /* End of DGGBAL */ diff --git a/lapack-netlib/SRC/dgges.c b/lapack-netlib/SRC/dgges.c index 0bdd2dbb60..399d12bc68 100644 --- a/lapack-netlib/SRC/dgges.c +++ b/lapack-netlib/SRC/dgges.c @@ -798,7 +798,7 @@ or GE matrices */ /* > \ingroup doubleGEeigen */ /* ===================================================================== */ -/* Subroutine */ int dgges_(char *jobvsl, char *jobvsr, char *sort, L_fp +/* Subroutine */ void dgges_(char *jobvsl, char *jobvsr, char *sort, L_fp selctg, integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, integer *sdim, doublereal *alphar, doublereal *alphai, doublereal *beta, doublereal *vsl, integer *ldvsl, doublereal *vsr, @@ -819,7 +819,7 @@ or GE matrices */ integer ileft, icols; logical cursl, ilvsl, ilvsr; integer irows; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *), dggbak_( + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *), dggbak_( char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *), dggbal_(char *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, doublereal *, @@ -829,23 +829,23 @@ or GE matrices */ integer ip; extern doublereal dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgghrd_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *), dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); logical ilascl, ilbscl; - extern /* Subroutine */ int dgeqrf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeqrf_(integer *, integer *, doublereal *, integer *, doublereal *, 
doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *); doublereal safmin; - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *); doublereal safmax; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; - extern /* Subroutine */ int dhgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void dhgeqz_(char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, @@ -859,12 +859,12 @@ or GE matrices */ extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ijobvr; - extern /* Subroutine */ int dorgqr_(integer *, integer *, integer *, + extern /* Subroutine */ void dorgqr_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); doublereal anrmto, bnrmto; logical lastsl; - extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer minwrk, maxwrk; @@ -990,16 +990,16 @@ or GE matrices */ if (*info != 0) { i__1 = -(*info); xerbla_("DGGES ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *sdim = 0; - return 0; + return; } /* Get machine constants */ @@ -1283,7 +1283,7 @@ or GE matrices */ work[1] = (doublereal) maxwrk; - return 0; + return; /* End of DGGES */ diff --git a/lapack-netlib/SRC/dgges3.c b/lapack-netlib/SRC/dgges3.c index 76ff7c6acc..89fca03268 100644 
--- a/lapack-netlib/SRC/dgges3.c +++ b/lapack-netlib/SRC/dgges3.c @@ -796,7 +796,7 @@ f"> */ /* > \ingroup doubleGEeigen */ /* ===================================================================== */ -/* Subroutine */ int dgges3_(char *jobvsl, char *jobvsr, char *sort, L_fp +/* Subroutine */ void dgges3_(char *jobvsl, char *jobvsr, char *sort, L_fp selctg, integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, integer *sdim, doublereal *alphar, doublereal *alphai, doublereal *beta, doublereal *vsl, integer *ldvsl, doublereal *vsr, @@ -816,12 +816,12 @@ f"> */ extern logical lsame_(char *, char *); integer ileft, icols; logical cursl, ilvsl, ilvsr; - extern /* Subroutine */ int dgghd3_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgghd3_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer irows; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *), dggbak_( + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *), dggbak_( char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *), dggbal_(char *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, doublereal *, @@ -831,21 +831,21 @@ f"> */ integer ip; extern doublereal dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); logical ilascl, ilbscl; - extern /* Subroutine */ int dgeqrf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeqrf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dlacpy_(char *, integer 
*, integer *, doublereal *, integer *, doublereal *, integer *); doublereal safmin; - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *); doublereal safmax; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; - extern /* Subroutine */ int dhgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void dhgeqz_(char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, @@ -856,12 +856,12 @@ f"> */ doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, integer *, integer *); integer ijobvl, iright, ijobvr; - extern /* Subroutine */ int dorgqr_(integer *, integer *, integer *, + extern /* Subroutine */ void dorgqr_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); doublereal anrmto, bnrmto; logical lastsl; - extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); doublereal smlnum; @@ -998,16 +998,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGGES3 ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *sdim = 0; - return 0; + return; } /* Get machine constants */ @@ -1285,7 +1285,7 @@ f"> */ work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DGGES3 */ diff --git a/lapack-netlib/SRC/dggesx.c b/lapack-netlib/SRC/dggesx.c index 1a87a12c24..995b5f01e2 100644 --- a/lapack-netlib/SRC/dggesx.c +++ b/lapack-netlib/SRC/dggesx.c @@ -878,7 +878,7 @@ f"> */ 
/* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dggesx_(char *jobvsl, char *jobvsr, char *sort, L_fp +/* Subroutine */ void dggesx_(char *jobvsl, char *jobvsr, char *sort, L_fp selctg, char *sense, integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, integer *sdim, doublereal *alphar, doublereal *alphai, doublereal *beta, doublereal *vsl, integer *ldvsl, @@ -899,7 +899,7 @@ f"> */ integer ileft, icols; logical cursl, ilvsl, ilvsr; integer irows; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *), dggbak_( + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *), dggbak_( char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *), dggbal_(char *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, doublereal *, @@ -910,31 +910,31 @@ f"> */ extern doublereal dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); doublereal pl; - extern /* Subroutine */ int dgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgghrd_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); doublereal pr; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); logical ilascl, ilbscl; - extern /* Subroutine */ int dgeqrf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeqrf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *); doublereal safmin; - extern /* Subroutine */ int dlaset_(char *, integer *, integer 
*, + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *); doublereal safmax; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; - extern /* Subroutine */ int dhgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void dhgeqz_(char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer ijobvl, iright; - extern /* Subroutine */ int dtgsen_(integer *, logical *, logical *, + extern /* Subroutine */ void dtgsen_(integer *, logical *, logical *, logical *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, doublereal *, @@ -947,13 +947,13 @@ f"> */ integer liwmin; logical wantse, lastsl; doublereal anrmto, bnrmto; - extern /* Subroutine */ int dorgqr_(integer *, integer *, integer *, + extern /* Subroutine */ void dorgqr_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); integer minwrk, maxwrk; logical wantsn; doublereal smlnum; - extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); logical wantst, lquery, wantsv; @@ -1111,16 +1111,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGGESX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *sdim = 0; - return 0; + return; } /* Get machine constants */ @@ -1432,7 +1432,7 @@ f"> */ work[1] = (doublereal) maxwrk; iwork[1] = liwmin; - return 0; + return; /* End of DGGESX 
*/ diff --git a/lapack-netlib/SRC/dggev.c b/lapack-netlib/SRC/dggev.c index 792053cebe..7a8f34bb89 100644 --- a/lapack-netlib/SRC/dggev.c +++ b/lapack-netlib/SRC/dggev.c @@ -743,7 +743,7 @@ ices */ /* > \ingroup doubleGEeigen */ /* ===================================================================== */ -/* Subroutine */ int dggev_(char *jobvl, char *jobvr, integer *n, doublereal * +/* Subroutine */ void dggev_(char *jobvl, char *jobvr, integer *n, doublereal * a, integer *lda, doublereal *b, integer *ldb, doublereal *alphar, doublereal *alphai, doublereal *beta, doublereal *vl, integer *ldvl, doublereal *vr, integer *ldvr, doublereal *work, integer *lwork, @@ -762,9 +762,9 @@ ices */ integer iwrk; extern logical lsame_(char *, char *); integer ileft, icols, irows; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); integer jc; - extern /* Subroutine */ int dggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dggbak_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *), dggbal_(char *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, @@ -773,13 +773,13 @@ ices */ extern doublereal dlamch_(char *), dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); integer jr; - extern /* Subroutine */ int dgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgghrd_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *), dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); logical ilascl, ilbscl; - extern /* Subroutine */ int dgeqrf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeqrf_(integer *, integer *, doublereal *, integer *, 
doublereal *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *), dlaset_(char *, integer *, @@ -790,19 +790,20 @@ ices */ logical ldumma[1]; char chtemp[1]; doublereal bignum; - extern /* Subroutine */ int dhgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void dhgeqz_(char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ijobvl, iright, ijobvr; - extern /* Subroutine */ int dorgqr_(integer *, integer *, integer *, + extern /* Subroutine */ void dorgqr_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); doublereal anrmto, bnrmto; - extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer minwrk, maxwrk; @@ -922,15 +923,15 @@ ices */ if (*info != 0) { i__1 = -(*info); xerbla_("DGGEV ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1218,7 +1219,7 @@ ices */ } work[1] = (doublereal) maxwrk; - return 0; + return; /* End of DGGEV */ diff --git a/lapack-netlib/SRC/dggev3.c b/lapack-netlib/SRC/dggev3.c index f4cf9f1924..fa8f27dbfa 100644 --- a/lapack-netlib/SRC/dggev3.c +++ b/lapack-netlib/SRC/dggev3.c @@ -742,7 +742,7 @@ f"> */ /* > \ingroup doubleGEeigen */ /* 
===================================================================== */ -/* Subroutine */ int dggev3_(char *jobvl, char *jobvr, integer *n, doublereal +/* Subroutine */ void dggev3_(char *jobvl, char *jobvr, integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *alphar, doublereal *alphai, doublereal *beta, doublereal *vl, integer *ldvl, doublereal *vr, integer *ldvr, doublereal *work, integer *lwork, @@ -761,14 +761,14 @@ f"> */ integer iwrk; extern logical lsame_(char *, char *); integer ileft, icols; - extern /* Subroutine */ int dgghd3_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgghd3_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer irows; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); integer jc; - extern /* Subroutine */ int dggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dggbak_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *), dggbal_(char *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, @@ -777,11 +777,11 @@ f"> */ extern doublereal dlamch_(char *), dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); integer jr; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); logical ilascl, ilbscl; - extern /* Subroutine */ int dgeqrf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeqrf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal *, 
integer *, doublereal *, integer *), dlaset_(char *, integer *, @@ -792,17 +792,18 @@ f"> */ logical ldumma[1]; char chtemp[1]; doublereal bignum; - extern /* Subroutine */ int dhgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void dhgeqz_(char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); integer ijobvl, iright, ijobvr; - extern /* Subroutine */ int dorgqr_(integer *, integer *, integer *, + extern /* Subroutine */ void dorgqr_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); doublereal anrmto, bnrmto; - extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); doublereal smlnum; @@ -947,15 +948,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGGEV3 ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1238,7 +1239,7 @@ f"> */ } work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DGGEV3 */ diff --git a/lapack-netlib/SRC/dggevx.c b/lapack-netlib/SRC/dggevx.c index b3ef245e44..213beaf844 100644 --- a/lapack-netlib/SRC/dggevx.c +++ b/lapack-netlib/SRC/dggevx.c @@ -905,7 +905,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dggevx_(char *balanc, char *jobvl, char *jobvr, char * +/* Subroutine */ void dggevx_(char *balanc, char *jobvl, char *jobvr, char * sense, 
integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *alphar, doublereal *alphai, doublereal * beta, doublereal *vl, integer *ldvl, doublereal *vr, integer *ldvr, @@ -930,9 +930,9 @@ f"> */ integer icols; logical noscl; integer irows; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); integer jc; - extern /* Subroutine */ int dggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dggbak_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *), dggbal_(char *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, @@ -943,46 +943,46 @@ f"> */ extern doublereal dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); integer jr; - extern /* Subroutine */ int dgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgghrd_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *), dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); logical ilascl, ilbscl; - extern /* Subroutine */ int dgeqrf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeqrf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *); logical ldumma[1]; char chtemp[1]; doublereal bignum; - extern /* Subroutine */ int dhgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void dhgeqz_(char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer 
*), dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *); integer ijobvl; - extern /* Subroutine */ int dtgevc_(char *, char *, logical *, integer *, + extern /* Subroutine */ void dtgevc_(char *, char *, logical *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, doublereal *, integer *), dtgsna_(char *, char *, logical *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, - integer *, integer *, integer *), xerbla_(char *, - integer *, ftnlen); + integer *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ijobvr; logical wantsb; - extern /* Subroutine */ int dorgqr_(integer *, integer *, integer *, + extern /* Subroutine */ void dorgqr_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); doublereal anrmto; logical wantse; doublereal bnrmto; - extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer minwrk, maxwrk; @@ -1135,15 +1135,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGGEVX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } @@ -1512,7 +1512,7 @@ f"> */ } work[1] = (doublereal) maxwrk; - return 0; + return; /* End of DGGEVX */ diff --git a/lapack-netlib/SRC/dggglm.c b/lapack-netlib/SRC/dggglm.c index e2de180371..1299b34412 100644 --- a/lapack-netlib/SRC/dggglm.c +++ 
b/lapack-netlib/SRC/dggglm.c @@ -700,7 +700,7 @@ f"> */ /* > \ingroup doubleOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int dggglm_(integer *n, integer *m, integer *p, doublereal * +/* Subroutine */ void dggglm_(integer *n, integer *m, integer *p, doublereal * a, integer *lda, doublereal *b, integer *ldb, doublereal *d__, doublereal *x, doublereal *y, doublereal *work, integer *lwork, integer *info) @@ -710,19 +710,19 @@ f"> */ /* Local variables */ integer lopt, i__; - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer nb, np; - extern /* Subroutine */ int dggqrf_(integer *, integer *, integer *, + extern /* Subroutine */ void dggqrf_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, - doublereal *, doublereal *, integer *, integer *), xerbla_(char *, - integer *, ftnlen); + doublereal *, doublereal *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer lwkmin, nb1, nb2, nb3, nb4; - extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *), dormrq_(char *, char *, integer *, integer *, integer *, @@ -805,9 +805,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGGGLM", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -821,7 +821,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { y[i__] = 
0.; } - return 0; + return; } /* Compute the GQR factorization of matrices A and B: */ @@ -860,7 +860,7 @@ f"> */ if (*info > 0) { *info = 1; - return 0; + return; } i__1 = *n - *m; @@ -889,7 +889,7 @@ f"> */ if (*info > 0) { *info = 2; - return 0; + return; } /* Copy D to X */ @@ -909,7 +909,7 @@ f"> */ i__1 = lopt, i__2 = (integer) work[*m + np + 1]; work[1] = (doublereal) (*m + np + f2cmax(i__1,i__2)); - return 0; + return; /* End of DGGGLM */ diff --git a/lapack-netlib/SRC/dggglm.f b/lapack-netlib/SRC/dggglm.f index d43785d32d..ae0f0e908c 100644 --- a/lapack-netlib/SRC/dggglm.f +++ b/lapack-netlib/SRC/dggglm.f @@ -288,7 +288,7 @@ SUBROUTINE DGGGLM( N, M, P, A, LDA, B, LDB, D, X, Y, WORK, LWORK, * CALL DGGQRF( N, M, P, A, LDA, WORK, B, LDB, WORK( M+1 ), $ WORK( M+NP+1 ), LWORK-M-NP, INFO ) - LOPT = WORK( M+NP+1 ) + LOPT = INT( WORK( M+NP+1 ) ) * * Update left-hand-side vector d = Q**T*d = ( d1 ) M * ( d2 ) N-M diff --git a/lapack-netlib/SRC/dgghd3.c b/lapack-netlib/SRC/dgghd3.c index 3a14634589..c4507b8cbd 100644 --- a/lapack-netlib/SRC/dgghd3.c +++ b/lapack-netlib/SRC/dgghd3.c @@ -748,7 +748,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgghd3_(char *compq, char *compz, integer *n, integer * +/* Subroutine */ void dgghd3_(char *compq, char *compz, integer *n, integer * ilo, integer *ihi, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *q, integer *ldq, doublereal *z__, integer * ldz, doublereal *work, integer *lwork, integer *info) @@ -762,21 +762,21 @@ f"> */ logical blk22; integer cola, jcol, ierr; doublereal temp; - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer jrow, topq, ppwo; doublereal temp1, temp2, temp3, c__; integer kacc22, i__, j, k; doublereal s; - extern /* Subroutine */ int 
dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer nbmin; - extern /* Subroutine */ int dorm22_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dorm22_(char *, char *, integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer nblst; @@ -784,25 +784,25 @@ f"> */ doublereal c1, c2; logical wantq; integer j0; - extern /* Subroutine */ int dtrmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtrmv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *); logical initz, wantz; doublereal s1, s2; char compq2[1], compz2[1]; integer nb, jj, nh; - extern /* Subroutine */ int dgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgghrd_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer nx, pw; - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *), dlartg_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + 
extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer lwkopt; logical lquery; integer nnb, len, top, ppw, n2nb; @@ -873,9 +873,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGGHD3", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Initialize Q and Z if desired. */ @@ -900,7 +900,7 @@ f"> */ nh = *ihi - *ilo + 1; if (nh <= 1) { work[1] = 1.; - return 0; + return; } /* Determine the blocksize. */ @@ -1573,7 +1573,7 @@ f"> */ } work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DGGHD3 */ diff --git a/lapack-netlib/SRC/dgghrd.c b/lapack-netlib/SRC/dgghrd.c index 91e3ebfd01..03879f33e6 100644 --- a/lapack-netlib/SRC/dgghrd.c +++ b/lapack-netlib/SRC/dgghrd.c @@ -721,7 +721,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dgghrd_(char *compq, char *compz, integer *n, integer * +/* Subroutine */ void dgghrd_(char *compq, char *compz, integer *n, integer * ilo, integer *ihi, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *q, integer *ldq, doublereal *z__, integer * ldz, integer *info) @@ -733,15 +733,16 @@ f"> */ /* Local variables */ integer jcol; doublereal temp; - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer jrow; doublereal c__, s; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *), dlartg_(doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *), xerbla_(char *, integer *, ftnlen); + doublereal *); + extern int xerbla_(char *, 
integer *, ftnlen); integer icompq, icompz; logical ilq, ilz; @@ -825,7 +826,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGGHRD", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize Q and Z if desired. */ @@ -840,7 +841,7 @@ f"> */ /* Quick return if possible */ if (*n <= 1) { - return 0; + return; } /* Zero out lower triangle of B */ @@ -900,7 +901,7 @@ f"> */ /* L40: */ } - return 0; + return; /* End of DGGHRD */ diff --git a/lapack-netlib/SRC/dgglse.c b/lapack-netlib/SRC/dgglse.c index 15650c6902..e9b5550735 100644 --- a/lapack-netlib/SRC/dgglse.c +++ b/lapack-netlib/SRC/dgglse.c @@ -695,7 +695,7 @@ f"> */ /* > \ingroup doubleOTHERsolve */ /* ===================================================================== */ -/* Subroutine */ int dgglse_(integer *m, integer *n, integer *p, doublereal * +/* Subroutine */ void dgglse_(integer *m, integer *n, integer *p, doublereal * a, integer *lda, doublereal *b, integer *ldb, doublereal *c__, doublereal *d__, doublereal *x, doublereal *work, integer *lwork, integer *info) @@ -705,7 +705,7 @@ f"> */ /* Local variables */ integer lopt; - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), daxpy_(integer @@ -713,14 +713,14 @@ f"> */ , dtrmv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *); integer nb, mn, nr; - extern /* Subroutine */ int dggrqf_(integer *, integer *, integer *, + extern /* Subroutine */ void dggrqf_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, - doublereal *, doublereal *, integer *, integer *), xerbla_(char *, - integer *, ftnlen); + doublereal *, doublereal *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer 
ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer lwkmin, nb1, nb2, nb3, nb4; - extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *), dormrq_(char *, char *, integer *, integer *, integer *, @@ -803,15 +803,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGGLSE", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Compute the GRQ factorization of matrices B and A: */ @@ -847,7 +847,7 @@ f"> */ if (*info > 0) { *info = 1; - return 0; + return; } /* Put the solution in X */ @@ -871,7 +871,7 @@ f"> */ if (*info > 0) { *info = 2; - return 0; + return; } /* Put the solutions in X */ @@ -908,7 +908,7 @@ f"> */ i__1 = lopt, i__2 = (integer) work[*p + mn + 1]; work[1] = (doublereal) (*p + mn + f2cmax(i__1,i__2)); - return 0; + return; /* End of DGGLSE */ diff --git a/lapack-netlib/SRC/dgglse.f b/lapack-netlib/SRC/dgglse.f index 2fd17bbcb5..28aeaf6e76 100644 --- a/lapack-netlib/SRC/dgglse.f +++ b/lapack-netlib/SRC/dgglse.f @@ -276,7 +276,7 @@ SUBROUTINE DGGLSE( M, N, P, A, LDA, B, LDB, C, D, X, WORK, LWORK, * CALL DGGRQF( P, M, N, B, LDB, WORK, A, LDA, WORK( P+1 ), $ WORK( P+MN+1 ), LWORK-P-MN, INFO ) - LOPT = WORK( P+MN+1 ) + LOPT = INT( WORK( P+MN+1 ) ) * * Update c = Z**T *c = ( c1 ) N-P * ( c2 ) M+P-N diff --git a/lapack-netlib/SRC/dggqrf.c b/lapack-netlib/SRC/dggqrf.c index 19222570c8..535db1cce3 100644 --- a/lapack-netlib/SRC/dggqrf.c +++ b/lapack-netlib/SRC/dggqrf.c @@ -728,7 +728,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dggqrf_(integer *n, integer *m, integer *p, doublereal * +/* Subroutine */ void 
dggqrf_(integer *n, integer *m, integer *p, doublereal * a, integer *lda, doublereal *taua, doublereal *b, integer *ldb, doublereal *taub, doublereal *work, integer *lwork, integer *info) { @@ -737,14 +737,15 @@ f"> */ /* Local variables */ integer lopt, nb; - extern /* Subroutine */ int dgeqrf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeqrf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dgerqf_(integer *, integer *, doublereal *, integer *, doublereal - *, doublereal *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + *, doublereal *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer nb1, nb2, nb3; - extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer lwkopt; @@ -808,9 +809,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGGQRF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* QR factorization of N-by-M matrix A: A = Q*R */ @@ -834,7 +835,7 @@ f"> */ i__1 = lopt, i__2 = (integer) work[1]; work[1] = (doublereal) f2cmax(i__1,i__2); - return 0; + return; /* End of DGGQRF */ diff --git a/lapack-netlib/SRC/dggqrf.f b/lapack-netlib/SRC/dggqrf.f index 617af274ff..39d27a5c93 100644 --- a/lapack-netlib/SRC/dggqrf.f +++ b/lapack-netlib/SRC/dggqrf.f @@ -276,7 +276,7 @@ SUBROUTINE DGGQRF( N, M, P, A, LDA, TAUA, B, LDB, TAUB, WORK, * QR factorization of N-by-M matrix A: A = Q*R * CALL DGEQRF( N, M, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = WORK( 1 ) + LOPT = INT( WORK( 1 ) ) * * Update B := Q**T*B. 
* diff --git a/lapack-netlib/SRC/dggrqf.c b/lapack-netlib/SRC/dggrqf.c index 3e3961c598..9b764d7b86 100644 --- a/lapack-netlib/SRC/dggrqf.c +++ b/lapack-netlib/SRC/dggrqf.c @@ -727,7 +727,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dggrqf_(integer *m, integer *p, integer *n, doublereal * +/* Subroutine */ void dggrqf_(integer *m, integer *p, integer *n, doublereal * a, integer *lda, doublereal *taua, doublereal *b, integer *ldb, doublereal *taub, doublereal *work, integer *lwork, integer *info) { @@ -736,14 +736,15 @@ f"> */ /* Local variables */ integer lopt, nb; - extern /* Subroutine */ int dgeqrf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeqrf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dgerqf_(integer *, integer *, doublereal *, integer *, doublereal - *, doublereal *, integer *, integer *), xerbla_(char *, integer *); + *, doublereal *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer nb1, nb2, nb3; - extern /* Subroutine */ int dormrq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormrq_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer lwkopt; @@ -806,10 +807,10 @@ f"> */ } if (*info != 0) { i__1 = -(*info); - xerbla_("DGGRQF", &i__1); - return 0; + xerbla_("DGGRQF", &i__1, 6); + return; } else if (lquery) { - return 0; + return; } /* RQ factorization of M-by-N matrix A: A = R*Q */ @@ -835,7 +836,7 @@ f"> */ i__1 = lopt, i__2 = (integer) work[1]; work[1] = (doublereal) f2cmax(i__1,i__2); - return 0; + return; /* End of DGGRQF */ diff --git a/lapack-netlib/SRC/dggrqf.f b/lapack-netlib/SRC/dggrqf.f index 
07f8752d80..ddf4104c59 100644 --- a/lapack-netlib/SRC/dggrqf.f +++ b/lapack-netlib/SRC/dggrqf.f @@ -275,7 +275,7 @@ SUBROUTINE DGGRQF( M, P, N, A, LDA, TAUA, B, LDB, TAUB, WORK, * RQ factorization of M-by-N matrix A: A = R*Q * CALL DGERQF( M, N, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = WORK( 1 ) + LOPT = INT( WORK( 1 ) ) * * Update B := B*Q**T * diff --git a/lapack-netlib/SRC/dggsvd3.c b/lapack-netlib/SRC/dggsvd3.c index 4e0572dcaa..347d0e08ab 100644 --- a/lapack-netlib/SRC/dggsvd3.c +++ b/lapack-netlib/SRC/dggsvd3.c @@ -861,7 +861,7 @@ static integer c__1 = 1; /* > DGGSVD3 replaces the deprecated subroutine DGGSVD. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dggsvd3_(char *jobu, char *jobv, char *jobq, integer *m, +/* Subroutine */ void dggsvd3_(char *jobu, char *jobv, char *jobq, integer *m, integer *n, integer *p, integer *k, integer *l, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *alpha, doublereal *beta, doublereal *u, integer *ldu, doublereal *v, integer @@ -880,20 +880,21 @@ static integer c__1 = 1; integer ncallmycycle, i__, j; extern logical lsame_(char *, char *); doublereal anorm, bnorm; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); logical wantq, wantu, wantv; extern doublereal dlamch_(char *), dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dtgsja_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtgsja_(char *, char *, char *, integer *, integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int 
xerbla_(char *, integer *, ftnlen); integer lwkopt; logical lquery; - extern /* Subroutine */ int dggsvp3_(char *, char *, char *, integer *, + extern /* Subroutine */ void dggsvp3_(char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, @@ -988,10 +989,10 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("DGGSVD3", &i__1, (ftnlen)7); - return 0; + return; } if (lquery) { - return 0; + return; } /* Compute the Frobenius norm of matrices A and B */ @@ -1054,7 +1055,7 @@ static integer c__1 = 1; } work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DGGSVD3 */ diff --git a/lapack-netlib/SRC/dggsvp3.c b/lapack-netlib/SRC/dggsvp3.c index 6127bb0e38..b07984e115 100644 --- a/lapack-netlib/SRC/dggsvp3.c +++ b/lapack-netlib/SRC/dggsvp3.c @@ -785,7 +785,7 @@ static doublereal c_b24 = 1.; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dggsvp3_(char *jobu, char *jobv, char *jobq, integer *m, +/* Subroutine */ void dggsvp3_(char *jobu, char *jobv, char *jobq, integer *m, integer *p, integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *tola, doublereal *tolb, integer *k, integer *l, doublereal *u, integer *ldu, doublereal *v, integer *ldv, @@ -801,7 +801,7 @@ static doublereal c_b24 = 1.; integer i__, j; extern logical lsame_(char *, char *); logical wantq, wantu, wantv; - extern /* Subroutine */ int dgeqp3_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeqp3_(integer *, integer *, doublereal *, integer *, integer *, doublereal *, doublereal *, integer *, integer *), dgeqr2_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dgerq2_(integer *, @@ -814,8 +814,9 @@ static doublereal c_b24 = 1.; integer *, doublereal *, 
integer *), dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *), dlaset_(char *, integer *, integer *, - doublereal *, doublereal *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen), dlapmt_(logical *, integer *, + doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dlapmt_(logical *, integer *, integer *, doublereal *, integer *, integer *); logical forwrd; integer lwkopt; @@ -919,10 +920,10 @@ static doublereal c_b24 = 1.; if (*info != 0) { i__1 = -(*info); xerbla_("DGGSVP3", &i__1, (ftnlen)7); - return 0; + return; } if (lquery) { - return 0; + return; } /* QR with column pivoting of B: B*P = V*( S11 S12 ) */ @@ -1174,7 +1175,7 @@ static doublereal c_b24 = 1.; } work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DGGSVP3 */ diff --git a/lapack-netlib/SRC/dgsvj0.c b/lapack-netlib/SRC/dgsvj0.c index fe8dc93eb7..4dc5985781 100644 --- a/lapack-netlib/SRC/dgsvj0.c +++ b/lapack-netlib/SRC/dgsvj0.c @@ -732,7 +732,7 @@ f"> */ /* > drmac@math.hr. Thank you. 
*/ /* ===================================================================== */ -/* Subroutine */ int dgsvj0_(char *jobv, integer *m, integer *n, doublereal * +/* Subroutine */ void dgsvj0_(char *jobv, integer *m, integer *n, doublereal * a, integer *lda, doublereal *d__, doublereal *sva, integer *mv, doublereal *v, integer *ldv, doublereal *eps, doublereal *sfmin, doublereal *tol, integer *nsweep, doublereal *work, integer *lwork, @@ -757,25 +757,25 @@ f"> */ doublereal t, apoaq, aqoap; extern logical lsame_(char *, char *); doublereal theta, small; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); doublereal fastr[5]; - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); logical applv, rsvec; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), drotm_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *); logical rotok; doublereal rootsfmin, cs, sn; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); extern integer idamax_(integer *, doublereal *, integer *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer ijblsk, swband, blskip; doublereal mxaapq; - extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); doublereal thsign, mxsinj; integer ir1, emptsw, notrot, iswrot, jbc; @@ -837,7 +837,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGSVJ0", &i__1, 
(ftnlen)6); - return 0; + return; } if (rsvec) { @@ -1720,6 +1720,6 @@ f"> */ /* L5991: */ } - return 0; + return; } /* dgsvj0_ */ diff --git a/lapack-netlib/SRC/dgsvj1.c b/lapack-netlib/SRC/dgsvj1.c index 1cfde9dfcf..b191834ed9 100644 --- a/lapack-netlib/SRC/dgsvj1.c +++ b/lapack-netlib/SRC/dgsvj1.c @@ -751,7 +751,7 @@ f"> */ /* > Zlatko Drmac (Zagreb, Croatia) and Kresimir Veselic (Hagen, Germany) */ /* ===================================================================== */ -/* Subroutine */ int dgsvj1_(char *jobv, integer *m, integer *n, integer *n1, +/* Subroutine */ void dgsvj1_(char *jobv, integer *m, integer *n, integer *n1, doublereal *a, integer *lda, doublereal *d__, doublereal *sva, integer *mv, doublereal *v, integer *ldv, doublereal *eps, doublereal *sfmin, doublereal *tol, integer *nsweep, doublereal *work, integer * @@ -777,25 +777,25 @@ f"> */ doublereal t, large, apoaq, aqoap; extern logical lsame_(char *, char *); doublereal theta, small; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); doublereal fastr[5]; - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); logical applv, rsvec; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), drotm_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *); logical rotok; doublereal rootsfmin, cs, sn; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); extern integer idamax_(integer *, doublereal *, integer *); extern /* Subroutine */ int 
xerbla_(char *, integer *, ftnlen); integer ijblsk, swband, blskip; doublereal mxaapq; - extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); doublereal thsign, mxsinj; integer emptsw, notrot, iswrot, jbc; @@ -859,7 +859,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGSVJ1", &i__1, (ftnlen)6); - return 0; + return; } if (rsvec) { @@ -1358,6 +1358,6 @@ f"> */ /* L5991: */ } - return 0; + return; } /* dgsvj1_ */ diff --git a/lapack-netlib/SRC/dgtcon.c b/lapack-netlib/SRC/dgtcon.c index 44f0c0e2c4..1244943e8c 100644 --- a/lapack-netlib/SRC/dgtcon.c +++ b/lapack-netlib/SRC/dgtcon.c @@ -658,7 +658,7 @@ f"> */ /* > \ingroup doubleGTcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgtcon_(char *norm, integer *n, doublereal *dl, +/* Subroutine */ void dgtcon_(char *norm, integer *n, doublereal *dl, doublereal *d__, doublereal *du, doublereal *du2, integer *ipiv, doublereal *anorm, doublereal *rcond, doublereal *work, integer * iwork, integer *info) @@ -670,12 +670,12 @@ f"> */ integer kase, kase1, i__; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, - integer *, doublereal *, integer *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, + integer *, doublereal *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; logical onenrm; - extern /* Subroutine */ int dgttrs_(char *, integer *, integer *, + extern /* Subroutine */ void dgttrs_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); @@ -713,7 +713,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGTCON", &i__1, (ftnlen)6); - return 0; + 
return; } /* Quick return if possible */ @@ -721,9 +721,9 @@ f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm == 0.) { - return 0; + return; } /* Check that D(1:N) is non-zero. */ @@ -731,7 +731,7 @@ f"> */ i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { if (d__[i__] == 0.) { - return 0; + return; } /* L10: */ } @@ -768,7 +768,7 @@ f"> */ *rcond = 1. / ainvnm / *anorm; } - return 0; + return; /* End of DGTCON */ diff --git a/lapack-netlib/SRC/dgtrfs.c b/lapack-netlib/SRC/dgtrfs.c index bf53d365d7..81add9aa0c 100644 --- a/lapack-netlib/SRC/dgtrfs.c +++ b/lapack-netlib/SRC/dgtrfs.c @@ -722,7 +722,7 @@ f"> */ /* > \ingroup doubleGTcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgtrfs_(char *trans, integer *n, integer *nrhs, +/* Subroutine */ void dgtrfs_(char *trans, integer *n, integer *nrhs, doublereal *dl, doublereal *d__, doublereal *du, doublereal *dlf, doublereal *df, doublereal *duf, doublereal *du2, integer *ipiv, doublereal *b, integer *ldb, doublereal *x, integer *ldx, doublereal * @@ -740,22 +740,22 @@ f"> */ doublereal s; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer count; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *); integer nz; - extern /* Subroutine */ int dlagtm_(char *, integer *, integer *, + extern /* Subroutine */ void dlagtm_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *); doublereal 
safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical notran; char transn[1]; - extern /* Subroutine */ int dgttrs_(char *, integer *, integer *, + extern /* Subroutine */ void dgttrs_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); char transt[1]; @@ -811,7 +811,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGTRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -823,7 +823,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } if (notran) { @@ -1035,7 +1035,7 @@ f"> */ /* L110: */ } - return 0; + return; /* End of DGTRFS */ diff --git a/lapack-netlib/SRC/dgtsv.c b/lapack-netlib/SRC/dgtsv.c index b5d19ad6d7..c608f1c077 100644 --- a/lapack-netlib/SRC/dgtsv.c +++ b/lapack-netlib/SRC/dgtsv.c @@ -636,7 +636,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doubleGTsolve */ /* ===================================================================== */ -/* Subroutine */ int dgtsv_(integer *n, integer *nrhs, doublereal *dl, +/* Subroutine */ void dgtsv_(integer *n, integer *nrhs, doublereal *dl, doublereal *d__, doublereal *du, doublereal *b, integer *ldb, integer *info) { @@ -679,11 +679,11 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("DGTSV ", &i__1, (ftnlen)6); - return 0; + return; } if (*n == 0) { - return 0; + return; } if (*nrhs == 1) { @@ -699,7 +699,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ b[i__ + 1 + b_dim1] -= fact * b[i__ + b_dim1]; } else { *info = i__; - return 0; + return; } dl[i__] = 0.; } else { @@ -728,7 +728,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ b[i__ + 1 + b_dim1] -= fact * b[i__ + b_dim1]; } else { *info = i__; - return 0; + return; } } else { fact = d__[i__] / dl[i__]; @@ 
-743,7 +743,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } if (d__[*n] == 0.) { *info = *n; - return 0; + return; } } else { i__1 = *n - 2; @@ -762,7 +762,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } else { *info = i__; - return 0; + return; } dl[i__] = 0.; } else { @@ -800,7 +800,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } else { *info = i__; - return 0; + return; } } else { fact = d__[i__] / dl[i__]; @@ -820,7 +820,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } if (d__[*n] == 0.) { *info = *n; - return 0; + return; } } @@ -862,7 +862,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } - return 0; + return; /* End of DGTSV */ diff --git a/lapack-netlib/SRC/dgtsvx.c b/lapack-netlib/SRC/dgtsvx.c index 430bb1dc91..d6aaf1e17a 100644 --- a/lapack-netlib/SRC/dgtsvx.c +++ b/lapack-netlib/SRC/dgtsvx.c @@ -804,7 +804,7 @@ f"> */ /* > \ingroup doubleGTsolve */ /* ===================================================================== */ -/* Subroutine */ int dgtsvx_(char *fact, char *trans, integer *n, integer * +/* Subroutine */ void dgtsvx_(char *fact, char *trans, integer *n, integer * nrhs, doublereal *dl, doublereal *d__, doublereal *du, doublereal * dlf, doublereal *df, doublereal *duf, doublereal *du2, integer *ipiv, doublereal *b, integer *ldb, doublereal *x, integer *ldx, doublereal * @@ -818,14 +818,15 @@ f"> */ char norm[1]; extern logical lsame_(char *, char *); doublereal anorm; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); extern doublereal dlamch_(char *), dlangt_(char *, integer *, doublereal *, doublereal *, doublereal *); logical nofact; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - doublereal *, 
integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen), dgtcon_(char *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dgtcon_(char *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *), dgtrfs_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, @@ -834,7 +835,7 @@ f"> */ integer *, integer *), dgttrf_(integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *); logical notran; - extern /* Subroutine */ int dgttrs_(char *, integer *, integer *, + extern /* Subroutine */ void dgttrs_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); @@ -889,7 +890,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGTSVX", &i__1, (ftnlen)6); - return 0; + return; } if (nofact) { @@ -909,7 +910,7 @@ f"> */ if (*info > 0) { *rcond = 0.; - return 0; + return; } } @@ -946,7 +947,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of DGTSVX */ diff --git a/lapack-netlib/SRC/dgttrf.c b/lapack-netlib/SRC/dgttrf.c index f496a1badc..d4a9ebf134 100644 --- a/lapack-netlib/SRC/dgttrf.c +++ b/lapack-netlib/SRC/dgttrf.c @@ -633,7 +633,7 @@ f"> */ /* > \ingroup doubleGTcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgttrf_(integer *n, doublereal *dl, doublereal *d__, +/* Subroutine */ void dgttrf_(integer *n, doublereal *dl, doublereal *d__, doublereal *du, doublereal *du2, integer *ipiv, integer *info) { /* System generated locals */ @@ -668,13 +668,13 @@ f"> */ *info = -1; i__1 = -(*info); xerbla_("DGTTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 
0; + return; } /* Initialize IPIV(i) = i and DU2(I) = 0 */ @@ -748,7 +748,7 @@ f"> */ } L50: - return 0; + return; /* End of DGTTRF */ diff --git a/lapack-netlib/SRC/dgttrs.c b/lapack-netlib/SRC/dgttrs.c index 3150c6c661..818bc0e09e 100644 --- a/lapack-netlib/SRC/dgttrs.c +++ b/lapack-netlib/SRC/dgttrs.c @@ -651,7 +651,7 @@ f"> */ /* > \ingroup doubleGTcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgttrs_(char *trans, integer *n, integer *nrhs, +/* Subroutine */ void dgttrs_(char *trans, integer *n, integer *nrhs, doublereal *dl, doublereal *d__, doublereal *du, doublereal *du2, integer *ipiv, doublereal *b, integer *ldb, integer *info) { @@ -660,7 +660,7 @@ f"> */ /* Local variables */ integer j; - extern /* Subroutine */ int dgtts2_(integer *, integer *, integer *, + extern /* Subroutine */ void dgtts2_(integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer jb, nb; @@ -707,13 +707,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DGTTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* Decode TRANS */ @@ -753,6 +753,6 @@ f"> */ /* End of DGTTRS */ - return 0; + return; } /* dgttrs_ */ diff --git a/lapack-netlib/SRC/dgtts2.c b/lapack-netlib/SRC/dgtts2.c index d4cbc1a8e2..7ac39f7065 100644 --- a/lapack-netlib/SRC/dgtts2.c +++ b/lapack-netlib/SRC/dgtts2.c @@ -638,7 +638,7 @@ f"> */ /* > \ingroup doubleGTcomputational */ /* ===================================================================== */ -/* Subroutine */ int dgtts2_(integer *itrans, integer *n, integer *nrhs, +/* Subroutine */ void dgtts2_(integer *itrans, integer *n, integer *nrhs, doublereal *dl, doublereal *d__, doublereal *du, doublereal *du2, integer *ipiv, doublereal *b, integer *ldb) { @@ -673,7 +673,7 @@ f"> */ /* Function Body */ if (*n == 0 || *nrhs == 0) { - return 
0; + return; } if (*itrans == 0) { @@ -825,6 +825,6 @@ f"> */ /* End of DGTTS2 */ - return 0; + return; } /* dgtts2_ */ diff --git a/lapack-netlib/SRC/dhgeqz.c b/lapack-netlib/SRC/dhgeqz.c index 5e3d04e45e..e24783316e 100644 --- a/lapack-netlib/SRC/dhgeqz.c +++ b/lapack-netlib/SRC/dhgeqz.c @@ -818,7 +818,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dhgeqz_(char *job, char *compq, char *compz, integer *n, +/* Subroutine */ void dhgeqz_(char *job, char *compq, char *compz, integer *n, integer *ilo, integer *ihi, doublereal *h__, integer *ldh, doublereal *t, integer *ldt, doublereal *alphar, doublereal *alphai, doublereal * beta, doublereal *q, integer *ldq, doublereal *z__, integer *ldz, @@ -831,7 +831,7 @@ f"> */ /* Local variables */ doublereal ad11l, ad12l, ad21l, ad22l, ad32l, wabs, atol, btol, temp; - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *), dlag2_( doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, @@ -846,7 +846,7 @@ f"> */ doublereal tempi, tempr, s1, s2, t1, u1, u2; extern doublereal dlapy2_(doublereal *, doublereal *), dlapy3_(doublereal *, doublereal *, doublereal *); - extern /* Subroutine */ int dlasv2_(doublereal *, doublereal *, + extern /* Subroutine */ void dlasv2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); logical ilazr2; @@ -859,15 +859,15 @@ f"> */ doublereal cz, sl, w12, w21, w22, wi; extern doublereal dlamch_(char *); doublereal sr; - extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlarfg_(integer *, doublereal *, doublereal *, integer *, doublereal *); doublereal vs, wr; extern doublereal 
dlanhs_(char *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *); doublereal safmin; - extern /* Subroutine */ int dlartg_(doublereal *, doublereal *, + extern /* Subroutine */ void dlartg_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal safmax; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -994,16 +994,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DHGEQZ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { work[1] = 1.; - return 0; + return; } /* Initialize Q and Z */ @@ -2101,7 +2101,7 @@ f"> */ L420: work[1] = (doublereal) (*n); - return 0; + return; /* End of DHGEQZ */ diff --git a/lapack-netlib/SRC/dhgeqz.f b/lapack-netlib/SRC/dhgeqz.f index 3fe2a083c8..d6233596c8 100644 --- a/lapack-netlib/SRC/dhgeqz.f +++ b/lapack-netlib/SRC/dhgeqz.f @@ -337,9 +337,9 @@ SUBROUTINE DHGEQZ( JOB, COMPQ, COMPZ, N, ILO, IHI, H, LDH, T, LDT, $ BTOL, C, C11I, C11R, C12, C21, C22I, C22R, CL, $ CQ, CR, CZ, ESHIFT, S, S1, S1INV, S2, SAFMAX, $ SAFMIN, SCALE, SL, SQI, SQR, SR, SZI, SZR, T1, - $ TAU, TEMP, TEMP2, TEMPI, TEMPR, U1, U12, U12L, - $ U2, ULP, VS, W11, W12, W21, W22, WABS, WI, WR, - $ WR2 + $ T2, T3, TAU, TEMP, TEMP2, TEMPI, TEMPR, U1, + $ U12, U12L, U2, ULP, VS, W11, W12, W21, W22, + $ WABS, WI, WR, WR2 * .. * .. Local Arrays .. 
DOUBLE PRECISION V( 3 ) @@ -536,9 +536,7 @@ SUBROUTINE DHGEQZ( JOB, COMPQ, COMPZ, N, ILO, IHI, H, LDH, T, LDT, END IF END IF * - IF( ABS( T( ILAST, ILAST ) ).LE.MAX( SAFMIN, ULP*( - $ ABS( T( ILAST - 1, ILAST ) ) + ABS( T( ILAST-1, ILAST-1 ) - $ ) ) ) ) THEN + IF( ABS( T( ILAST, ILAST ) ).LE.BTOL ) THEN T( ILAST, ILAST ) = ZERO GO TO 70 END IF @@ -564,10 +562,7 @@ SUBROUTINE DHGEQZ( JOB, COMPQ, COMPZ, N, ILO, IHI, H, LDH, T, LDT, * * Test 2: for T(j,j)=0 * - TEMP = ABS ( T( J, J + 1 ) ) - IF ( J .GT. ILO ) - $ TEMP = TEMP + ABS ( T( J - 1, J ) ) - IF( ABS( T( J, J ) ).LT.MAX( SAFMIN,ULP*TEMP ) ) THEN + IF( ABS( T( J, J ) ).LT.BTOL ) THEN T( J, J ) = ZERO * * Test 1a: Check for 2 consecutive small subdiagonals in A @@ -1132,25 +1127,27 @@ SUBROUTINE DHGEQZ( JOB, COMPQ, COMPZ, N, ILO, IHI, H, LDH, T, LDT, H( J+2, J-1 ) = ZERO END IF * + T2 = TAU*V( 2 ) + T3 = TAU*V( 3 ) DO 230 JC = J, ILASTM - TEMP = TAU*( H( J, JC )+V( 2 )*H( J+1, JC )+V( 3 )* - $ H( J+2, JC ) ) - H( J, JC ) = H( J, JC ) - TEMP - H( J+1, JC ) = H( J+1, JC ) - TEMP*V( 2 ) - H( J+2, JC ) = H( J+2, JC ) - TEMP*V( 3 ) - TEMP2 = TAU*( T( J, JC )+V( 2 )*T( J+1, JC )+V( 3 )* - $ T( J+2, JC ) ) - T( J, JC ) = T( J, JC ) - TEMP2 - T( J+1, JC ) = T( J+1, JC ) - TEMP2*V( 2 ) - T( J+2, JC ) = T( J+2, JC ) - TEMP2*V( 3 ) + TEMP = H( J, JC )+V( 2 )*H( J+1, JC )+V( 3 )* + $ H( J+2, JC ) + H( J, JC ) = H( J, JC ) - TEMP*TAU + H( J+1, JC ) = H( J+1, JC ) - TEMP*T2 + H( J+2, JC ) = H( J+2, JC ) - TEMP*T3 + TEMP2 = T( J, JC )+V( 2 )*T( J+1, JC )+V( 3 )* + $ T( J+2, JC ) + T( J, JC ) = T( J, JC ) - TEMP2*TAU + T( J+1, JC ) = T( J+1, JC ) - TEMP2*T2 + T( J+2, JC ) = T( J+2, JC ) - TEMP2*T3 230 CONTINUE IF( ILQ ) THEN DO 240 JR = 1, N - TEMP = TAU*( Q( JR, J )+V( 2 )*Q( JR, J+1 )+V( 3 )* - $ Q( JR, J+2 ) ) - Q( JR, J ) = Q( JR, J ) - TEMP - Q( JR, J+1 ) = Q( JR, J+1 ) - TEMP*V( 2 ) - Q( JR, J+2 ) = Q( JR, J+2 ) - TEMP*V( 3 ) + TEMP = Q( JR, J )+V( 2 )*Q( JR, J+1 )+V( 3 )* + $ Q( JR, J+2 ) + Q( JR, J ) = Q( JR, J ) - 
TEMP*TAU + Q( JR, J+1 ) = Q( JR, J+1 ) - TEMP*T2 + Q( JR, J+2 ) = Q( JR, J+2 ) - TEMP*T3 240 CONTINUE END IF * @@ -1238,27 +1235,29 @@ SUBROUTINE DHGEQZ( JOB, COMPQ, COMPZ, N, ILO, IHI, H, LDH, T, LDT, * * Apply transformations from the right. * + T2 = TAU*V(2) + T3 = TAU*V(3) DO 260 JR = IFRSTM, MIN( J+3, ILAST ) - TEMP = TAU*( H( JR, J )+V( 2 )*H( JR, J+1 )+V( 3 )* - $ H( JR, J+2 ) ) - H( JR, J ) = H( JR, J ) - TEMP - H( JR, J+1 ) = H( JR, J+1 ) - TEMP*V( 2 ) - H( JR, J+2 ) = H( JR, J+2 ) - TEMP*V( 3 ) + TEMP = H( JR, J )+V( 2 )*H( JR, J+1 )+V( 3 )* + $ H( JR, J+2 ) + H( JR, J ) = H( JR, J ) - TEMP*TAU + H( JR, J+1 ) = H( JR, J+1 ) - TEMP*T2 + H( JR, J+2 ) = H( JR, J+2 ) - TEMP*T3 260 CONTINUE DO 270 JR = IFRSTM, J + 2 - TEMP = TAU*( T( JR, J )+V( 2 )*T( JR, J+1 )+V( 3 )* - $ T( JR, J+2 ) ) - T( JR, J ) = T( JR, J ) - TEMP - T( JR, J+1 ) = T( JR, J+1 ) - TEMP*V( 2 ) - T( JR, J+2 ) = T( JR, J+2 ) - TEMP*V( 3 ) + TEMP = T( JR, J )+V( 2 )*T( JR, J+1 )+V( 3 )* + $ T( JR, J+2 ) + T( JR, J ) = T( JR, J ) - TEMP*TAU + T( JR, J+1 ) = T( JR, J+1 ) - TEMP*T2 + T( JR, J+2 ) = T( JR, J+2 ) - TEMP*T3 270 CONTINUE IF( ILZ ) THEN DO 280 JR = 1, N - TEMP = TAU*( Z( JR, J )+V( 2 )*Z( JR, J+1 )+V( 3 )* - $ Z( JR, J+2 ) ) - Z( JR, J ) = Z( JR, J ) - TEMP - Z( JR, J+1 ) = Z( JR, J+1 ) - TEMP*V( 2 ) - Z( JR, J+2 ) = Z( JR, J+2 ) - TEMP*V( 3 ) + TEMP = Z( JR, J )+V( 2 )*Z( JR, J+1 )+V( 3 )* + $ Z( JR, J+2 ) + Z( JR, J ) = Z( JR, J ) - TEMP*TAU + Z( JR, J+1 ) = Z( JR, J+1 ) - TEMP*T2 + Z( JR, J+2 ) = Z( JR, J+2 ) - TEMP*T3 280 CONTINUE END IF T( J+1, J ) = ZERO diff --git a/lapack-netlib/SRC/dhsein.c b/lapack-netlib/SRC/dhsein.c index 18d0f02901..299fe7d547 100644 --- a/lapack-netlib/SRC/dhsein.c +++ b/lapack-netlib/SRC/dhsein.c @@ -775,7 +775,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dhsein_(char *side, char *eigsrc, char *initv, logical * +/* Subroutine */ void dhsein_(char *side, char 
*eigsrc, char *initv, logical * select, integer *n, doublereal *h__, integer *ldh, doublereal *wr, doublereal *wi, doublereal *vl, integer *ldvl, doublereal *vr, integer *ldvr, integer *mm, integer *m, doublereal *work, integer * @@ -796,7 +796,7 @@ f"> */ doublereal hnorm; integer kl; extern doublereal dlamch_(char *); - extern /* Subroutine */ int dlaein_(logical *, logical *, integer *, + extern /* Subroutine */ void dlaein_(logical *, logical *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal * , doublereal *, doublereal *, integer *); @@ -900,13 +900,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DHSEIN", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ if (*n == 0) { - return 0; + return; } /* Set machine-dependent constants. */ @@ -979,7 +979,7 @@ f"> */ work[1]); if (disnan_(&hnorm)) { *info = -6; - return 0; + return; } else if (hnorm > 0.) { eps3 = hnorm * ulp; } else { @@ -1088,7 +1088,7 @@ f"> */ /* L120: */ } - return 0; + return; /* End of DHSEIN */ diff --git a/lapack-netlib/SRC/dhseqr.c b/lapack-netlib/SRC/dhseqr.c index b6d3237096..2c3f9d9ce1 100644 --- a/lapack-netlib/SRC/dhseqr.c +++ b/lapack-netlib/SRC/dhseqr.c @@ -832,7 +832,7 @@ f"> */ /* > of Matrix Analysis, volume 23, pages 948--973, 2002. 
*/ /* ===================================================================== */ -/* Subroutine */ int dhseqr_(char *job, char *compz, integer *n, integer *ilo, +/* Subroutine */ void dhseqr_(char *job, char *compz, integer *n, integer *ilo, integer *ihi, doublereal *h__, integer *ldh, doublereal *wr, doublereal *wi, doublereal *z__, integer *ldz, doublereal *work, integer *lwork, integer *info) @@ -849,12 +849,12 @@ f"> */ logical initz; doublereal workl[49]; logical wantt, wantz; - extern /* Subroutine */ int dlaqr0_(logical *, logical *, integer *, + extern /* Subroutine */ void dlaqr0_(logical *, logical *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); doublereal hl[2401] /* was [49][49] */; - extern /* Subroutine */ int dlahqr_(logical *, logical *, integer *, + extern /* Subroutine */ void dlahqr_(logical *, logical *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal *, @@ -932,13 +932,13 @@ f"> */ i__1 = -(*info); xerbla_("DHSEQR", &i__1, (ftnlen)6); - return 0; + return; } else if (*n == 0) { /* ==== Quick return in case N = 0; nothing to do. 
==== */ - return 0; + return; } else if (lquery) { @@ -951,7 +951,7 @@ f"> */ /* Computing MAX */ d__1 = (doublereal) f2cmax(1,*n); work[1] = f2cmax(d__1,work[1]); - return 0; + return; } else { @@ -982,7 +982,7 @@ f"> */ if (*ilo == *ihi) { wr[*ilo] = h__[*ilo + *ilo * h_dim1]; wi[*ilo] = 0.; - return 0; + return; } /* ==== DLAHQR/DLAQR0 crossover point ==== */ @@ -1064,6 +1064,6 @@ f"> */ /* ==== End of DHSEQR ==== */ - return 0; + return; } /* dhseqr_ */ diff --git a/lapack-netlib/SRC/dla_gbamv.c b/lapack-netlib/SRC/dla_gbamv.c index 762d9c0c7c..87d6e36bc9 100644 --- a/lapack-netlib/SRC/dla_gbamv.c +++ b/lapack-netlib/SRC/dla_gbamv.c @@ -693,7 +693,7 @@ mv.f"> */ /* > \ingroup doubleGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int dla_gbamv_(integer *trans, integer *m, integer *n, +/* Subroutine */ void dla_gbamv_(integer *trans, integer *m, integer *n, integer *kl, integer *ku, doublereal *alpha, doublereal *ab, integer * ldab, doublereal *x, integer *incx, doublereal *beta, doublereal *y, integer *incy) @@ -755,13 +755,13 @@ mv.f"> */ } if (info != 0) { xerbla_("DLA_GBAMV ", &info, (ftnlen)10); - return 0; + return; } /* Quick return if possible. */ if (*m == 0 || *n == 0 || *alpha == 0. && *beta == 1.) 
{ - return 0; + return; } /* Set LENX and LENY, the lengths of the vectors x and y, and set */ @@ -934,7 +934,7 @@ mv.f"> */ } } - return 0; + return; /* End of DLA_GBAMV */ diff --git a/lapack-netlib/SRC/dla_gbrcond.c b/lapack-netlib/SRC/dla_gbrcond.c index 6541e99c44..777b3ebd6a 100644 --- a/lapack-netlib/SRC/dla_gbrcond.c +++ b/lapack-netlib/SRC/dla_gbrcond.c @@ -694,10 +694,11 @@ doublereal dla_gbrcond_(char *trans, integer *n, integer *kl, integer *ku, integer kase, i__, j; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer kd, ke; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dgbtrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dgbtrs_( char *, integer *, integer *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); doublereal ainvnm, tmp; diff --git a/lapack-netlib/SRC/dla_gbrfsx_extended.c b/lapack-netlib/SRC/dla_gbrfsx_extended.c index 50d6019148..6fba8a5de9 100644 --- a/lapack-netlib/SRC/dla_gbrfsx_extended.c +++ b/lapack-netlib/SRC/dla_gbrfsx_extended.c @@ -922,7 +922,7 @@ fsx_extended.f"> */ /* > \ingroup doubleGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int dla_gbrfsx_extended_(integer *prec_type__, integer * +/* Subroutine */ void dla_gbrfsx_extended_(integer *prec_type__, integer * trans_type__, integer *n, integer *kl, integer *ku, integer *nrhs, doublereal *ab, integer *ldab, doublereal *afb, integer *ldafb, integer *ipiv, logical *colequ, doublereal *c__, doublereal *b, @@ -941,21 +941,21 @@ fsx_extended.f"> */ /* Local variables */ doublereal dx_x__, dz_z__; - extern /* Subroutine */ int dla_lin_berr_(integer *, integer *, integer * + extern /* Subroutine */ void 
dla_lin_berr_(integer *, integer *, integer * , doublereal *, doublereal *, doublereal *); doublereal ymin; - extern /* Subroutine */ int blas_dgbmv_x_(integer *, integer *, integer * + extern /* Subroutine */ void blas_dgbmv_x_(integer *, integer *, integer * , integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); doublereal dxratmax, dzratmax; integer y_prec_state__; - extern /* Subroutine */ int blas_dgbmv2_x_(integer *, integer *, integer + extern /* Subroutine */ void blas_dgbmv2_x_(integer *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); integer i__, j, m; - extern /* Subroutine */ int dla_gbamv_(integer *, integer *, integer *, + extern /* Subroutine */ void dla_gbamv_(integer *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dgbmv_(char *, integer *, integer *, integer *, integer *, @@ -965,17 +965,17 @@ fsx_extended.f"> */ doublereal dxrat; logical incr_prec__; doublereal dzrat; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); char trans[1]; doublereal normx, normy, myhugeval, prev_dz_z__; extern doublereal dlamch_(char *); doublereal yk; - extern /* Subroutine */ int dgbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void dgbtrs_(char *, integer *, integer *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); doublereal final_dx_x__; - extern /* Subroutine */ int dla_wwaddw_(integer *, doublereal *, + extern /* Subroutine */ void dla_wwaddw_(integer *, doublereal *, doublereal *, doublereal *); doublereal final_dz_z__, normdx; extern /* Character 
*/ VOID chla_transtype_(char *, integer *); @@ -1024,7 +1024,7 @@ fsx_extended.f"> */ /* Function Body */ if (*info != 0) { - return 0; + return; } chla_transtype_(ch__1, trans_type__); *(unsigned char *)trans = *(unsigned char *)&ch__1[0]; @@ -1262,6 +1262,6 @@ fsx_extended.f"> */ } - return 0; + return; } /* dla_gbrfsx_extended__ */ diff --git a/lapack-netlib/SRC/dla_geamv.c b/lapack-netlib/SRC/dla_geamv.c index 7e0c15dfda..c522120900 100644 --- a/lapack-netlib/SRC/dla_geamv.c +++ b/lapack-netlib/SRC/dla_geamv.c @@ -682,7 +682,7 @@ mv.f"> */ /* > \ingroup doubleGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int dla_geamv_(integer *trans, integer *m, integer *n, +/* Subroutine */ void dla_geamv_(integer *trans, integer *m, integer *n, doublereal *alpha, doublereal *a, integer *lda, doublereal *x, integer *incx, doublereal *beta, doublereal *y, integer *incy) { @@ -738,13 +738,13 @@ mv.f"> */ } if (info != 0) { xerbla_("DLA_GEAMV ", &info, (ftnlen)10); - return 0; + return; } /* Quick return if possible. */ if (*m == 0 || *n == 0 || *alpha == 0. && *beta == 1.) 
{ - return 0; + return; } /* Set LENX and LENY, the lengths of the vectors x and y, and set */ @@ -895,7 +895,7 @@ mv.f"> */ } } - return 0; + return; /* End of DLA_GEAMV */ diff --git a/lapack-netlib/SRC/dla_gercond.c b/lapack-netlib/SRC/dla_gercond.c index 869a1e848f..3a6fa8eca7 100644 --- a/lapack-netlib/SRC/dla_gercond.c +++ b/lapack-netlib/SRC/dla_gercond.c @@ -675,11 +675,11 @@ doublereal dla_gercond_(char *trans, integer *n, doublereal *a, integer *lda, integer kase, i__, j; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, - integer *, doublereal *, integer *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, + integer *, doublereal *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; - extern /* Subroutine */ int dgetrs_(char *, integer *, integer *, + extern /* Subroutine */ void dgetrs_(char *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); doublereal tmp; diff --git a/lapack-netlib/SRC/dla_gerfsx_extended.c b/lapack-netlib/SRC/dla_gerfsx_extended.c index 700c2b0af2..96959334e4 100644 --- a/lapack-netlib/SRC/dla_gerfsx_extended.c +++ b/lapack-netlib/SRC/dla_gerfsx_extended.c @@ -908,7 +908,7 @@ fsx_extended.f"> */ /* > \ingroup doubleGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int dla_gerfsx_extended_(integer *prec_type__, integer * +/* Subroutine */ void dla_gerfsx_extended_(integer *prec_type__, integer * trans_type__, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal *af, integer *ldaf, integer *ipiv, logical *colequ, doublereal *c__, doublereal *b, integer *ldb, doublereal *y, integer * @@ -927,15 +927,15 @@ fsx_extended.f"> */ /* Local variables */ doublereal dx_x__, dz_z__; - extern /* Subroutine */ int 
dla_lin_berr_(integer *, integer *, integer * + extern /* Subroutine */ void dla_lin_berr_(integer *, integer *, integer * , doublereal *, doublereal *, doublereal *); doublereal ymin; - extern /* Subroutine */ int blas_dgemv_x_(integer *, integer *, integer * + extern /* Subroutine */ void blas_dgemv_x_(integer *, integer *, integer * , doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); doublereal dxratmax, dzratmax; integer y_prec_state__, i__, j; - extern /* Subroutine */ int blas_dgemv2_x_(integer *, integer *, integer + extern /* Subroutine */ void blas_dgemv2_x_(integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dla_geamv_(integer *, integer *, integer *, @@ -947,13 +947,13 @@ fsx_extended.f"> */ doublereal dxrat; logical incr_prec__; doublereal dzrat; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); char trans[1]; doublereal normx, normy, myhugeval, prev_dz_z__; extern doublereal dlamch_(char *); doublereal yk, final_dx_x__; - extern /* Subroutine */ int dgetrs_(char *, integer *, integer *, + extern /* Subroutine */ void dgetrs_(char *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dla_wwaddw_(integer *, doublereal *, doublereal *, doublereal *); @@ -1004,7 +1004,7 @@ fsx_extended.f"> */ /* Function Body */ if (*info != 0) { - return 0; + return; } chla_transtype_(ch__1, trans_type__); *(unsigned char *)trans = *(unsigned char *)&ch__1[0]; @@ -1241,6 +1241,6 @@ fsx_extended.f"> */ } - return 0; + return; } /* dla_gerfsx_extended__ */ diff --git a/lapack-netlib/SRC/dla_lin_berr.c b/lapack-netlib/SRC/dla_lin_berr.c index 5ab4cceee3..63fba703a8 100644 --- a/lapack-netlib/SRC/dla_lin_berr.c +++ 
b/lapack-netlib/SRC/dla_lin_berr.c @@ -610,7 +610,7 @@ _berr.f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dla_lin_berr_(integer *n, integer *nz, integer *nrhs, +/* Subroutine */ void dla_lin_berr_(integer *n, integer *nz, integer *nrhs, doublereal *res, doublereal *ayb, doublereal *berr) { /* System generated locals */ @@ -667,6 +667,6 @@ _berr.f"> */ } } - return 0; + return; } /* dla_lin_berr__ */ diff --git a/lapack-netlib/SRC/dla_porcond.c b/lapack-netlib/SRC/dla_porcond.c index 296cfb0991..75e3f7f7a8 100644 --- a/lapack-netlib/SRC/dla_porcond.c +++ b/lapack-netlib/SRC/dla_porcond.c @@ -665,12 +665,12 @@ doublereal dla_porcond_(char *uplo, integer *n, doublereal *a, integer *lda, integer kase, i__, j; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); logical up; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; - extern /* Subroutine */ int dpotrs_(char *, integer *, integer *, + extern /* Subroutine */ void dpotrs_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); doublereal tmp; diff --git a/lapack-netlib/SRC/dla_porfsx_extended.c b/lapack-netlib/SRC/dla_porfsx_extended.c index 5dc03574d0..482ff9cf65 100644 --- a/lapack-netlib/SRC/dla_porfsx_extended.c +++ b/lapack-netlib/SRC/dla_porfsx_extended.c @@ -899,7 +899,7 @@ fsx_extended.f"> */ /* > \ingroup doublePOcomputational */ /* ===================================================================== */ -/* Subroutine */ int dla_porfsx_extended_(integer *prec_type__, char *uplo, +/* Subroutine */ void dla_porfsx_extended_(integer *prec_type__, char *uplo, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal * af, 
integer *ldaf, logical *colequ, doublereal *c__, doublereal *b, integer *ldb, doublereal *y, integer *ldy, doublereal *berr_out__, @@ -916,16 +916,16 @@ fsx_extended.f"> */ /* Local variables */ doublereal dx_x__, dz_z__; - extern /* Subroutine */ int dla_lin_berr_(integer *, integer *, integer * + extern /* Subroutine */ void dla_lin_berr_(integer *, integer *, integer * , doublereal *, doublereal *, doublereal *); doublereal ymin, dxratmax, dzratmax; integer y_prec_state__; - extern /* Subroutine */ int blas_dsymv_x_(integer *, integer *, + extern /* Subroutine */ void blas_dsymv_x_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); integer uplo2, i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int blas_dsymv2_x_(integer *, integer *, + extern /* Subroutine */ void blas_dsymv2_x_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dcopy_(integer *, doublereal *, integer *, doublereal *, integer * @@ -933,7 +933,7 @@ fsx_extended.f"> */ doublereal dxrat; logical incr_prec__; doublereal dzrat; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), dla_syamv_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dsymv_(char *, @@ -942,10 +942,10 @@ fsx_extended.f"> */ doublereal normx, normy, myhugeval, prev_dz_z__; extern doublereal dlamch_(char *); doublereal yk, final_dx_x__; - extern /* Subroutine */ int dla_wwaddw_(integer *, doublereal *, + extern /* Subroutine */ void dla_wwaddw_(integer *, doublereal *, doublereal *, doublereal *); doublereal final_dz_z__, normdx; - extern /* Subroutine */ int dpotrs_(char *, integer *, integer *, + extern /* 
Subroutine */ void dpotrs_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); doublereal prevnormdx; integer cnt; @@ -992,7 +992,7 @@ fsx_extended.f"> */ /* Function Body */ if (*info != 0) { - return 0; + return; } eps = dlamch_("Epsilon"); myhugeval = dlamch_("Overflow"); @@ -1217,6 +1217,6 @@ fsx_extended.f"> */ } - return 0; + return; } /* dla_porfsx_extended__ */ diff --git a/lapack-netlib/SRC/dla_syamv.c b/lapack-netlib/SRC/dla_syamv.c index 8b812d55c8..c696c3fab9 100644 --- a/lapack-netlib/SRC/dla_syamv.c +++ b/lapack-netlib/SRC/dla_syamv.c @@ -686,7 +686,7 @@ mv.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dla_syamv_(integer *uplo, integer *n, doublereal *alpha, +/* Subroutine */ void dla_syamv_(integer *uplo, integer *n, doublereal *alpha, doublereal *a, integer *lda, doublereal *x, integer *incx, doublereal *beta, doublereal *y, integer *incy) { @@ -739,13 +739,13 @@ mv.f"> */ } if (info != 0) { xerbla_("DLA_SYAMV", &info, (ftnlen)9); - return 0; + return; } /* Quick return if possible. */ if (*n == 0 || *alpha == 0. && *beta == 1.) { - return 0; + return; } /* Set up the start points in X and Y. 
*/ @@ -918,7 +918,7 @@ mv.f"> */ } } - return 0; + return; /* End of DLA_SYAMV */ diff --git a/lapack-netlib/SRC/dla_syrcond.c b/lapack-netlib/SRC/dla_syrcond.c index ff1a935cf2..9466eb0f29 100644 --- a/lapack-netlib/SRC/dla_syrcond.c +++ b/lapack-netlib/SRC/dla_syrcond.c @@ -671,7 +671,7 @@ doublereal dla_syrcond_(char *uplo, integer *n, doublereal *a, integer *lda, integer kase, i__, j; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *); logical up; @@ -679,7 +679,7 @@ doublereal dla_syrcond_(char *uplo, integer *n, doublereal *a, integer *lda, doublereal ainvnm; char normin[1]; doublereal smlnum; - extern /* Subroutine */ int dsytrs_(char *, integer *, integer *, + extern /* Subroutine */ void dsytrs_(char *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); doublereal tmp; diff --git a/lapack-netlib/SRC/dla_syrfsx_extended.c b/lapack-netlib/SRC/dla_syrfsx_extended.c index ef3aab107c..ff90972d1d 100644 --- a/lapack-netlib/SRC/dla_syrfsx_extended.c +++ b/lapack-netlib/SRC/dla_syrfsx_extended.c @@ -907,7 +907,7 @@ fsx_extended.f"> */ /* > \ingroup doubleSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int dla_syrfsx_extended_(integer *prec_type__, char *uplo, +/* Subroutine */ void dla_syrfsx_extended_(integer *prec_type__, char *uplo, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal * af, integer *ldaf, integer *ipiv, logical *colequ, doublereal *c__, doublereal *b, integer *ldb, doublereal *y, integer *ldy, doublereal * @@ -925,16 +925,16 @@ fsx_extended.f"> */ /* Local variables */ doublereal dx_x__, dz_z__; - extern /* Subroutine */ int dla_lin_berr_(integer *, integer *, integer * + extern /* 
Subroutine */ void dla_lin_berr_(integer *, integer *, integer * , doublereal *, doublereal *, doublereal *); doublereal ymin, dxratmax, dzratmax; integer y_prec_state__; - extern /* Subroutine */ int blas_dsymv_x_(integer *, integer *, + extern /* Subroutine */ void blas_dsymv_x_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); integer uplo2, i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int blas_dsymv2_x_(integer *, integer *, + extern /* Subroutine */ void blas_dsymv2_x_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dcopy_(integer *, doublereal *, integer *, doublereal *, integer * @@ -942,10 +942,10 @@ fsx_extended.f"> */ doublereal dxrat; logical incr_prec__; doublereal dzrat; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical upper; - extern /* Subroutine */ int dla_syamv_(integer *, integer *, doublereal * + extern /* Subroutine */ void dla_syamv_(integer *, integer *, doublereal * , doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dsymv_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, @@ -955,10 +955,10 @@ fsx_extended.f"> */ doublereal yk; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal final_dx_x__; - extern /* Subroutine */ int dla_wwaddw_(integer *, doublereal *, + extern /* Subroutine */ void dla_wwaddw_(integer *, doublereal *, doublereal *, doublereal *); doublereal final_dz_z__, normdx; - extern /* Subroutine */ int dsytrs_(char *, integer *, integer *, + extern /* Subroutine */ void dsytrs_(char *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, 
integer *, integer *); doublereal prevnormdx; @@ -1026,7 +1026,7 @@ fsx_extended.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLA_SYRFSX_EXTENDED", &i__1, (ftnlen)19); - return 0; + return; } eps = dlamch_("Epsilon"); myhugeval = dlamch_("Overflow"); @@ -1251,6 +1251,6 @@ fsx_extended.f"> */ } - return 0; + return; } /* dla_syrfsx_extended__ */ diff --git a/lapack-netlib/SRC/dla_wwaddw.c b/lapack-netlib/SRC/dla_wwaddw.c index a472871b31..bdc0ff26b6 100644 --- a/lapack-netlib/SRC/dla_wwaddw.c +++ b/lapack-netlib/SRC/dla_wwaddw.c @@ -590,7 +590,7 @@ ddw.f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dla_wwaddw_(integer *n, doublereal *x, doublereal *y, +/* Subroutine */ void dla_wwaddw_(integer *n, doublereal *x, doublereal *y, doublereal *w) { /* System generated locals */ @@ -624,6 +624,6 @@ ddw.f"> */ x[i__] = s; /* L10: */ } - return 0; + return; } /* dla_wwaddw__ */ diff --git a/lapack-netlib/SRC/dlabad.c b/lapack-netlib/SRC/dlabad.c index 7f8896e9c9..f9ad54cae2 100644 --- a/lapack-netlib/SRC/dlabad.c +++ b/lapack-netlib/SRC/dlabad.c @@ -585,7 +585,7 @@ f"> */ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlabad_(doublereal *small, doublereal *large) +/* Subroutine */ void dlabad_(doublereal *small, doublereal *large) { /* -- LAPACK auxiliary routine (version 3.7.0) -- */ @@ -605,7 +605,7 @@ f"> */ *large = sqrt(*large); } - return 0; + return; /* End of DLABAD */ diff --git a/lapack-netlib/SRC/dlabrd.c b/lapack-netlib/SRC/dlabrd.c index 0607853d04..4697defb24 100644 --- a/lapack-netlib/SRC/dlabrd.c +++ b/lapack-netlib/SRC/dlabrd.c @@ -725,7 +725,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlabrd_(integer *m, integer *n, integer *nb, doublereal * +/* Subroutine */ void 
dlabrd_(integer *m, integer *n, integer *nb, doublereal * a, integer *lda, doublereal *d__, doublereal *e, doublereal *tauq, doublereal *taup, doublereal *x, integer *ldx, doublereal *y, integer *ldy) @@ -736,7 +736,7 @@ f"> */ /* Local variables */ integer i__; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dlarfg_(integer *, doublereal *, @@ -771,7 +771,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0) { - return 0; + return; } if (*m >= *n) { @@ -1001,7 +1001,7 @@ f"> */ /* L20: */ } } - return 0; + return; /* End of DLABRD */ diff --git a/lapack-netlib/SRC/dlacn2.c b/lapack-netlib/SRC/dlacn2.c index 58f8119b0a..d354699f08 100644 --- a/lapack-netlib/SRC/dlacn2.c +++ b/lapack-netlib/SRC/dlacn2.c @@ -651,7 +651,7 @@ f"> */ /* > ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlacn2_(integer *n, doublereal *v, doublereal *x, +/* Subroutine */ void dlacn2_(integer *n, doublereal *v, doublereal *x, integer *isgn, doublereal *est, integer *kase, integer *isave) { /* System generated locals */ @@ -663,7 +663,7 @@ f"> */ integer i__; extern doublereal dasum_(integer *, doublereal *, integer *); integer jlast; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); extern integer idamax_(integer *, doublereal *, integer *); doublereal altsgn, estold; @@ -693,7 +693,7 @@ f"> */ } *kase = 1; isave[1] = 1; - return 0; + return; } switch (isave[1]) { @@ -724,7 +724,7 @@ f"> */ } *kase = 2; isave[1] = 2; - return 0; + return; /* ................ 
ENTRY (ISAVE( 1 ) = 2) */ /* FIRST ITERATION. X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X. */ @@ -744,7 +744,7 @@ f"> */ x[isave[2]] = 1.; *kase = 1; isave[1] = 3; - return 0; + return; /* ................ ENTRY (ISAVE( 1 ) = 3) */ /* X HAS BEEN OVERWRITTEN BY A*X. */ @@ -778,7 +778,7 @@ f"> */ } *kase = 2; isave[1] = 4; - return 0; + return; /* ................ ENTRY (ISAVE( 1 ) = 4) */ /* X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X. */ @@ -804,7 +804,7 @@ f"> */ } *kase = 1; isave[1] = 5; - return 0; + return; /* ................ ENTRY (ISAVE( 1 ) = 5) */ /* X HAS BEEN OVERWRITTEN BY A*X. */ @@ -818,7 +818,7 @@ f"> */ L150: *kase = 0; - return 0; + return; /* End of DLACN2 */ diff --git a/lapack-netlib/SRC/dlacon.c b/lapack-netlib/SRC/dlacon.c index 765098a9f8..69bb8261b4 100644 --- a/lapack-netlib/SRC/dlacon.c +++ b/lapack-netlib/SRC/dlacon.c @@ -630,7 +630,7 @@ f"> */ /* > ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlacon_(integer *n, doublereal *v, doublereal *x, +/* Subroutine */ void dlacon_(integer *n, doublereal *v, doublereal *x, integer *isgn, doublereal *est, integer *kase) { /* System generated locals */ @@ -643,7 +643,7 @@ f"> */ static integer jump, i__, j; extern doublereal dasum_(integer *, doublereal *, integer *); static integer jlast; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); extern integer idamax_(integer *, doublereal *, integer *); static doublereal altsgn, estold; @@ -672,7 +672,7 @@ f"> */ } *kase = 1; jump = 1; - return 0; + return; } switch (jump) { @@ -703,7 +703,7 @@ f"> */ } *kase = 2; jump = 2; - return 0; + return; /* ................ ENTRY (JUMP = 2) */ /* FIRST ITERATION. X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X. 
*/ @@ -723,7 +723,7 @@ f"> */ x[j] = 1.; *kase = 1; jump = 3; - return 0; + return; /* ................ ENTRY (JUMP = 3) */ /* X HAS BEEN OVERWRITTEN BY A*X. */ @@ -757,7 +757,7 @@ f"> */ } *kase = 2; jump = 4; - return 0; + return; /* ................ ENTRY (JUMP = 4) */ /* X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X. */ @@ -783,7 +783,7 @@ f"> */ } *kase = 1; jump = 5; - return 0; + return; /* ................ ENTRY (JUMP = 5) */ /* X HAS BEEN OVERWRITTEN BY A*X. */ @@ -797,7 +797,7 @@ f"> */ L150: *kase = 0; - return 0; + return; /* End of DLACON */ diff --git a/lapack-netlib/SRC/dlacpy.c b/lapack-netlib/SRC/dlacpy.c index c3ee7c6ec9..b123a705a7 100644 --- a/lapack-netlib/SRC/dlacpy.c +++ b/lapack-netlib/SRC/dlacpy.c @@ -612,7 +612,7 @@ f"> */ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlacpy_(char *uplo, integer *m, integer *n, doublereal * +/* Subroutine */ void dlacpy_(char *uplo, integer *m, integer *n, doublereal * a, integer *lda, doublereal *b, integer *ldb) { /* System generated locals */ @@ -672,7 +672,7 @@ f"> */ /* L60: */ } } - return 0; + return; /* End of DLACPY */ diff --git a/lapack-netlib/SRC/dladiv.c b/lapack-netlib/SRC/dladiv.c index 7ae92a69a9..158cf1a630 100644 --- a/lapack-netlib/SRC/dladiv.c +++ b/lapack-netlib/SRC/dladiv.c @@ -602,7 +602,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dladiv_(doublereal *a, doublereal *b, doublereal *c__, +/* Subroutine */ void dladiv_(doublereal *a, doublereal *b, doublereal *c__, doublereal *d__, doublereal *p, doublereal *q) { /* System generated locals */ @@ -612,7 +612,7 @@ f"> */ doublereal s, aa, ab, bb, cc, cd, dd, be; extern doublereal dlamch_(char *); doublereal un, ov; - extern /* Subroutine */ int dladiv1_(doublereal *, doublereal *, + extern /* Subroutine */ void dladiv1_(doublereal *, doublereal *, doublereal 
*, doublereal *, doublereal *, doublereal *); doublereal eps; @@ -671,14 +671,14 @@ f"> */ *p *= s; *q *= s; - return 0; + return; /* End of DLADIV */ } /* dladiv_ */ /* > \ingroup doubleOTHERauxiliary */ -/* Subroutine */ int dladiv1_(doublereal *a, doublereal *b, doublereal *c__, +/* Subroutine */ void dladiv1_(doublereal *a, doublereal *b, doublereal *c__, doublereal *d__, doublereal *p, doublereal *q) { doublereal r__, t; @@ -702,7 +702,7 @@ f"> */ *a = -(*a); *q = dladiv2_(b, a, c__, d__, &r__, &t); - return 0; + return; /* End of DLADIV1 */ diff --git a/lapack-netlib/SRC/dlae2.c b/lapack-netlib/SRC/dlae2.c index d62de608a7..b5784baea9 100644 --- a/lapack-netlib/SRC/dlae2.c +++ b/lapack-netlib/SRC/dlae2.c @@ -613,7 +613,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlae2_(doublereal *a, doublereal *b, doublereal *c__, +/* Subroutine */ void dlae2_(doublereal *a, doublereal *b, doublereal *c__, doublereal *rt1, doublereal *rt2) { /* System generated locals */ @@ -683,7 +683,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rt1 = rt * .5; *rt2 = rt * -.5; } - return 0; + return; /* End of DLAE2 */ diff --git a/lapack-netlib/SRC/dlaebz.c b/lapack-netlib/SRC/dlaebz.c index c5efc98fee..801e1b4021 100644 --- a/lapack-netlib/SRC/dlaebz.c +++ b/lapack-netlib/SRC/dlaebz.c @@ -827,7 +827,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlaebz_(integer *ijob, integer *nitmax, integer *n, +/* Subroutine */ void dlaebz_(integer *ijob, integer *nitmax, integer *n, integer *mmax, integer *minp, integer *nbmin, doublereal *abstol, doublereal *reltol, doublereal *pivmin, doublereal *d__, doublereal * e, doublereal *e2, integer *nval, doublereal *ab, doublereal *c__, @@ -874,7 +874,7 @@ 
f"> */ *info = 0; if (*ijob < 1 || *ijob > 3) { *info = -1; - return 0; + return; } /* Initialize NAB */ @@ -912,7 +912,7 @@ f"> */ *mout = *mout + nab[ji + (nab_dim1 << 1)] - nab[ji + nab_dim1]; /* L30: */ } - return 0; + return; } /* Initialize for loop */ @@ -1028,7 +1028,7 @@ f"> */ /* L70: */ } if (*info != 0) { - return 0; + return; } kl = klnew; } else { @@ -1126,7 +1126,7 @@ f"> */ nab[ji + (nab_dim1 << 1)] = itmp1; } else { *info = *mmax + 1; - return 0; + return; } } else { @@ -1216,7 +1216,7 @@ f"> */ *info = f2cmax(i__1,0); *mout = kl; - return 0; + return; /* End of DLAEBZ */ diff --git a/lapack-netlib/SRC/dlaed0.c b/lapack-netlib/SRC/dlaed0.c index 95e39b0dfc..ccd364205e 100644 --- a/lapack-netlib/SRC/dlaed0.c +++ b/lapack-netlib/SRC/dlaed0.c @@ -690,7 +690,7 @@ f"> */ /* > at Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int dlaed0_(integer *icompq, integer *qsiz, integer *n, +/* Subroutine */ void dlaed0_(integer *icompq, integer *qsiz, integer *n, doublereal *d__, doublereal *e, doublereal *q, integer *ldq, doublereal *qstore, integer *ldqs, doublereal *work, integer *iwork, integer *info) @@ -702,31 +702,31 @@ f"> */ /* Local variables */ doublereal temp; integer curr, i__, j, k; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer iperm; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer indxq, iwrem; - extern /* Subroutine */ int dlaed1_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlaed1_(integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); 
integer iqptr; - extern /* Subroutine */ int dlaed7_(integer *, integer *, integer *, + extern /* Subroutine */ void dlaed7_(integer *, integer *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, integer *); integer tlvls, iq; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *); integer igivcl; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer igivnm, submat, curprb, subpbs, igivpt; - extern /* Subroutine */ int dsteqr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsteqr_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer curlvl, matsiz, iprmpt, smlsiz, lgn, msd2, smm1, spm1, spm2; @@ -771,13 +771,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLAED0", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } smlsiz = ilaenv_(&c__9, "DLAED0", " ", &c__0, &c__0, &c__0, &c__0, ( @@ -827,10 +827,10 @@ f"> */ temp = log((doublereal) (*n)) / log(2.); lgn = (integer) temp; - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } iprmpt = indxq + *n + 1; @@ -996,7 +996,7 @@ f"> */ *info = submat * (*n + 1) + submat + matsiz - 1; L140: - return 0; + return; /* End of DLAED0 */ diff --git a/lapack-netlib/SRC/dlaed1.c b/lapack-netlib/SRC/dlaed1.c index 70001b449e..64d2342eb2 100644 --- a/lapack-netlib/SRC/dlaed1.c +++ b/lapack-netlib/SRC/dlaed1.c @@ -677,7 +677,7 @@ f"> */ /* > Modified by Francoise Tisseur, 
University of Tennessee */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlaed1_(integer *n, doublereal *d__, doublereal *q, +/* Subroutine */ void dlaed1_(integer *n, doublereal *d__, doublereal *q, integer *ldq, integer *indxq, doublereal *rho, integer *cutpnt, doublereal *work, integer *iwork, integer *info) { @@ -686,10 +686,10 @@ f"> */ /* Local variables */ integer indx, i__, k, indxc; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer indxp; - extern /* Subroutine */ int dlaed2_(integer *, integer *, integer *, + extern /* Subroutine */ void dlaed2_(integer *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, integer *, integer *, integer *), dlaed3_(integer *, @@ -697,8 +697,9 @@ f"> */ doublereal *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, integer *); integer n1, n2, idlmda, is, iw, iz; - extern /* Subroutine */ int dlamrg_(integer *, integer *, doublereal *, - integer *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlamrg_(integer *, integer *, doublereal *, + integer *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer coltyp, iq2, zpp1; @@ -739,13 +740,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLAED1", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* The following values are integer pointers which indicate */ @@ -807,7 +808,7 @@ f"> */ } L20: - return 0; + return; /* End of DLAED1 */ diff --git a/lapack-netlib/SRC/dlaed2.c b/lapack-netlib/SRC/dlaed2.c index 51ab633bd6..bdaf0e127d 100644 --- a/lapack-netlib/SRC/dlaed2.c +++ b/lapack-netlib/SRC/dlaed2.c @@ -726,7 
+726,7 @@ f"> */ /* > Modified by Francoise Tisseur, University of Tennessee */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlaed2_(integer *k, integer *n, integer *n1, doublereal * +/* Subroutine */ void dlaed2_(integer *k, integer *n, integer *n1, doublereal * d__, doublereal *q, integer *ldq, integer *indxq, doublereal *rho, doublereal *z__, doublereal *dlamda, doublereal *w, doublereal *q2, integer *indx, integer *indxc, integer *indxp, integer *coltyp, @@ -738,13 +738,13 @@ f"> */ /* Local variables */ integer imax, jmax; - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer ctot[4]; doublereal c__; integer i__, j; doublereal s, t; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer k2, n2; @@ -753,9 +753,10 @@ f"> */ extern doublereal dlamch_(char *); integer pj, js; extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int dlamrg_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dlamrg_(integer *, integer *, doublereal *, integer *, integer *, integer *), dlacpy_(char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer iq1, iq2, n1p1; doublereal eps, tau, tol; integer psm[4]; @@ -804,13 +805,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLAED2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } n2 = *n - *n1; @@ -1111,7 +1112,7 @@ f"> */ } L190: - return 0; + 
return; /* End of DLAED2 */ diff --git a/lapack-netlib/SRC/dlaed3.c b/lapack-netlib/SRC/dlaed3.c index c2fe61d9be..b4fe8ce6cb 100644 --- a/lapack-netlib/SRC/dlaed3.c +++ b/lapack-netlib/SRC/dlaed3.c @@ -700,7 +700,7 @@ f"> */ /* > Modified by Francoise Tisseur, University of Tennessee */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlaed3_(integer *k, integer *n, integer *n1, doublereal * +/* Subroutine */ void dlaed3_(integer *k, integer *n, integer *n1, doublereal * d__, doublereal *q, integer *ldq, doublereal *rho, doublereal *dlamda, doublereal *q2, integer *indx, integer *ctot, doublereal *w, doublereal *s, integer *info) @@ -713,7 +713,7 @@ f"> */ doublereal temp; extern doublereal dnrm2_(integer *, doublereal *, integer *); integer i__, j; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dcopy_(integer *, doublereal *, integer *, doublereal *, integer @@ -722,10 +722,11 @@ f"> */ integer n2; extern doublereal dlamc3_(doublereal *, doublereal *); integer n12, ii, n23; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *), dlaset_(char *, integer *, integer *, doublereal *, doublereal *, - doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer iq2; @@ -765,13 +766,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLAED3", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*k == 0) { - return 0; + return; } /* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can */ @@ -902,7 +903,7 @@ f"> */ L120: - return 0; + return; /* 
End of DLAED3 */ diff --git a/lapack-netlib/SRC/dlaed4.c b/lapack-netlib/SRC/dlaed4.c index 3e0e3f8be2..13a4006565 100644 --- a/lapack-netlib/SRC/dlaed4.c +++ b/lapack-netlib/SRC/dlaed4.c @@ -654,7 +654,7 @@ f"> */ /* > at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlaed4_(integer *n, integer *i__, doublereal *d__, +/* Subroutine */ void dlaed4_(integer *n, integer *i__, doublereal *d__, doublereal *z__, doublereal *delta, doublereal *rho, doublereal *dlam, integer *info) { @@ -670,7 +670,7 @@ f"> */ doublereal w, dltlb, dltub, midpt; integer niter; logical swtch; - extern /* Subroutine */ int dlaed5_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlaed5_(integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *), dlaed6_(integer *, logical *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *); @@ -712,11 +712,11 @@ f"> */ *dlam = d__[1] + *rho * z__[1] * z__[1]; delta[1] = 1.; - return 0; + return; } if (*n == 2) { dlaed5_(i__, &d__[1], &z__[1], &delta[1], rho, dlam); - return 0; + return; } /* Compute machine epsilon */ @@ -1495,7 +1495,7 @@ f"> */ L250: - return 0; + return; /* End of DLAED4 */ diff --git a/lapack-netlib/SRC/dlaed4.f b/lapack-netlib/SRC/dlaed4.f index 3ee3ef920f..b51e23d850 100644 --- a/lapack-netlib/SRC/dlaed4.f +++ b/lapack-netlib/SRC/dlaed4.f @@ -328,9 +328,12 @@ SUBROUTINE DLAED4( N, I, D, Z, DELTA, RHO, DLAM, INFO ) IF( C.LT.ZERO ) $ C = ABS( C ) IF( C.EQ.ZERO ) THEN -* ETA = B/A +* ETA = B/A * ETA = RHO - TAU - ETA = DLTUB - TAU +* ETA = DLTUB - TAU +* +* Update proposed by Li, Ren-Cang: + ETA = -W / ( DPSI+DPHI ) ELSE IF( A.GE.ZERO ) THEN ETA = ( A+SQRT( ABS( A*A-FOUR*B*C ) ) ) / ( TWO*C ) ELSE diff --git a/lapack-netlib/SRC/dlaed5.c b/lapack-netlib/SRC/dlaed5.c index f714b4ffc9..4b76eec7ba 100644 --- a/lapack-netlib/SRC/dlaed5.c +++ b/lapack-netlib/SRC/dlaed5.c @@ -617,7 +617,7 @@ f"> 
*/ /* > at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlaed5_(integer *i__, doublereal *d__, doublereal *z__, +/* Subroutine */ void dlaed5_(integer *i__, doublereal *d__, doublereal *z__, doublereal *delta, doublereal *rho, doublereal *dlam) { /* System generated locals */ @@ -688,7 +688,7 @@ f"> */ delta[1] /= temp; delta[2] /= temp; } - return 0; + return; /* End OF DLAED5 */ diff --git a/lapack-netlib/SRC/dlaed6.c b/lapack-netlib/SRC/dlaed6.c index 09dc390006..2e0ea1d331 100644 --- a/lapack-netlib/SRC/dlaed6.c +++ b/lapack-netlib/SRC/dlaed6.c @@ -649,7 +649,7 @@ f"> */ /* > at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlaed6_(integer *kniter, logical *orgati, doublereal * +/* Subroutine */ void dlaed6_(integer *kniter, logical *orgati, doublereal * rho, doublereal *d__, doublereal *z__, doublereal *finit, doublereal * tau, integer *info) { @@ -928,7 +928,7 @@ f"> */ if (scale) { *tau *= sclinv; } - return 0; + return; /* End of DLAED6 */ diff --git a/lapack-netlib/SRC/dlaed7.c b/lapack-netlib/SRC/dlaed7.c index fd85152614..6826f29a59 100644 --- a/lapack-netlib/SRC/dlaed7.c +++ b/lapack-netlib/SRC/dlaed7.c @@ -775,7 +775,7 @@ f"> */ /* > at Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int dlaed7_(integer *icompq, integer *n, integer *qsiz, +/* Subroutine */ void dlaed7_(integer *icompq, integer *n, integer *qsiz, integer *tlvls, integer *curlvl, integer *curpbm, doublereal *d__, doublereal *q, integer *ldq, integer *indxq, doublereal *rho, integer *cutpnt, doublereal *qstore, integer *qptr, integer *prmptr, integer * @@ -787,11 +787,11 @@ f"> */ /* Local variables */ integer indx, curr, i__, k; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer 
*, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer indxc, indxp, n1, n2; - extern /* Subroutine */ int dlaed8_(integer *, integer *, integer *, + extern /* Subroutine */ void dlaed8_(integer *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, @@ -803,8 +803,9 @@ f"> */ *, doublereal *, integer *, doublereal *, doublereal *, integer *) ; integer idlmda, is, iw, iz; - extern /* Subroutine */ int dlamrg_(integer *, integer *, doublereal *, - integer *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlamrg_(integer *, integer *, doublereal *, + integer *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer coltyp, iq2, ptr, ldq2; @@ -852,13 +853,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLAED7", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* The following values are for bookkeeping purposes only. They are */ @@ -885,11 +886,11 @@ f"> */ /* Form the z-vector which consists of the last row of Q_1 and the */ /* first row of Q_2. 
*/ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = *tlvls - i__; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L10: */ } curr = ptr + *curpbm; @@ -948,7 +949,7 @@ f"> */ } L30: - return 0; + return; /* End of DLAED7 */ diff --git a/lapack-netlib/SRC/dlaed8.c b/lapack-netlib/SRC/dlaed8.c index 5224a8ca21..e18ae05e00 100644 --- a/lapack-netlib/SRC/dlaed8.c +++ b/lapack-netlib/SRC/dlaed8.c @@ -756,7 +756,7 @@ f"> */ /* > at Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int dlaed8_(integer *icompq, integer *k, integer *n, integer +/* Subroutine */ void dlaed8_(integer *icompq, integer *k, integer *n, integer *qsiz, doublereal *d__, doublereal *q, integer *ldq, integer *indxq, doublereal *rho, integer *cutpnt, doublereal *z__, doublereal *dlamda, doublereal *q2, integer *ldq2, doublereal *w, integer *perm, integer @@ -769,21 +769,22 @@ f"> */ /* Local variables */ integer jlam, imax, jmax; - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); doublereal c__; integer i__, j; doublereal s, t; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer k2, n1, n2; extern doublereal dlapy2_(doublereal *, doublereal *), dlamch_(char *); integer jp; extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int dlamrg_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dlamrg_(integer *, integer *, doublereal *, integer *, integer *, integer *), dlacpy_(char *, integer *, - integer *, doublereal *, integer *, doublereal *, integer *), xerbla_(char *, 
integer *, ftnlen); + integer *, doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer n1p1; doublereal eps, tau, tol; @@ -837,7 +838,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLAED8", &i__1, (ftnlen)6); - return 0; + return; } /* Need to initialize GIVPTR to O here in case of quick exit */ @@ -850,7 +851,7 @@ f"> */ /* Quick return if possible */ if (*n == 0) { - return 0; + return; } n1 = *cutpnt; @@ -924,7 +925,7 @@ f"> */ } dlacpy_("A", qsiz, n, &q2[q2_dim1 + 1], ldq2, &q[q_dim1 + 1], ldq); } - return 0; + return; } /* If there are multiple eigenvalues then the problem deflates. Here */ @@ -1075,7 +1076,7 @@ f"> */ } } - return 0; + return; /* End of DLAED8 */ diff --git a/lapack-netlib/SRC/dlaed9.c b/lapack-netlib/SRC/dlaed9.c index 9e2bbde183..a2ccbd483f 100644 --- a/lapack-netlib/SRC/dlaed9.c +++ b/lapack-netlib/SRC/dlaed9.c @@ -669,7 +669,7 @@ f"> */ /* > at Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int dlaed9_(integer *k, integer *kstart, integer *kstop, +/* Subroutine */ void dlaed9_(integer *k, integer *kstart, integer *kstop, integer *n, doublereal *d__, doublereal *q, integer *ldq, doublereal * rho, doublereal *dlamda, doublereal *w, doublereal *s, integer *lds, integer *info) @@ -682,7 +682,7 @@ f"> */ doublereal temp; extern doublereal dnrm2_(integer *, doublereal *, integer *); integer i__, j; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dlaed4_(integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *); @@ -731,13 +731,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLAED9", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*k == 0) { - return 0; + return; } /* Modify values DLAMDA(i) to make sure all 
DLAMDA(i)-DLAMDA(j) can */ @@ -837,7 +837,7 @@ f"> */ } L120: - return 0; + return; /* End of DLAED9 */ diff --git a/lapack-netlib/SRC/dlaeda.c b/lapack-netlib/SRC/dlaeda.c index f4bb214d3c..fc992d8799 100644 --- a/lapack-netlib/SRC/dlaeda.c +++ b/lapack-netlib/SRC/dlaeda.c @@ -682,7 +682,7 @@ f"> */ /* > at Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int dlaeda_(integer *n, integer *tlvls, integer *curlvl, +/* Subroutine */ void dlaeda_(integer *n, integer *tlvls, integer *curlvl, integer *curpbm, integer *prmptr, integer *perm, integer *givptr, integer *givcol, doublereal *givnum, doublereal *q, integer *qptr, doublereal *z__, doublereal *ztemp, integer *info) @@ -691,14 +691,14 @@ f"> */ integer i__1, i__2, i__3; /* Local variables */ - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer curr, bsiz1, bsiz2, psiz1, psiz2, i__, k, zptr1; - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dcopy_(integer *, - doublereal *, integer *, doublereal *, integer *), xerbla_(char *, - integer *, ftnlen); + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer mid, ptr; @@ -733,13 +733,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLAEDA", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine location of first number in second half. 
*/ @@ -754,7 +754,7 @@ f"> */ /* scheme */ i__1 = *curlvl - 1; - curr = ptr + *curpbm * pow_ii(&c__2, curlvl) + pow_ii(&c__2, &i__1) - 1; + curr = ptr + *curpbm * pow_ii(c__2, *curlvl) + pow_ii(c__2, i__1) - 1; /* Determine size of these matrices. We add HALF to the value of */ /* the SQRT in case the machine underestimates one of these square */ @@ -781,12 +781,12 @@ f"> */ /* rotations and permutation and then multiplying the center matrices */ /* against the current Z. */ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (k = 1; k <= i__1; ++k) { i__2 = *curlvl - k; i__3 = *curlvl - k - 1; - curr = ptr + *curpbm * pow_ii(&c__2, &i__2) + pow_ii(&c__2, &i__3) - + curr = ptr + *curpbm * pow_ii(c__2, i__2) + pow_ii(c__2, i__3) - 1; psiz1 = prmptr[curr + 1] - prmptr[curr]; psiz2 = prmptr[curr + 2] - prmptr[curr + 1]; @@ -847,11 +847,11 @@ f"> */ c__1); i__2 = *tlvls - k; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L70: */ } - return 0; + return; /* End of DLAEDA */ diff --git a/lapack-netlib/SRC/dlaein.c b/lapack-netlib/SRC/dlaein.c index 93134f4b7d..009df2142b 100644 --- a/lapack-netlib/SRC/dlaein.c +++ b/lapack-netlib/SRC/dlaein.c @@ -685,7 +685,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlaein_(logical *rightv, logical *noinit, integer *n, +/* Subroutine */ void dlaein_(logical *rightv, logical *noinit, integer *n, doublereal *h__, integer *ldh, doublereal *wr, doublereal *wi, doublereal *vr, doublereal *vi, doublereal *b, integer *ldb, doublereal *work, doublereal *eps3, doublereal *smlnum, doublereal * @@ -700,7 +700,7 @@ f"> */ doublereal temp, norm, vmax; extern doublereal dnrm2_(integer *, doublereal *, integer *); integer i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); 
doublereal scale, w, x, y; extern doublereal dasum_(integer *, doublereal *, integer *); @@ -711,10 +711,10 @@ f"> */ extern doublereal dlapy2_(doublereal *, doublereal *); doublereal ei, ej, absbii, absbjj, xi; extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int dladiv_(doublereal *, doublereal *, + extern /* Subroutine */ void dladiv_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal xr; - extern /* Subroutine */ int dlatrs_(char *, char *, char *, char *, + extern /* Subroutine */ void dlatrs_(char *, char *, char *, char *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *); char normin[1]; @@ -1252,7 +1252,7 @@ f"> */ } - return 0; + return; /* End of DLAEIN */ diff --git a/lapack-netlib/SRC/dlaev2.c b/lapack-netlib/SRC/dlaev2.c index 8140108f6e..685a62fa8f 100644 --- a/lapack-netlib/SRC/dlaev2.c +++ b/lapack-netlib/SRC/dlaev2.c @@ -631,7 +631,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlaev2_(doublereal *a, doublereal *b, doublereal *c__, +/* Subroutine */ void dlaev2_(doublereal *a, doublereal *b, doublereal *c__, doublereal *rt1, doublereal *rt2, doublereal *cs1, doublereal *sn1) { /* System generated locals */ @@ -735,7 +735,7 @@ f"> */ *cs1 = -(*sn1); *sn1 = tn; } - return 0; + return; /* End of DLAEV2 */ diff --git a/lapack-netlib/SRC/dlaexc.c b/lapack-netlib/SRC/dlaexc.c index 38c73491a1..be0c45dd9d 100644 --- a/lapack-netlib/SRC/dlaexc.c +++ b/lapack-netlib/SRC/dlaexc.c @@ -656,7 +656,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlaexc_(logical *wantq, integer *n, doublereal *t, +/* Subroutine */ void dlaexc_(logical *wantq, integer *n, doublereal *t, integer *ldt, doublereal *q, integer *ldq, integer *j1, integer *n1, integer *n2, 
doublereal *work, integer *info) { @@ -667,14 +667,14 @@ f"> */ /* Local variables */ integer ierr; doublereal temp; - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); doublereal d__[16] /* was [4][4] */; integer k; doublereal u[3], scale, x[4] /* was [2][2] */, dnorm; integer j2, j3, j4; doublereal xnorm, u1[3], u2[3]; - extern /* Subroutine */ int dlanv2_(doublereal *, doublereal *, + extern /* Subroutine */ void dlanv2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *), dlasy2_( logical *, logical *, integer *, integer *, integer *, doublereal @@ -686,10 +686,10 @@ f"> */ doublereal t33; extern doublereal dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlarfg_(integer *, doublereal *, doublereal *, integer *, doublereal *); doublereal sn; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *), dlartg_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *), dlarfx_(char *, integer *, integer *, doublereal *, @@ -721,10 +721,10 @@ f"> */ /* Quick return if possible */ if (*n == 0 || *n1 == 0 || *n2 == 0) { - return 0; + return; } if (*j1 + *n1 > *n) { - return 0; + return; } j2 = *j1 + 1; @@ -1010,13 +1010,13 @@ f"> */ } } - return 0; + return; /* Exit with INFO = 1 if swap was rejected. 
*/ L50: *info = 1; - return 0; + return; /* End of DLAEXC */ diff --git a/lapack-netlib/SRC/dlag2.c b/lapack-netlib/SRC/dlag2.c index e466fc876f..bddbf647bc 100644 --- a/lapack-netlib/SRC/dlag2.c +++ b/lapack-netlib/SRC/dlag2.c @@ -665,7 +665,7 @@ ssary to avoid over-/underflow. */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlag2_(doublereal *a, integer *lda, doublereal *b, +/* Subroutine */ void dlag2_(doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *safmin, doublereal *scale1, doublereal * scale2, doublereal *wr1, doublereal *wr2, doublereal *wi) { @@ -914,6 +914,6 @@ ssary to avoid over-/underflow. */ /* End of DLAG2 */ - return 0; + return; } /* dlag2_ */ diff --git a/lapack-netlib/SRC/dlag2s.c b/lapack-netlib/SRC/dlag2s.c index 8cbfb62e9d..83f47eab7d 100644 --- a/lapack-netlib/SRC/dlag2s.c +++ b/lapack-netlib/SRC/dlag2s.c @@ -617,7 +617,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlag2s_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dlag2s_(integer *m, integer *n, doublereal *a, integer * lda, real *sa, integer *ldsa, integer *info) { /* System generated locals */ @@ -663,7 +663,7 @@ f"> */ } *info = 0; L30: - return 0; + return; /* End of DLAG2S */ diff --git a/lapack-netlib/SRC/dlag2s.f b/lapack-netlib/SRC/dlag2s.f index e5a9302238..9e6dead49e 100644 --- a/lapack-netlib/SRC/dlag2s.f +++ b/lapack-netlib/SRC/dlag2s.f @@ -34,8 +34,8 @@ *> *> \verbatim *> -*> DLAG2S converts a DOUBLE PRECISION matrix, SA, to a SINGLE -*> PRECISION matrix, A. +*> DLAG2S converts a DOUBLE PRECISION matrix, A, to a SINGLE +*> PRECISION matrix, SA. 
*> *> RMAX is the overflow for the SINGLE PRECISION arithmetic *> DLAG2S checks that all the entries of A are between -RMAX and @@ -128,6 +128,9 @@ SUBROUTINE DLAG2S( M, N, A, LDA, SA, LDSA, INFO ) REAL SLAMCH EXTERNAL SLAMCH * .. +* .. Intrinsic Functions .. + INTRINSIC REAL +* .. * .. Executable Statements .. * RMAX = SLAMCH( 'O' ) @@ -137,7 +140,7 @@ SUBROUTINE DLAG2S( M, N, A, LDA, SA, LDSA, INFO ) INFO = 1 GO TO 30 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = REAL( A( I, J ) ) 10 CONTINUE 20 CONTINUE INFO = 0 diff --git a/lapack-netlib/SRC/dlags2.c b/lapack-netlib/SRC/dlags2.c index a4bd98b80b..023660b914 100644 --- a/lapack-netlib/SRC/dlags2.c +++ b/lapack-netlib/SRC/dlags2.c @@ -663,7 +663,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlags2_(logical *upper, doublereal *a1, doublereal *a2, +/* Subroutine */ void dlags2_(logical *upper, doublereal *a1, doublereal *a2, doublereal *a3, doublereal *b1, doublereal *b2, doublereal *b3, doublereal *csu, doublereal *snu, doublereal *csv, doublereal *snv, doublereal *csq, doublereal *snq) @@ -674,7 +674,7 @@ f"> */ /* Local variables */ doublereal aua11, aua12, aua21, aua22, avb11, avb12, avb21, avb22, ua11r, ua22r, vb11r, vb22r, a, b, c__, d__, r__, s1, s2; - extern /* Subroutine */ int dlasv2_(doublereal *, doublereal *, + extern /* Subroutine */ void dlasv2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *), dlartg_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); @@ -867,7 +867,7 @@ f"> */ } - return 0; + return; /* End of DLAGS2 */ diff --git a/lapack-netlib/SRC/dlagtf.c b/lapack-netlib/SRC/dlagtf.c index 3d0e92bf54..1d55872e48 100644 --- a/lapack-netlib/SRC/dlagtf.c +++ b/lapack-netlib/SRC/dlagtf.c @@ -666,7 +666,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* 
===================================================================== */ -/* Subroutine */ int dlagtf_(integer *n, doublereal *a, doublereal *lambda, +/* Subroutine */ void dlagtf_(integer *n, doublereal *a, doublereal *lambda, doublereal *b, doublereal *c__, doublereal *tol, doublereal *d__, integer *in, integer *info) { @@ -706,11 +706,11 @@ f"> */ *info = -1; i__1 = -(*info); xerbla_("DLAGTF", &i__1, (ftnlen)6); - return 0; + return; } if (*n == 0) { - return 0; + return; } a[1] -= *lambda; @@ -719,7 +719,7 @@ f"> */ if (a[1] == 0.) { in[1] = 1; } - return 0; + return; } eps = dlamch_("Epsilon"); @@ -778,7 +778,7 @@ f"> */ in[*n] = *n; } - return 0; + return; /* End of DLAGTF */ diff --git a/lapack-netlib/SRC/dlagtm.c b/lapack-netlib/SRC/dlagtm.c index ce7364ab04..7bce48edd4 100644 --- a/lapack-netlib/SRC/dlagtm.c +++ b/lapack-netlib/SRC/dlagtm.c @@ -654,7 +654,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlagtm_(char *trans, integer *n, integer *nrhs, +/* Subroutine */ void dlagtm_(char *trans, integer *n, integer *nrhs, doublereal *alpha, doublereal *dl, doublereal *d__, doublereal *du, doublereal *x, integer *ldx, doublereal *beta, doublereal *b, integer *ldb) @@ -689,7 +689,7 @@ f"> */ /* Function Body */ if (*n == 0) { - return 0; + return; } /* Multiply B by BETA if BETA.NE.1. 
*/ @@ -821,7 +821,7 @@ f"> */ } } } - return 0; + return; /* End of DLAGTM */ diff --git a/lapack-netlib/SRC/dlagts.c b/lapack-netlib/SRC/dlagts.c index 998b0cddef..31cac23064 100644 --- a/lapack-netlib/SRC/dlagts.c +++ b/lapack-netlib/SRC/dlagts.c @@ -671,7 +671,7 @@ f"> */ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlagts_(integer *job, integer *n, doublereal *a, +/* Subroutine */ void dlagts_(integer *job, integer *n, doublereal *a, doublereal *b, doublereal *c__, doublereal *d__, integer *in, doublereal *y, doublereal *tol, integer *info) { @@ -715,11 +715,11 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLAGTS", &i__1, (ftnlen)6); - return 0; + return; } if (*n == 0) { - return 0; + return; } eps = dlamch_("Epsilon"); @@ -778,14 +778,14 @@ f"> */ if (absak < sfmin) { if (absak == 0. || abs(temp) * sfmin > absak) { *info = k; - return 0; + return; } else { temp *= bignum; ak *= bignum; } } else if (abs(temp) > absak * bignum) { *info = k; - return 0; + return; } } y[k] = temp / ak; @@ -844,14 +844,14 @@ f"> */ if (absak < sfmin) { if (absak == 0. || abs(temp) * sfmin > absak) { *info = k; - return 0; + return; } else { temp *= bignum; ak *= bignum; } } else if (abs(temp) > absak * bignum) { *info = k; - return 0; + return; } } y[k] = temp / ak; @@ -906,6 +906,6 @@ f"> */ /* End of DLAGTS */ - return 0; + return; } /* dlagts_ */ diff --git a/lapack-netlib/SRC/dlagv2.c b/lapack-netlib/SRC/dlagv2.c index 19d838aed5..e89877f5e5 100644 --- a/lapack-netlib/SRC/dlagv2.c +++ b/lapack-netlib/SRC/dlagv2.c @@ -671,7 +671,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. 
of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int dlagv2_(doublereal *a, integer *lda, doublereal *b, +/* Subroutine */ void dlagv2_(doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *alphar, doublereal *alphai, doublereal * beta, doublereal *csl, doublereal *snl, doublereal *csr, doublereal * snr) @@ -681,20 +681,20 @@ f"> */ doublereal d__1, d__2, d__3, d__4, d__5, d__6; /* Local variables */ - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *), dlag2_( doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal r__, t, anorm, bnorm, h1, h2, h3, scale1, scale2; - extern /* Subroutine */ int dlasv2_(doublereal *, doublereal *, + extern /* Subroutine */ void dlasv2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); extern doublereal dlapy2_(doublereal *, doublereal *); doublereal ascale, bscale; extern doublereal dlamch_(char *); doublereal wi, qq, rr, safmin; - extern /* Subroutine */ int dlartg_(doublereal *, doublereal *, + extern /* Subroutine */ void dlartg_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal wr1, wr2, ulp; @@ -913,7 +913,7 @@ f"> */ beta[2] = 1.; } - return 0; + return; /* End of DLAGV2 */ diff --git a/lapack-netlib/SRC/dlahqr.c b/lapack-netlib/SRC/dlahqr.c index a27f823090..6b78231cc5 100644 --- a/lapack-netlib/SRC/dlahqr.c +++ b/lapack-netlib/SRC/dlahqr.c @@ -720,7 +720,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlahqr_(logical *wantt, logical *wantz, integer *n, +/* Subroutine */ void dlahqr_(logical *wantt, logical *wantz, 
integer *n, integer *ilo, integer *ihi, doublereal *h__, integer *ldh, doublereal *wr, doublereal *wi, integer *iloz, integer *ihiz, doublereal *z__, integer *ldz, integer *info) @@ -730,23 +730,23 @@ f"> */ doublereal d__1, d__2, d__3, d__4; /* Local variables */ - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer i__, j, k, l, m; doublereal s, v[3]; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer itmax, i1, i2; doublereal t1, t2, t3, v2, v3; - extern /* Subroutine */ int dlanv2_(doublereal *, doublereal *, + extern /* Subroutine */ void dlanv2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal aa, ab, ba, bb; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); doublereal h11, h12, h21, h22, cs; integer nh; extern doublereal dlamch_(char *); - extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlarfg_(integer *, doublereal *, doublereal *, integer *, doublereal *); doublereal sn; integer nr; @@ -782,12 +782,12 @@ f"> */ /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*ilo == *ihi) { wr[*ilo] = h__[*ilo + *ilo * h_dim1]; wi[*ilo] = 0.; - return 0; + return; } /* ==== clear out the trash ==== */ @@ -1156,7 +1156,7 @@ f"> */ /* Failure to converge in remaining number of iterations */ *info = i__; - return 0; + return; L150: @@ -1206,7 +1206,7 @@ f"> */ goto L20; L160: - return 0; + return; /* End of DLAHQR */ diff --git a/lapack-netlib/SRC/dlahr2.c b/lapack-netlib/SRC/dlahr2.c index 92fbb38b2f..c4593e671f 100644 --- 
a/lapack-netlib/SRC/dlahr2.c +++ b/lapack-netlib/SRC/dlahr2.c @@ -700,7 +700,7 @@ f"> */ /* > Mathematical Software, 32(2):180-194, June 2006. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlahr2_(integer *n, integer *k, integer *nb, doublereal * +/* Subroutine */ void dlahr2_(integer *n, integer *k, integer *nb, doublereal * a, integer *lda, doublereal *tau, doublereal *t, integer *ldt, doublereal *y, integer *ldy) { @@ -711,7 +711,7 @@ f"> */ /* Local variables */ integer i__; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dgemm_(char *, char *, integer *, integer *, integer * , doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dgemv_( @@ -724,7 +724,7 @@ f"> */ dtrmv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *); doublereal ei; - extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlarfg_(integer *, doublereal *, doublereal *, integer *, doublereal *), dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *); @@ -754,7 +754,7 @@ f"> */ /* Function Body */ if (*n <= 1) { - return 0; + return; } i__1 = *nb; @@ -882,7 +882,7 @@ f"> */ dtrmm_("RIGHT", "Upper", "NO TRANSPOSE", "NON-UNIT", k, nb, &c_b5, &t[ t_offset], ldt, &y[y_offset], ldy); - return 0; + return; /* End of DLAHR2 */ diff --git a/lapack-netlib/SRC/dlaic1.c b/lapack-netlib/SRC/dlaic1.c index a18af5a1fa..5aff47ddaf 100644 --- a/lapack-netlib/SRC/dlaic1.c +++ b/lapack-netlib/SRC/dlaic1.c @@ -648,7 +648,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlaic1_(integer *job, integer *j, doublereal *x, +/* Subroutine */ void dlaic1_(integer *job, integer *j, 
doublereal *x, doublereal *sest, doublereal *w, doublereal *gamma, doublereal * sestpr, doublereal *s, doublereal *c__) { @@ -704,7 +704,7 @@ f"> */ *c__ /= tmp; *sestpr = s1 * tmp; } - return 0; + return; } else if (absgam <= eps * absest) { *s = 1.; *c__ = 0.; @@ -712,7 +712,7 @@ f"> */ s1 = absest / tmp; s2 = absalp / tmp; *sestpr = tmp * sqrt(s1 * s1 + s2 * s2); - return 0; + return; } else if (absalp <= eps * absest) { s1 = absgam; s2 = absest; @@ -725,7 +725,7 @@ f"> */ *c__ = 1.; *sestpr = s1; } - return 0; + return; } else if (absest <= eps * absalp || absest <= eps * absgam) { s1 = absgam; s2 = absalp; @@ -742,7 +742,7 @@ f"> */ *s = alpha / s1 / *c__; *c__ = d_sign(&c_b5, gamma) / *c__; } - return 0; + return; } else { /* normal case */ @@ -764,7 +764,7 @@ f"> */ *s = sine / tmp; *c__ = cosine / tmp; *sestpr = sqrt(t + 1.) * absest; - return 0; + return; } } else if (*job == 2) { @@ -790,12 +790,12 @@ f"> */ tmp = sqrt(*s * *s + *c__ * *c__); *s /= tmp; *c__ /= tmp; - return 0; + return; } else if (absgam <= eps * absest) { *s = 0.; *c__ = 1.; *sestpr = absgam; - return 0; + return; } else if (absalp <= eps * absest) { s1 = absgam; s2 = absest; @@ -808,7 +808,7 @@ f"> */ *c__ = 0.; *sestpr = s2; } - return 0; + return; } else if (absest <= eps * absalp || absest <= eps * absgam) { s1 = absgam; s2 = absalp; @@ -825,7 +825,7 @@ f"> */ *c__ = alpha / s1 / *s; *s = -d_sign(&c_b5, gamma) / *s; } - return 0; + return; } else { /* normal case */ @@ -869,11 +869,11 @@ f"> */ tmp = sqrt(sine * sine + cosine * cosine); *s = sine / tmp; *c__ = cosine / tmp; - return 0; + return; } } - return 0; + return; /* End of DLAIC1 */ diff --git a/lapack-netlib/SRC/dlaln2.c b/lapack-netlib/SRC/dlaln2.c index 624ea2f98a..9964063b83 100644 --- a/lapack-netlib/SRC/dlaln2.c +++ b/lapack-netlib/SRC/dlaln2.c @@ -726,7 +726,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int 
dlaln2_(logical *ltrans, integer *na, integer *nw, +/* Subroutine */ void dlaln2_(logical *ltrans, integer *na, integer *nw, doublereal *smin, doublereal *ca, doublereal *a, integer *lda, doublereal *d1, doublereal *d2, doublereal *b, integer *ldb, doublereal *wr, doublereal *wi, doublereal *x, integer *ldx, @@ -753,7 +753,7 @@ f"> */ #define ci (equiv_0) #define cr (equiv_1) extern doublereal dlamch_(char *); - extern /* Subroutine */ int dladiv_(doublereal *, doublereal *, + extern /* Subroutine */ void dladiv_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal bignum, bi1, bi2, br1, br2, smlnum, xi1, xi2, xr1, xr2, ci21, ci22, cr21, cr22, li21, csi, ui11, lr21, ui12, ui22; @@ -920,7 +920,7 @@ f"> */ x[x_dim1 + 2] = temp * b[b_dim1 + 2]; *xnorm = temp * bnorm; *info = 1; - return 0; + return; } /* Gaussian elimination with complete pivoting. */ @@ -1023,7 +1023,7 @@ f"> */ x[(x_dim1 << 1) + 2] = temp * b[(b_dim1 << 1) + 2]; *xnorm = temp * bnorm; *info = 1; - return 0; + return; } /* Gaussian elimination with complete pivoting. 
*/ @@ -1142,7 +1142,7 @@ f"> */ } } - return 0; + return; /* End of DLALN2 */ diff --git a/lapack-netlib/SRC/dlals0.c b/lapack-netlib/SRC/dlals0.c index 7fa2ff247e..3fb8b931e9 100644 --- a/lapack-netlib/SRC/dlals0.c +++ b/lapack-netlib/SRC/dlals0.c @@ -783,7 +783,7 @@ f"> */ /* > Osni Marques, LBNL/NERSC, USA \n */ /* ===================================================================== */ -/* Subroutine */ int dlals0_(integer *icompq, integer *nl, integer *nr, +/* Subroutine */ void dlals0_(integer *icompq, integer *nl, integer *nr, integer *sqre, integer *nrhs, doublereal *b, integer *ldb, doublereal *bx, integer *ldbx, integer *perm, integer *givptr, integer *givcol, integer *ldgcol, doublereal *givnum, integer *ldgnum, doublereal * @@ -798,24 +798,24 @@ f"> */ /* Local variables */ doublereal temp; - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); extern doublereal dnrm2_(integer *, doublereal *, integer *); integer i__, j, m, n; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal diflj, difrj, dsigj; - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); extern doublereal dlamc3_(doublereal *, doublereal *); doublereal dj; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer - *, doublereal *, integer *, doublereal *, integer *), - 
xerbla_(char *, integer *, ftnlen); + *, doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal dsigjp; integer nlp1; @@ -886,7 +886,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLALS0", &i__1, (ftnlen)6); - return 0; + return; } m = n + *sqre; @@ -1070,7 +1070,7 @@ f"> */ } } - return 0; + return; /* End of DLALS0 */ diff --git a/lapack-netlib/SRC/dlalsa.c b/lapack-netlib/SRC/dlalsa.c index 891ed66a8b..2d20aaf8fa 100644 --- a/lapack-netlib/SRC/dlalsa.c +++ b/lapack-netlib/SRC/dlalsa.c @@ -779,7 +779,7 @@ f"> */ /* > Osni Marques, LBNL/NERSC, USA \n */ /* ===================================================================== */ -/* Subroutine */ int dlalsa_(integer *icompq, integer *smlsiz, integer *n, +/* Subroutine */ void dlalsa_(integer *icompq, integer *smlsiz, integer *n, integer *nrhs, doublereal *b, integer *ldb, doublereal *bx, integer * ldbx, doublereal *u, integer *ldu, doublereal *vt, integer *k, doublereal *difl, doublereal *difr, doublereal *z__, doublereal * @@ -796,23 +796,23 @@ f"> */ /* Local variables */ integer nlvl, sqre, i__, j; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer inode, ndiml, ndimr; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer i1; - extern /* Subroutine */ int dlals0_(integer *, integer *, integer *, + extern /* Subroutine */ void dlals0_(integer *, integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, 
doublereal *, integer *); integer ic, lf, nd, ll, nl, nr; - extern /* Subroutine */ int dlasdt_(integer *, integer *, integer *, - integer *, integer *, integer *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void dlasdt_(integer *, integer *, integer *, + integer *, integer *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer im1, nlf, nrf, lvl, ndb1, nlp1, lvl2, nrp1; @@ -891,7 +891,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLALSA", &i__1, (ftnlen)6); - return 0; + return; } /* Book-keeping and setting up the computation tree. */ @@ -951,7 +951,7 @@ f"> */ /* Finally go through the left singular vector matrices of all */ /* the other subproblems bottom-up on the tree. */ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); sqre = 0; for (lvl = nlvl; lvl >= 1; --lvl) { @@ -965,7 +965,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; @@ -1010,7 +1010,7 @@ f"> */ ll = 1; } else { i__2 = lvl - 1; - lf = pow_ii(&c__2, &i__2); + lf = pow_ii(c__2, i__2); ll = (lf << 1) - 1; } i__2 = lf; @@ -1067,7 +1067,7 @@ f"> */ L90: - return 0; + return; /* End of DLALSA */ diff --git a/lapack-netlib/SRC/dlalsd.c b/lapack-netlib/SRC/dlalsd.c index 527810599f..6be6155b72 100644 --- a/lapack-netlib/SRC/dlalsd.c +++ b/lapack-netlib/SRC/dlalsd.c @@ -694,7 +694,7 @@ f"> */ /* > Osni Marques, LBNL/NERSC, USA \n */ /* ===================================================================== */ -/* Subroutine */ int dlalsd_(char *uplo, integer *smlsiz, integer *n, integer +/* Subroutine */ void dlalsd_(char *uplo, integer *smlsiz, integer *n, integer *nrhs, doublereal *d__, doublereal *e, doublereal *b, integer *ldb, doublereal *rcond, integer *rank, doublereal *work, integer *iwork, integer *info) @@ -707,53 +707,53 @@ f"> */ integer difl, difr; doublereal rcnd; integer perm, nsub; - extern /* Subroutine */ int drot_(integer *, 
doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer nlvl, sqre, bxst, c__, i__, j, k; doublereal r__; integer s, u; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer z__; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer poles, sizei, nsize, nwork, icmpq1, icmpq2; doublereal cs; extern doublereal dlamch_(char *); - extern /* Subroutine */ int dlasda_(integer *, integer *, integer *, + extern /* Subroutine */ void dlasda_(integer *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *); integer bx; - extern /* Subroutine */ int dlalsa_(integer *, integer *, integer *, + extern /* Subroutine */ void dlalsa_(integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *); doublereal sn; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); extern integer idamax_(integer *, doublereal *, integer *); integer st; - extern /* Subroutine */ int dlasdq_(char *, integer 
*, integer *, integer + extern /* Subroutine */ void dlasdq_(char *, integer *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer vt; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *), dlartg_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *), dlaset_(char *, integer *, integer *, doublereal *, - doublereal *, doublereal *, integer *), xerbla_(char *, - integer *, ftnlen); + doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer givcol; extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *); - extern /* Subroutine */ int dlasrt_(char *, integer *, doublereal *, + extern /* Subroutine */ void dlasrt_(char *, integer *, doublereal *, integer *); doublereal orgnrm; integer givnum, givptr, nm1, smlszp, st1; @@ -795,7 +795,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLALSD", &i__1, (ftnlen)6); - return 0; + return; } eps = dlamch_("Epsilon"); @@ -813,7 +813,7 @@ f"> */ /* Quick return if possible. */ if (*n == 0) { - return 0; + return; } else if (*n == 1) { if (d__[1] == 0.) { dlaset_("A", &c__1, nrhs, &c_b6, &c_b6, &b[b_offset], ldb); @@ -823,7 +823,7 @@ f"> */ b_offset], ldb, info); d__[1] = abs(d__[1]); } - return 0; + return; } /* Rotate the matrix if it is lower bidiagonal. */ @@ -866,7 +866,7 @@ f"> */ orgnrm = dlanst_("M", n, &d__[1], &e[1]); if (orgnrm == 0.) 
{ dlaset_("A", n, nrhs, &c_b6, &c_b6, &b[b_offset], ldb); - return 0; + return; } dlascl_("G", &c__0, &c__0, &orgnrm, &c_b11, n, &c__1, &d__[1], n, info); @@ -882,7 +882,7 @@ f"> */ dlasdq_("U", &c__0, n, n, &c__0, nrhs, &d__[1], &e[1], &work[1], n, & work[1], n, &b[b_offset], ldb, &work[nwork], info); if (*info != 0) { - return 0; + return; } tol = rcnd * (d__1 = d__[idamax_(n, &d__[1], &c__1)], abs(d__1)); i__1 = *n; @@ -908,7 +908,7 @@ f"> */ dlascl_("G", &c__0, &c__0, &orgnrm, &c_b11, n, nrhs, &b[b_offset], ldb, info); - return 0; + return; } /* Book-keeping and setting up some constants. */ @@ -1002,7 +1002,7 @@ f"> */ st], &work[vt + st1], n, &work[nwork], n, &b[st + b_dim1], ldb, &work[nwork], info); if (*info != 0) { - return 0; + return; } dlacpy_("A", &nsize, nrhs, &b[st + b_dim1], ldb, &work[bx + st1], n); @@ -1018,7 +1018,7 @@ f"> */ st1], &work[c__ + st1], &work[s + st1], &work[nwork], &iwork[iwk], info); if (*info != 0) { - return 0; + return; } bxst = bx + st1; dlalsa_(&icmpq2, smlsiz, &nsize, nrhs, &b[st + b_dim1], ldb, & @@ -1029,7 +1029,7 @@ f"> */ work[givnum + st1], &work[c__ + st1], &work[s + st1], &work[nwork], &iwork[iwk], info); if (*info != 0) { - return 0; + return; } } st = i__ + 1; @@ -1081,7 +1081,7 @@ f"> */ &work[c__ + st1], &work[s + st1], &work[nwork], &iwork[ iwk], info); if (*info != 0) { - return 0; + return; } } /* L80: */ @@ -1094,7 +1094,7 @@ f"> */ dlascl_("G", &c__0, &c__0, &orgnrm, &c_b11, n, nrhs, &b[b_offset], ldb, info); - return 0; + return; /* End of DLALSD */ diff --git a/lapack-netlib/SRC/dlamrg.c b/lapack-netlib/SRC/dlamrg.c index 98a63c822e..87eab4d7f6 100644 --- a/lapack-netlib/SRC/dlamrg.c +++ b/lapack-netlib/SRC/dlamrg.c @@ -609,7 +609,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dlamrg_(integer *n1, integer *n2, doublereal *a, integer +/* Subroutine */ void dlamrg_(integer *n1, integer *n2, doublereal *a, 
integer *dtrd1, integer *dtrd2, integer *index) { /* System generated locals */ @@ -682,7 +682,7 @@ f"> */ } } - return 0; + return; /* End of DLAMRG */ diff --git a/lapack-netlib/SRC/dlamswlq.c b/lapack-netlib/SRC/dlamswlq.c index 4db85c57c6..5d63419ece 100644 --- a/lapack-netlib/SRC/dlamswlq.c +++ b/lapack-netlib/SRC/dlamswlq.c @@ -713,7 +713,7 @@ static integer c__0 = 0; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlamswlq_(char *side, char *trans, integer *m, integer * +/* Subroutine */ void dlamswlq_(char *side, char *trans, integer *m, integer * n, integer *k, integer *mb, integer *nb, doublereal *a, integer *lda, doublereal *t, integer *ldt, doublereal *c__, integer *ldc, doublereal *work, integer *lwork, integer *info) @@ -731,7 +731,7 @@ static integer c__0 = 0; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical notran, lquery; integer ctr; - extern /* Subroutine */ int dgemlqt_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemlqt_(char *, char *, integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *), dtpmlqt_(char *, char *, integer *, integer *, integer *, integer *, integer *, doublereal *, integer *, @@ -799,10 +799,10 @@ static integer c__0 = 0; i__1 = -(*info); xerbla_("DLAMSWLQ", &i__1, (ftnlen)8); work[1] = (doublereal) lw; - return 0; + return; } else if (lquery) { work[1] = (doublereal) lw; - return 0; + return; } /* Quick return if possible */ @@ -810,7 +810,7 @@ static integer c__0 = 0; /* Computing MIN */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*k) == 0) { - return 0; + return; } /* Computing MAX */ @@ -818,7 +818,7 @@ static integer c__0 = 0; if (*nb <= *k || *nb >= f2cmax(i__1,*k)) { dgemlqt_(side, trans, m, n, k, mb, &a[a_offset], lda, &t[t_offset], ldt, &c__[c_offset], ldc, &work[1], info); - return 0; + return; } if (left && 
tran) { @@ -961,7 +961,7 @@ static integer c__0 = 0; } work[1] = (doublereal) lw; - return 0; + return; /* End of DLAMSWLQ */ diff --git a/lapack-netlib/SRC/dlamtsqr.c b/lapack-netlib/SRC/dlamtsqr.c index 99bb5748f5..ab6e329ae7 100644 --- a/lapack-netlib/SRC/dlamtsqr.c +++ b/lapack-netlib/SRC/dlamtsqr.c @@ -706,7 +706,7 @@ static integer c__0 = 0; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlamtsqr_(char *side, char *trans, integer *m, integer * +/* Subroutine */ void dlamtsqr_(char *side, char *trans, integer *m, integer * n, integer *k, integer *mb, integer *nb, doublereal *a, integer *lda, doublereal *t, integer *ldt, doublereal *c__, integer *ldc, doublereal *work, integer *lwork, integer *info) @@ -724,7 +724,7 @@ static integer c__0 = 0; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical notran, lquery; integer ctr; - extern /* Subroutine */ int dgemqrt_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemqrt_(char *, char *, integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *), dtpmqrt_(char *, char *, integer *, integer *, integer *, integer *, integer *, doublereal *, integer *, @@ -797,9 +797,9 @@ static integer c__0 = 0; if (*info != 0) { i__1 = -(*info); xerbla_("DLAMTSQR", &i__1, (ftnlen)8); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -807,7 +807,7 @@ static integer c__0 = 0; /* Computing MIN */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*k) == 0) { - return 0; + return; } /* Computing MAX */ @@ -815,7 +815,7 @@ static integer c__0 = 0; if (*mb <= *k || *mb >= f2cmax(i__1,*k)) { dgemqrt_(side, trans, m, n, k, nb, &a[a_offset], lda, &t[t_offset], ldt, &c__[c_offset], ldc, &work[1], info); - return 0; + return; } if (left && notran) { @@ -959,7 +959,7 @@ static integer c__0 = 0; } work[1] = 
(doublereal) lw; - return 0; + return; /* End of DLAMTSQR */ diff --git a/lapack-netlib/SRC/dlangb.c b/lapack-netlib/SRC/dlangb.c index 82673d7402..5503198fa7 100644 --- a/lapack-netlib/SRC/dlangb.c +++ b/lapack-netlib/SRC/dlangb.c @@ -646,12 +646,12 @@ doublereal dlangb_(char *norm, integer *n, integer *kl, integer *ku, /* Local variables */ doublereal temp; - extern /* Subroutine */ int dcombssq_(doublereal *, doublereal *); + extern /* Subroutine */ void dcombssq_(doublereal *, doublereal *); integer i__, j, k, l; extern logical lsame_(char *, char *); doublereal value; extern logical disnan_(doublereal *); - extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); doublereal colssq[2], sum, ssq[2]; diff --git a/lapack-netlib/SRC/dlange.c b/lapack-netlib/SRC/dlange.c index 115817e08a..70b743c64c 100644 --- a/lapack-netlib/SRC/dlange.c +++ b/lapack-netlib/SRC/dlange.c @@ -637,12 +637,12 @@ doublereal dlange_(char *norm, integer *m, integer *n, doublereal *a, integer /* Local variables */ doublereal temp; - extern /* Subroutine */ int dcombssq_(doublereal *, doublereal *); + extern /* Subroutine */ void dcombssq_(doublereal *, doublereal *); integer i__, j; extern logical lsame_(char *, char *); doublereal value; extern logical disnan_(doublereal *); - extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); doublereal colssq[2], sum, ssq[2]; diff --git a/lapack-netlib/SRC/dlangt.c b/lapack-netlib/SRC/dlangt.c index 95918e22ea..2dd791d906 100644 --- a/lapack-netlib/SRC/dlangt.c +++ b/lapack-netlib/SRC/dlangt.c @@ -634,7 +634,7 @@ doublereal dlangt_(char *norm, integer *n, doublereal *dl, doublereal *d__, extern logical lsame_(char *, char *); doublereal anorm; extern logical disnan_(doublereal *); - extern /* Subroutine */ int 
dlassq_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); doublereal sum; diff --git a/lapack-netlib/SRC/dlanhs.c b/lapack-netlib/SRC/dlanhs.c index 39a67b1f76..a1ddef031a 100644 --- a/lapack-netlib/SRC/dlanhs.c +++ b/lapack-netlib/SRC/dlanhs.c @@ -630,12 +630,12 @@ doublereal dlanhs_(char *norm, integer *n, doublereal *a, integer *lda, doublereal ret_val, d__1; /* Local variables */ - extern /* Subroutine */ int dcombssq_(doublereal *, doublereal *); + extern /* Subroutine */ void dcombssq_(doublereal *, doublereal *); integer i__, j; extern logical lsame_(char *, char *); doublereal value; extern logical disnan_(doublereal *); - extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); doublereal colssq[2], sum, ssq[2]; diff --git a/lapack-netlib/SRC/dlansb.c b/lapack-netlib/SRC/dlansb.c index 3e02800242..787679ef93 100644 --- a/lapack-netlib/SRC/dlansb.c +++ b/lapack-netlib/SRC/dlansb.c @@ -651,12 +651,12 @@ doublereal dlansb_(char *norm, char *uplo, integer *n, integer *k, doublereal /* Local variables */ doublereal absa; - extern /* Subroutine */ int dcombssq_(doublereal *, doublereal *); + extern /* Subroutine */ void dcombssq_(doublereal *, doublereal *); integer i__, j, l; extern logical lsame_(char *, char *); doublereal value; extern logical disnan_(doublereal *); - extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); doublereal colssq[2], sum, ssq[2]; diff --git a/lapack-netlib/SRC/dlansf.c b/lapack-netlib/SRC/dlansf.c index 6ddb002e18..7563b7be98 100644 --- a/lapack-netlib/SRC/dlansf.c +++ b/lapack-netlib/SRC/dlansf.c @@ -739,7 +739,7 @@ doublereal dlansf_(char *norm, char *transr, char *uplo, integer *n, integer n1; doublereal 
aa; extern logical disnan_(doublereal *); - extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); integer lda, ifm, noe, ilu; diff --git a/lapack-netlib/SRC/dlansp.c b/lapack-netlib/SRC/dlansp.c index 311166b64e..fcf0e02d2c 100644 --- a/lapack-netlib/SRC/dlansp.c +++ b/lapack-netlib/SRC/dlansp.c @@ -637,12 +637,12 @@ doublereal dlansp_(char *norm, char *uplo, integer *n, doublereal *ap, /* Local variables */ doublereal absa; - extern /* Subroutine */ int dcombssq_(doublereal *, doublereal *); + extern /* Subroutine */ void dcombssq_(doublereal *, doublereal *); integer i__, j, k; extern logical lsame_(char *, char *); doublereal value; extern logical disnan_(doublereal *); - extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); doublereal colssq[2], sum, ssq[2]; diff --git a/lapack-netlib/SRC/dlanst.c b/lapack-netlib/SRC/dlanst.c index 1d45e5e741..b31ca3fdca 100644 --- a/lapack-netlib/SRC/dlanst.c +++ b/lapack-netlib/SRC/dlanst.c @@ -626,7 +626,7 @@ doublereal dlanst_(char *norm, integer *n, doublereal *d__, doublereal *e) extern logical lsame_(char *, char *); doublereal anorm; extern logical disnan_(doublereal *); - extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); doublereal sum; diff --git a/lapack-netlib/SRC/dlansy.c b/lapack-netlib/SRC/dlansy.c index 42df671289..d001d86206 100644 --- a/lapack-netlib/SRC/dlansy.c +++ b/lapack-netlib/SRC/dlansy.c @@ -645,12 +645,12 @@ doublereal dlansy_(char *norm, char *uplo, integer *n, doublereal *a, integer /* Local variables */ doublereal absa; - extern /* Subroutine */ int dcombssq_(doublereal *, doublereal *); + extern /* Subroutine */ void 
dcombssq_(doublereal *, doublereal *); integer i__, j; extern logical lsame_(char *, char *); doublereal value; extern logical disnan_(doublereal *); - extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); doublereal colssq[2], sum, ssq[2]; diff --git a/lapack-netlib/SRC/dlantb.c b/lapack-netlib/SRC/dlantb.c index fddff35de1..9c7c50f906 100644 --- a/lapack-netlib/SRC/dlantb.c +++ b/lapack-netlib/SRC/dlantb.c @@ -661,13 +661,13 @@ doublereal dlantb_(char *norm, char *uplo, char *diag, integer *n, integer *k, doublereal ret_val, d__1; /* Local variables */ - extern /* Subroutine */ int dcombssq_(doublereal *, doublereal *); + extern /* Subroutine */ void dcombssq_(doublereal *, doublereal *); integer i__, j, l; logical udiag; extern logical lsame_(char *, char *); doublereal value; extern logical disnan_(doublereal *); - extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); doublereal colssq[2], sum, ssq[2]; diff --git a/lapack-netlib/SRC/dlantp.c b/lapack-netlib/SRC/dlantp.c index d920196346..1733bd85ff 100644 --- a/lapack-netlib/SRC/dlantp.c +++ b/lapack-netlib/SRC/dlantp.c @@ -646,13 +646,13 @@ doublereal dlantp_(char *norm, char *uplo, char *diag, integer *n, doublereal doublereal ret_val, d__1; /* Local variables */ - extern /* Subroutine */ int dcombssq_(doublereal *, doublereal *); + extern /* Subroutine */ void dcombssq_(doublereal *, doublereal *); integer i__, j, k; logical udiag; extern logical lsame_(char *, char *); doublereal value; extern logical disnan_(doublereal *); - extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); doublereal colssq[2], sum, ssq[2]; diff --git 
a/lapack-netlib/SRC/dlantr.c b/lapack-netlib/SRC/dlantr.c index 75e790cc55..20a40dc57d 100644 --- a/lapack-netlib/SRC/dlantr.c +++ b/lapack-netlib/SRC/dlantr.c @@ -662,13 +662,13 @@ doublereal dlantr_(char *norm, char *uplo, char *diag, integer *m, integer *n, doublereal ret_val, d__1; /* Local variables */ - extern /* Subroutine */ int dcombssq_(doublereal *, doublereal *); + extern /* Subroutine */ void dcombssq_(doublereal *, doublereal *); integer i__, j; logical udiag; extern logical lsame_(char *, char *); doublereal value; extern logical disnan_(doublereal *); - extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); doublereal colssq[2], sum, ssq[2]; diff --git a/lapack-netlib/SRC/dlanv2.c b/lapack-netlib/SRC/dlanv2.c index 8074d0b939..b0254077c1 100644 --- a/lapack-netlib/SRC/dlanv2.c +++ b/lapack-netlib/SRC/dlanv2.c @@ -643,7 +643,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlanv2_(doublereal *a, doublereal *b, doublereal *c__, +/* Subroutine */ void dlanv2_(doublereal *a, doublereal *b, doublereal *c__, doublereal *d__, doublereal *rt1r, doublereal *rt1i, doublereal *rt2r, doublereal *rt2i, doublereal *cs, doublereal *sn) { @@ -826,7 +826,7 @@ f"> */ *rt1i = sqrt((abs(*b))) * sqrt((abs(*c__))); *rt2i = -(*rt1i); } - return 0; + return; /* End of DLANV2 */ diff --git a/lapack-netlib/SRC/dlaorhr_col_getrfnp.c b/lapack-netlib/SRC/dlaorhr_col_getrfnp.c index efc394f2ab..e716a0efae 100644 --- a/lapack-netlib/SRC/dlaorhr_col_getrfnp.c +++ b/lapack-netlib/SRC/dlaorhr_col_getrfnp.c @@ -662,21 +662,21 @@ _col_getrfnp.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dlaorhr_col_getrfnp_(integer *m, integer *n, doublereal +/* Subroutine */ void dlaorhr_col_getrfnp_(integer *m, 
integer *n, doublereal *a, integer *lda, doublereal *d__, integer *info) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; /* Local variables */ - extern /* Subroutine */ int dlaorhr_col_getrfnp2_(integer *, integer *, + extern /* Subroutine */ void dlaorhr_col_getrfnp2_(integer *, integer *, doublereal *, integer *, doublereal *, integer *); integer j; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer iinfo; - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer jb, nb; @@ -714,13 +714,13 @@ _col_getrfnp.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLAORHR_COL_GETRFNP", &i__1, (ftnlen)19); - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* Determine the block size for this environment. 
*/ @@ -771,7 +771,7 @@ _col_getrfnp.f"> */ } } } - return 0; + return; /* End of DLAORHR_COL_GETRFNP */ diff --git a/lapack-netlib/SRC/dlaorhr_col_getrfnp2.c b/lapack-netlib/SRC/dlaorhr_col_getrfnp2.c index bf0237f4dc..fe828f6d52 100644 --- a/lapack-netlib/SRC/dlaorhr_col_getrfnp2.c +++ b/lapack-netlib/SRC/dlaorhr_col_getrfnp2.c @@ -682,7 +682,7 @@ _col_getrfnp2.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dlaorhr_col_getrfnp2_(integer *m, integer *n, +/* Subroutine */ void dlaorhr_col_getrfnp2_(integer *m, integer *n, doublereal *a, integer *lda, doublereal *d__, integer *info) { /* System generated locals */ @@ -691,13 +691,13 @@ _col_getrfnp2.f"> */ /* Local variables */ integer i__; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dgemm_(char *, char *, integer *, integer *, integer * , doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer iinfo; doublereal sfmin; - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer n1, n2; @@ -734,13 +734,13 @@ _col_getrfnp2.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLAORHR_COL_GETRFNP2", &i__1, (ftnlen)20); - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } if (*m == 1) { @@ -824,7 +824,7 @@ _col_getrfnp2.f"> */ lda, &d__[n1 + 1], &iinfo); } - return 0; + return; /* End of DLAORHR_COL_GETRFNP2 */ diff --git a/lapack-netlib/SRC/dlapll.c b/lapack-netlib/SRC/dlapll.c index 1b8c46ab00..e3d2c80b20 100644 --- a/lapack-netlib/SRC/dlapll.c +++ b/lapack-netlib/SRC/dlapll.c @@ -611,7 +611,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* 
===================================================================== */ -/* Subroutine */ int dlapll_(integer *n, doublereal *x, integer *incx, +/* Subroutine */ void dlapll_(integer *n, doublereal *x, integer *incx, doublereal *y, integer *incy, doublereal *ssmin) { /* System generated locals */ @@ -620,13 +620,13 @@ f"> */ /* Local variables */ extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *, integer *); - extern /* Subroutine */ int dlas2_(doublereal *, doublereal *, doublereal + extern /* Subroutine */ void dlas2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal c__; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); doublereal ssmax, a11, a12, a22; - extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlarfg_(integer *, doublereal *, doublereal *, integer *, doublereal *); doublereal tau; @@ -649,7 +649,7 @@ f"> */ /* Function Body */ if (*n <= 1) { *ssmin = 0.; - return 0; + return; } /* Compute the QR factorization of the N-by-2 matrix ( X Y ) */ @@ -671,7 +671,7 @@ f"> */ dlas2_(&a11, &a12, &a22, ssmin, &ssmax); - return 0; + return; /* End of DLAPLL */ diff --git a/lapack-netlib/SRC/dlapmr.c b/lapack-netlib/SRC/dlapmr.c index 04368041a2..c4c4e169bb 100644 --- a/lapack-netlib/SRC/dlapmr.c +++ b/lapack-netlib/SRC/dlapmr.c @@ -613,7 +613,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlapmr_(logical *forwrd, integer *m, integer *n, +/* Subroutine */ void dlapmr_(logical *forwrd, integer *m, integer *n, doublereal *x, integer *ldx, integer *k) { /* System generated locals */ @@ -641,7 +641,7 @@ f"> */ /* Function Body */ if (*m <= 1) { - return 0; + return; } i__1 = *m; @@ -727,7 +727,7 @@ f"> */ } - return 0; + 
return; /* End of ZLAPMT */ diff --git a/lapack-netlib/SRC/dlapmt.c b/lapack-netlib/SRC/dlapmt.c index 0acbad2b32..6c01575d46 100644 --- a/lapack-netlib/SRC/dlapmt.c +++ b/lapack-netlib/SRC/dlapmt.c @@ -613,7 +613,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlapmt_(logical *forwrd, integer *m, integer *n, +/* Subroutine */ void dlapmt_(logical *forwrd, integer *m, integer *n, doublereal *x, integer *ldx, integer *k) { /* System generated locals */ @@ -641,7 +641,7 @@ f"> */ /* Function Body */ if (*n <= 1) { - return 0; + return; } i__1 = *n; @@ -727,7 +727,7 @@ f"> */ } - return 0; + return; /* End of DLAPMT */ diff --git a/lapack-netlib/SRC/dlaqgb.c b/lapack-netlib/SRC/dlaqgb.c index 603a4414fc..6814ef7506 100644 --- a/lapack-netlib/SRC/dlaqgb.c +++ b/lapack-netlib/SRC/dlaqgb.c @@ -668,7 +668,7 @@ f"> */ /* > \ingroup doubleGBauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlaqgb_(integer *m, integer *n, integer *kl, integer *ku, +/* Subroutine */ void dlaqgb_(integer *m, integer *n, integer *kl, integer *ku, doublereal *ab, integer *ldab, doublereal *r__, doublereal *c__, doublereal *rowcnd, doublereal *colcnd, doublereal *amax, char *equed) { @@ -702,7 +702,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. 
*/ @@ -781,7 +781,7 @@ f"> */ *(unsigned char *)equed = 'B'; } - return 0; + return; /* End of DLAQGB */ diff --git a/lapack-netlib/SRC/dlaqge.c b/lapack-netlib/SRC/dlaqge.c index ef6dd26b42..2191d75191 100644 --- a/lapack-netlib/SRC/dlaqge.c +++ b/lapack-netlib/SRC/dlaqge.c @@ -651,7 +651,7 @@ f"> */ /* > \ingroup doubleGEauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlaqge_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dlaqge_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *r__, doublereal *c__, doublereal *rowcnd, doublereal *colcnd, doublereal *amax, char *equed) { @@ -685,7 +685,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. */ @@ -749,7 +749,7 @@ f"> */ *(unsigned char *)equed = 'B'; } - return 0; + return; /* End of DLAQGE */ diff --git a/lapack-netlib/SRC/dlaqp2.c b/lapack-netlib/SRC/dlaqp2.c index a95bc84cb8..00fbc59373 100644 --- a/lapack-netlib/SRC/dlaqp2.c +++ b/lapack-netlib/SRC/dlaqp2.c @@ -661,7 +661,7 @@ f"> */ /* > \endhtmlonly */ /* ===================================================================== */ -/* Subroutine */ int dlaqp2_(integer *m, integer *n, integer *offset, +/* Subroutine */ void dlaqp2_(integer *m, integer *n, integer *offset, doublereal *a, integer *lda, integer *jpvt, doublereal *tau, doublereal *vn1, doublereal *vn2, doublereal *work) { @@ -675,15 +675,15 @@ f"> */ doublereal temp2; integer i__, j; doublereal tol3z; - extern /* Subroutine */ int dlarf_(char *, integer *, integer *, + extern /* Subroutine */ void dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *); integer offpi, itemp; - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); extern 
doublereal dlamch_(char *); integer mn; - extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlarfg_(integer *, doublereal *, doublereal *, integer *, doublereal *); extern integer idamax_(integer *, doublereal *, integer *); doublereal aii; @@ -797,7 +797,7 @@ f"> */ /* L20: */ } - return 0; + return; /* End of DLAQP2 */ diff --git a/lapack-netlib/SRC/dlaqps.c b/lapack-netlib/SRC/dlaqps.c index 729df0e289..f8944618bd 100644 --- a/lapack-netlib/SRC/dlaqps.c +++ b/lapack-netlib/SRC/dlaqps.c @@ -693,7 +693,7 @@ f"> */ /* > \endhtmlonly */ /* ===================================================================== */ -/* Subroutine */ int dlaqps_(integer *m, integer *n, integer *offset, integer +/* Subroutine */ void dlaqps_(integer *m, integer *n, integer *offset, integer *nb, integer *kb, doublereal *a, integer *lda, integer *jpvt, doublereal *tau, doublereal *vn1, doublereal *vn2, doublereal *auxv, doublereal *f, integer *ldf) @@ -708,18 +708,18 @@ f"> */ doublereal temp2; integer j, k; doublereal tol3z; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer itemp; - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); extern doublereal dlamch_(char *); integer rk; - extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlarfg_(integer *, doublereal *, doublereal *, integer *, doublereal *); extern integer idamax_(integer *, doublereal *, integer *); integer lsticc, lastrk; @@ -917,7 +917,7 @@ 
f"> */ goto L40; } - return 0; + return; /* End of DLAQPS */ diff --git a/lapack-netlib/SRC/dlaqr0.c b/lapack-netlib/SRC/dlaqr0.c index abbf5410cd..5b3ae77794 100644 --- a/lapack-netlib/SRC/dlaqr0.c +++ b/lapack-netlib/SRC/dlaqr0.c @@ -777,7 +777,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlaqr0_(logical *wantt, logical *wantz, integer *n, +/* Subroutine */ void dlaqr0_(logical *wantt, logical *wantz, integer *n, integer *ilo, integer *ihi, doublereal *h__, integer *ldh, doublereal *wr, doublereal *wi, integer *iloz, integer *ihiz, doublereal *z__, integer *ldz, doublereal *work, integer *lwork, integer *info) @@ -792,7 +792,7 @@ f"> */ integer ktop; doublereal zdum[1] /* was [1][1] */; integer kacc22, i__, k, itmax, nsmax, nwmax, kwtop; - extern /* Subroutine */ int dlanv2_(doublereal *, doublereal *, + extern /* Subroutine */ void dlanv2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *), dlaqr3_( logical *, logical *, integer *, integer *, integer *, integer *, @@ -817,7 +817,7 @@ f"> */ integer ku, kv, ls, ns; doublereal ss; integer nw; - extern /* Subroutine */ int dlahqr_(logical *, logical *, integer *, + extern /* Subroutine */ void dlahqr_(logical *, logical *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal *, @@ -871,7 +871,7 @@ f"> */ if (*n == 0) { work[1] = 1.; - return 0; + return; } if (*n <= 15) { @@ -953,7 +953,7 @@ f"> */ if (*lwork == -1) { work[1] = (doublereal) lwkopt; - return 0; + return; } /* ==== DLAHQR/DLAQR0 crossover point ==== */ @@ -1340,6 +1340,6 @@ f"> */ /* ==== End of DLAQR0 ==== */ - return 0; + return; } /* dlaqr0_ */ diff --git a/lapack-netlib/SRC/dlaqr1.c b/lapack-netlib/SRC/dlaqr1.c index 
9677836fb3..b878f8c31a 100644 --- a/lapack-netlib/SRC/dlaqr1.c +++ b/lapack-netlib/SRC/dlaqr1.c @@ -631,7 +631,7 @@ f"> */ /* > University of Kansas, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlaqr1_(integer *n, doublereal *h__, integer *ldh, +/* Subroutine */ void dlaqr1_(integer *n, doublereal *h__, integer *ldh, doublereal *sr1, doublereal *si1, doublereal *sr2, doublereal *si2, doublereal *v) { @@ -662,7 +662,7 @@ f"> */ /* Function Body */ if (*n != 2 && *n != 3) { - return 0; + return; } if (*n == 2) { @@ -698,6 +698,6 @@ f"> */ sr2) + h21s * h__[(h_dim1 << 1) + 3]; } } - return 0; + return; } /* dlaqr1_ */ diff --git a/lapack-netlib/SRC/dlaqr2.c b/lapack-netlib/SRC/dlaqr2.c index 02aaf62e43..783a1e5d3e 100644 --- a/lapack-netlib/SRC/dlaqr2.c +++ b/lapack-netlib/SRC/dlaqr2.c @@ -795,7 +795,7 @@ f"> */ /* > University of Kansas, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlaqr2_(logical *wantt, logical *wantz, integer *n, +/* Subroutine */ void dlaqr2_(logical *wantt, logical *wantz, integer *n, integer *ktop, integer *kbot, integer *nw, doublereal *h__, integer * ldh, integer *iloz, integer *ihiz, doublereal *z__, integer *ldz, integer *ns, integer *nd, doublereal *sr, doublereal *si, doublereal * @@ -811,35 +811,35 @@ f"> */ doublereal beta; integer kend, kcol, info, ifst, ilst, ltop, krow, i__, j, k; doublereal s; - extern /* Subroutine */ int dlarf_(char *, integer *, integer *, + extern /* Subroutine */ void dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *), dgemm_(char *, char *, integer *, integer * , integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); logical bulge; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, 
doublereal *, integer *, doublereal *, integer *); integer infqr, kwtop; - extern /* Subroutine */ int dlanv2_(doublereal *, doublereal *, + extern /* Subroutine */ void dlanv2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal aa, bb, cc; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); doublereal dd, cs; extern doublereal dlamch_(char *); - extern /* Subroutine */ int dgehrd_(integer *, integer *, integer *, + extern /* Subroutine */ void dgehrd_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dlarfg_(integer *, doublereal *, doublereal *, integer *, doublereal *); doublereal sn; integer jw; - extern /* Subroutine */ int dlahqr_(logical *, logical *, integer *, + extern /* Subroutine */ void dlahqr_(logical *, logical *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *); doublereal safmin, safmax; - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *), dtrexc_(char *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, doublereal *, integer *), @@ -916,7 +916,7 @@ f"> */ if (*lwork == -1) { work[1] = (doublereal) lwkopt; - return 0; + return; } /* ==== Nothing to do ... */ @@ -925,11 +925,11 @@ f"> */ *nd = 0; work[1] = 1.; if (*ktop > *kbot) { - return 0; + return; } /* ... nor for an empty deflation window. 
==== */ if (*nw < 1) { - return 0; + return; } /* ==== Machine constants ==== */ @@ -971,7 +971,7 @@ f"> */ } } work[1] = 1.; - return 0; + return; } /* ==== Convert to spike-triangular form. (In case of a */ @@ -1312,6 +1312,6 @@ f"> */ /* ==== End of DLAQR2 ==== */ - return 0; + return; } /* dlaqr2_ */ diff --git a/lapack-netlib/SRC/dlaqr3.c b/lapack-netlib/SRC/dlaqr3.c index e1504a6edd..f88c8cb2c2 100644 --- a/lapack-netlib/SRC/dlaqr3.c +++ b/lapack-netlib/SRC/dlaqr3.c @@ -793,7 +793,7 @@ f"> */ /* > University of Kansas, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlaqr3_(logical *wantt, logical *wantz, integer *n, +/* Subroutine */ void dlaqr3_(logical *wantt, logical *wantz, integer *n, integer *ktop, integer *kbot, integer *nw, doublereal *h__, integer * ldh, integer *iloz, integer *ihiz, doublereal *z__, integer *ldz, integer *ns, integer *nd, doublereal *sr, doublereal *si, doublereal * @@ -809,32 +809,32 @@ f"> */ doublereal beta; integer kend, kcol, info, nmin, ifst, ilst, ltop, krow, i__, j, k; doublereal s; - extern /* Subroutine */ int dlarf_(char *, integer *, integer *, + extern /* Subroutine */ void dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *), dgemm_(char *, char *, integer *, integer * , integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); logical bulge; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer infqr, kwtop; - extern /* Subroutine */ int dlanv2_(doublereal *, doublereal *, + extern /* Subroutine */ void dlanv2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *), dlaqr4_( logical *, logical *, integer *, integer *, integer *, 
doublereal *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); doublereal aa, bb, cc; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); doublereal dd, cs; extern doublereal dlamch_(char *); - extern /* Subroutine */ int dgehrd_(integer *, integer *, integer *, + extern /* Subroutine */ void dgehrd_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dlarfg_(integer *, doublereal *, doublereal *, integer *, doublereal *); doublereal sn; integer jw; - extern /* Subroutine */ int dlahqr_(logical *, logical *, integer *, + extern /* Subroutine */ void dlahqr_(logical *, logical *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal *, @@ -842,7 +842,7 @@ f"> */ doublereal safmin, safmax; extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *), dtrexc_(char *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, doublereal *, integer *), @@ -928,7 +928,7 @@ f"> */ if (*lwork == -1) { work[1] = (doublereal) lwkopt; - return 0; + return; } /* ==== Nothing to do ... */ @@ -937,11 +937,11 @@ f"> */ *nd = 0; work[1] = 1.; if (*ktop > *kbot) { - return 0; + return; } /* ... nor for an empty deflation window. ==== */ if (*nw < 1) { - return 0; + return; } /* ==== Machine constants ==== */ @@ -983,7 +983,7 @@ f"> */ } } work[1] = 1.; - return 0; + return; } /* ==== Convert to spike-triangular form. 
(In case of a */ @@ -1332,6 +1332,6 @@ f"> */ /* ==== End of DLAQR3 ==== */ - return 0; + return; } /* dlaqr3_ */ diff --git a/lapack-netlib/SRC/dlaqr4.c b/lapack-netlib/SRC/dlaqr4.c index 4e167011d9..f267f5aaef 100644 --- a/lapack-netlib/SRC/dlaqr4.c +++ b/lapack-netlib/SRC/dlaqr4.c @@ -784,7 +784,7 @@ f"> */ /* > of Matrix Analysis, volume 23, pages 948--973, 2002. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlaqr4_(logical *wantt, logical *wantz, integer *n, +/* Subroutine */ void dlaqr4_(logical *wantt, logical *wantz, integer *n, integer *ilo, integer *ihi, doublereal *h__, integer *ldh, doublereal *wr, doublereal *wi, integer *iloz, integer *ihiz, doublereal *z__, integer *ldz, doublereal *work, integer *lwork, integer *info) @@ -799,7 +799,7 @@ f"> */ integer ktop; doublereal zdum[1] /* was [1][1] */; integer kacc22, i__, k, itmax, nsmax, nwmax, kwtop; - extern /* Subroutine */ int dlaqr2_(logical *, logical *, integer *, + extern /* Subroutine */ void dlaqr2_(logical *, logical *, integer *, integer *, integer *, integer *, doublereal *, integer *, integer *, integer *, doublereal *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *, @@ -820,7 +820,7 @@ f"> */ integer ku, kv, ls, ns; doublereal ss; integer nw; - extern /* Subroutine */ int dlahqr_(logical *, logical *, integer *, + extern /* Subroutine */ void dlahqr_(logical *, logical *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal *, @@ -873,7 +873,7 @@ f"> */ if (*n == 0) { work[1] = 1.; - return 0; + return; } if (*n <= 15) { @@ -955,7 +955,7 @@ f"> */ if (*lwork == -1) { work[1] = (doublereal) lwkopt; - return 0; + return; } /* ==== DLAHQR/DLAQR0 crossover point ==== */ @@ -1335,6 +1335,6 @@ f"> */ /* ==== End of DLAQR4 ==== */ - return 0; + 
return; } /* dlaqr4_ */ diff --git a/lapack-netlib/SRC/dlaqr5.c b/lapack-netlib/SRC/dlaqr5.c index 5f84da4867..939c4378e4 100644 --- a/lapack-netlib/SRC/dlaqr5.c +++ b/lapack-netlib/SRC/dlaqr5.c @@ -781,7 +781,7 @@ f"> */ /* > ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlaqr5_(logical *wantt, logical *wantz, integer *kacc22, +/* Subroutine */ void dlaqr5_(logical *wantt, logical *wantz, integer *kacc22, integer *n, integer *ktop, integer *kbot, integer *nshfts, doublereal *sr, doublereal *si, doublereal *h__, integer *ldh, integer *iloz, integer *ihiz, doublereal *z__, integer *ldz, doublereal *v, integer * @@ -802,24 +802,24 @@ f"> */ integer jtop, jrow, mtop, i__, j, k, m; doublereal alpha; logical accum; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer ndcol, incol, krcol, nbmps, i2, k1, i4; - extern /* Subroutine */ int dlaqr1_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlaqr1_(integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *), dlabad_(doublereal *, doublereal *); doublereal h11, h12, h21, h22; integer m22; extern doublereal dlamch_(char *); - extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlarfg_(integer *, doublereal *, doublereal *, integer *, doublereal *); integer ns, nu; doublereal vt[3]; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *); doublereal safmin, safmax; - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, + extern 
/* Subroutine */ void dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *); doublereal refsum, smlnum, scl; integer kdu, kms; @@ -862,14 +862,14 @@ f"> */ /* Function Body */ if (*nshfts < 2) { - return 0; + return; } /* ==== If the active block is empty or 1-by-1, then there */ /* . is nothing to do. ==== */ if (*ktop >= *kbot) { - return 0; + return; } /* ==== Shuffle shifts into pairs of real shifts and pairs */ @@ -1527,6 +1527,6 @@ f"> */ /* ==== End of DLAQR5 ==== */ - return 0; + return; } /* dlaqr5_ */ diff --git a/lapack-netlib/SRC/dlaqr5.f b/lapack-netlib/SRC/dlaqr5.f index 0c63ab8000..cc94b12223 100644 --- a/lapack-netlib/SRC/dlaqr5.f +++ b/lapack-netlib/SRC/dlaqr5.f @@ -286,8 +286,8 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, * .. * .. Local Scalars .. DOUBLE PRECISION ALPHA, BETA, H11, H12, H21, H22, REFSUM, - $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, - $ ULP + $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, T1, T2, + $ T3, TST1, TST2, ULP INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, $ M, M22, MBOT, MTOP, NBMPS, NDCOL, @@ -447,11 +447,12 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, * ==== Perform update from right within * . computational window. 
==== * + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 30 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 30 CONTINUE * * ==== Perform update from left within @@ -464,11 +465,12 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, ELSE JBOT = KBOT END IF + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 40 J = K+1, JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + REFSUM = H( K+1, J ) + V( 2, M22 )*H( K+2, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 40 CONTINUE * * ==== The following convergence test requires that @@ -522,18 +524,20 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, * IF( ACCUM ) THEN KMS = K - INCOL + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 50 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + REFSUM = U( J, KMS+1 ) + V( 2, M22 )*U( J, KMS+2 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 50 CONTINUE ELSE IF( WANTZ ) THEN + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 60 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + REFSUM = Z( J, K+1 )+V( 2, M22 )*Z( J, K+2 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 60 CONTINUE END IF END IF @@ -554,10 +558,13 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, * . Mth bulge. Exploit fact that first two elements * . 
of row are actually zero. ==== * - REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) - H( K+3, K ) = -REFSUM - H( K+3, K+1 ) = -REFSUM*V( 2, M ) - H( K+3, K+2 ) = H( K+3, K+2 ) - REFSUM*V( 3, M ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) + REFSUM = V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM*T1 + H( K+3, K+1 ) = -REFSUM*T2 + H( K+3, K+2 ) = H( K+3, K+2 ) - REFSUM*T3 * * ==== Calculate reflection to move * . Mth bulge one step. ==== @@ -593,11 +600,13 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, $ VT ) ALPHA = VT( 1 ) CALL DLARFG( 3, ALPHA, VT( 2 ), 1, VT( 1 ) ) - REFSUM = VT( 1 )*( H( K+1, K )+VT( 2 )* - $ H( K+2, K ) ) + T1 = VT( 1 ) + T2 = T1*VT( 2 ) + T3 = T1*VT( 3 ) + REFSUM = H( K+1, K ) + VT( 2 )*H( K+2, K ) * - IF( ABS( H( K+2, K )-REFSUM*VT( 2 ) )+ - $ ABS( REFSUM*VT( 3 ) ).GT.ULP* + IF( ABS( H( K+2, K )-REFSUM*T2 )+ + $ ABS( REFSUM*T3 ).GT.ULP* $ ( ABS( H( K, K ) )+ABS( H( K+1, $ K+1 ) )+ABS( H( K+2, K+2 ) ) ) ) THEN * @@ -615,7 +624,7 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, * . Replace the old reflector with * . the new one. ==== * - H( K+1, K ) = H( K+1, K ) - REFSUM + H( K+1, K ) = H( K+1, K ) - REFSUM*T1 H( K+2, K ) = ZERO H( K+3, K ) = ZERO V( 1, M ) = VT( 1 ) @@ -631,22 +640,25 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, * . deflation check. We still delay most of the * . updates from the left for efficiency. 
==== * + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 70 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 ) + $ + V( 3, M )*H( J, K+3 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 + H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3 70 CONTINUE * * ==== Perform update from left for subsequent * . column. ==== * - REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* - $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) - H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM - H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) - H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, K+1 ) + V( 2, M )*H( K+2, K+1 ) + $ + V( 3, M )*H( K+3, K+1 ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1 + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2 + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3 * * ==== The following convergence test requires that * . 
the tradition small-compared-to-nearby-diagonals @@ -706,12 +718,15 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, * DO 100 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, J ) + V( 2, M )*H( K+2, J ) + $ + V( 3, M )*H( K+3, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 + H( K+3, J ) = H( K+3, J ) - REFSUM*T3 90 CONTINUE 100 CONTINUE * @@ -729,12 +744,15 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, I2 = MAX( 1, KTOP-INCOL ) I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 110 J = I2, I4 - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 ) + $ + V( 3, M )*U( J, KMS+3 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3 110 CONTINUE 120 CONTINUE ELSE IF( WANTZ ) THEN @@ -745,12 +763,15 @@ SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, * DO 140 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 130 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 ) + $ + V( 3, M )*Z( J, K+3 ) + 
Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3 130 CONTINUE 140 CONTINUE END IF diff --git a/lapack-netlib/SRC/dlaqsb.c b/lapack-netlib/SRC/dlaqsb.c index ee4c226866..55eb4420e4 100644 --- a/lapack-netlib/SRC/dlaqsb.c +++ b/lapack-netlib/SRC/dlaqsb.c @@ -649,7 +649,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlaqsb_(char *uplo, integer *n, integer *kd, doublereal * +/* Subroutine */ void dlaqsb_(char *uplo, integer *n, integer *kd, doublereal * ab, integer *ldab, doublereal *s, doublereal *scond, doublereal *amax, char *equed) { @@ -684,7 +684,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. */ @@ -739,7 +739,7 @@ f"> */ *(unsigned char *)equed = 'Y'; } - return 0; + return; /* End of DLAQSB */ diff --git a/lapack-netlib/SRC/dlaqsp.c b/lapack-netlib/SRC/dlaqsp.c index a810ac24f0..ad7861aa36 100644 --- a/lapack-netlib/SRC/dlaqsp.c +++ b/lapack-netlib/SRC/dlaqsp.c @@ -635,7 +635,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlaqsp_(char *uplo, integer *n, doublereal *ap, +/* Subroutine */ void dlaqsp_(char *uplo, integer *n, doublereal *ap, doublereal *s, doublereal *scond, doublereal *amax, char *equed) { /* System generated locals */ @@ -669,7 +669,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. 
*/ @@ -722,7 +722,7 @@ f"> */ *(unsigned char *)equed = 'Y'; } - return 0; + return; /* End of DLAQSP */ diff --git a/lapack-netlib/SRC/dlaqsy.c b/lapack-netlib/SRC/dlaqsy.c index 5e11b257fc..8c144b04fa 100644 --- a/lapack-netlib/SRC/dlaqsy.c +++ b/lapack-netlib/SRC/dlaqsy.c @@ -642,7 +642,7 @@ f"> */ /* > \ingroup doubleSYauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlaqsy_(char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ void dlaqsy_(char *uplo, integer *n, doublereal *a, integer * lda, doublereal *s, doublereal *scond, doublereal *amax, char *equed) { /* System generated locals */ @@ -676,7 +676,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. */ @@ -725,7 +725,7 @@ f"> */ *(unsigned char *)equed = 'Y'; } - return 0; + return; /* End of DLAQSY */ diff --git a/lapack-netlib/SRC/dlaqtr.c b/lapack-netlib/SRC/dlaqtr.c index debdd0af7e..1a91df0921 100644 --- a/lapack-netlib/SRC/dlaqtr.c +++ b/lapack-netlib/SRC/dlaqtr.c @@ -683,7 +683,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlaqtr_(logical *ltran, logical *lreal, integer *n, +/* Subroutine */ void dlaqtr_(logical *ltran, logical *lreal, integer *n, doublereal *t, integer *ldt, doublereal *b, doublereal *w, doublereal *scale, doublereal *x, doublereal *work, integer *info) { @@ -698,16 +698,16 @@ f"> */ doublereal smin, xmax, d__[4] /* was [2][2] */; integer i__, j, k; doublereal v[4] /* was [2][2] */, z__; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern doublereal dasum_(integer *, doublereal *, integer *); - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void 
daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer jnext, j1, j2; doublereal sminw; integer n1, n2; doublereal xnorm; - extern /* Subroutine */ int dlaln2_(logical *, integer *, integer *, + extern /* Subroutine */ void dlaln2_(logical *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal * , doublereal *, integer *, doublereal *, doublereal *, integer *); @@ -716,7 +716,7 @@ f"> */ doublereal si, xj; extern integer idamax_(integer *, doublereal *, integer *); doublereal scaloc, sr; - extern /* Subroutine */ int dladiv_(doublereal *, doublereal *, + extern /* Subroutine */ void dladiv_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal bignum; logical notran; @@ -749,7 +749,7 @@ f"> */ /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Set constants to control overflow */ @@ -1388,7 +1388,7 @@ f"> */ } - return 0; + return; /* End of DLAQTR */ diff --git a/lapack-netlib/SRC/dlaqz0.f b/lapack-netlib/SRC/dlaqz0.f index 1bf65fd601..c4cb95fd32 100644 --- a/lapack-netlib/SRC/dlaqz0.f +++ b/lapack-netlib/SRC/dlaqz0.f @@ -322,7 +322,7 @@ RECURSIVE SUBROUTINE DLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A, * Local scalars DOUBLE PRECISION :: SMLNUM, ULP, ESHIFT, SAFMIN, SAFMAX, C1, S1, - $ TEMP, SWAP + $ TEMP, SWAP, BNORM, BTOL INTEGER :: ISTART, ISTOP, IITER, MAXIT, ISTART2, K, LD, NSHIFTS, $ NBLOCK, NW, NMIN, NIBBLE, N_UNDEFLATED, N_DEFLATED, $ NS, SWEEP_INFO, SHIFTPOS, LWORKREQ, K2, ISTARTM, @@ -334,7 +334,7 @@ RECURSIVE SUBROUTINE DLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A, * External Functions EXTERNAL :: XERBLA, DHGEQZ, DLASET, DLAQZ3, DLAQZ4, DLABAD, $ DLARTG, DROT - DOUBLE PRECISION, EXTERNAL :: DLAMCH + DOUBLE PRECISION, EXTERNAL :: DLAMCH, DLANHS LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -486,6 +486,9 @@ RECURSIVE SUBROUTINE DLAQZ0( WANTS, 
WANTQ, WANTZ, N, ILO, IHI, A, ULP = DLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( DBLE( N )/ULP ) + BNORM = DLANHS( 'F', IHI-ILO+1, B( ILO, ILO ), LDB, WORK ) + BTOL = MAX( SAFMIN, ULP*BNORM ) + ISTART = ILO ISTOP = IHI MAXIT = 3*( IHI-ILO+1 ) @@ -562,15 +565,8 @@ RECURSIVE SUBROUTINE DLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A, * slow down the method when many infinite eigenvalues are present K = ISTOP DO WHILE ( K.GE.ISTART2 ) - TEMP = ZERO - IF( K .LT. ISTOP ) THEN - TEMP = TEMP+ABS( B( K, K+1 ) ) - END IF - IF( K .GT. ISTART2 ) THEN - TEMP = TEMP+ABS( B( K-1, K ) ) - END IF - IF( ABS( B( K, K ) ) .LT. MAX( SMLNUM, ULP*TEMP ) ) THEN + IF( ABS( B( K, K ) ) .LT. BTOL ) THEN * A diagonal element of B is negligable, move it * to the top and deflate it @@ -682,7 +678,7 @@ RECURSIVE SUBROUTINE DLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A, NS = MIN( NSHIFTS, ISTOP-ISTART2 ) NS = MIN( NS, N_UNDEFLATED ) - SHIFTPOS = ISTOP-N_DEFLATED-N_UNDEFLATED+1 + SHIFTPOS = ISTOP-N_UNDEFLATED+1 * * Shuffle shifts to put double shifts in front * This ensures that we don't split up a double shift diff --git a/lapack-netlib/SRC/dlar1v.c b/lapack-netlib/SRC/dlar1v.c index be4d3d94aa..221c7f8c4b 100644 --- a/lapack-netlib/SRC/dlar1v.c +++ b/lapack-netlib/SRC/dlar1v.c @@ -738,7 +738,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int dlar1v_(integer *n, integer *b1, integer *bn, doublereal +/* Subroutine */ void dlar1v_(integer *n, integer *b1, integer *bn, doublereal *lambda, doublereal *d__, doublereal *l, doublereal *ld, doublereal * lld, doublereal *pivmin, doublereal *gaptol, doublereal *z__, logical *wantnc, integer *negcnt, doublereal *ztz, doublereal *mingma, @@ -1031,7 +1031,7 @@ f"> */ *rqcorr = *mingma * tmp; - return 0; + return; /* End of DLAR1V */ diff --git a/lapack-netlib/SRC/dlar2v.c b/lapack-netlib/SRC/dlar2v.c index bdb351890f..d7096f0098 100644 --- 
a/lapack-netlib/SRC/dlar2v.c +++ b/lapack-netlib/SRC/dlar2v.c @@ -620,7 +620,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlar2v_(integer *n, doublereal *x, doublereal *y, +/* Subroutine */ void dlar2v_(integer *n, doublereal *x, doublereal *y, doublereal *z__, integer *incx, doublereal *c__, doublereal *s, integer *incc) { @@ -678,6 +678,6 @@ f"> */ /* End of DLAR2V */ - return 0; + return; } /* dlar2v_ */ diff --git a/lapack-netlib/SRC/dlarf.c b/lapack-netlib/SRC/dlarf.c index fe30037af0..5bb376a820 100644 --- a/lapack-netlib/SRC/dlarf.c +++ b/lapack-netlib/SRC/dlarf.c @@ -639,7 +639,7 @@ static integer c__1 = 1; /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlarf_(char *side, integer *m, integer *n, doublereal *v, +/* Subroutine */ void dlarf_(char *side, integer *m, integer *n, doublereal *v, integer *incv, doublereal *tau, doublereal *c__, integer *ldc, doublereal *work) { @@ -648,12 +648,12 @@ static integer c__1 = 1; doublereal d__1; /* Local variables */ - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer lastc, lastv; @@ -745,7 +745,7 @@ static integer c__1 = 1; c_offset], ldc); } } - return 0; + return; /* End of DLARF */ diff --git a/lapack-netlib/SRC/dlarfb.c b/lapack-netlib/SRC/dlarfb.c index f21289d621..b04824a319 100644 --- a/lapack-netlib/SRC/dlarfb.c +++ b/lapack-netlib/SRC/dlarfb.c @@ -711,7 +711,7 
@@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlarfb_(char *side, char *trans, char *direct, char * +/* Subroutine */ void dlarfb_(char *side, char *trans, char *direct, char * storev, integer *m, integer *n, integer *k, doublereal *v, integer * ldv, doublereal *t, integer *ldt, doublereal *c__, integer *ldc, doublereal *work, integer *ldwork) @@ -722,11 +722,11 @@ f"> */ /* Local variables */ integer i__, j; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dtrmm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); @@ -760,7 +760,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0) { - return 0; + return; } if (lsame_(trans, "N")) { @@ -1330,7 +1330,7 @@ f"> */ } } - return 0; + return; /* End of DLARFB */ diff --git a/lapack-netlib/SRC/dlarfb_gett.c b/lapack-netlib/SRC/dlarfb_gett.c index 40f67601fa..232d12b23f 100644 --- a/lapack-netlib/SRC/dlarfb_gett.c +++ b/lapack-netlib/SRC/dlarfb_gett.c @@ -904,7 +904,7 @@ gett.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlarfb_gett_(char *ident, integer *m, integer *n, +/* Subroutine */ void dlarfb_gett_(char *ident, integer *m, integer *n, integer *k, doublereal *t, integer *ldt, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *work, integer *ldwork) { @@ -914,11 +914,11 @@ gett.f"> */ /* Local variables */ integer i__, j; - extern /* 
Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dtrmm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); @@ -951,7 +951,7 @@ gett.f"> */ /* Function Body */ if (*m < 0 || *n <= 0 || *k == 0 || *k > *n) { - return 0; + return; } lnotident = ! lsame_(ident, "I"); @@ -1132,7 +1132,7 @@ gett.f"> */ } } - return 0; + return; /* End of DLARFB_GETT */ diff --git a/lapack-netlib/SRC/dlarfg.c b/lapack-netlib/SRC/dlarfg.c index de8c8ec016..24995a5e1c 100644 --- a/lapack-netlib/SRC/dlarfg.c +++ b/lapack-netlib/SRC/dlarfg.c @@ -615,7 +615,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlarfg_(integer *n, doublereal *alpha, doublereal *x, +/* Subroutine */ void dlarfg_(integer *n, doublereal *alpha, doublereal *x, integer *incx, doublereal *tau) { /* System generated locals */ @@ -626,7 +626,7 @@ f"> */ doublereal beta; extern doublereal dnrm2_(integer *, doublereal *, integer *); integer j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal xnorm; extern doublereal dlapy2_(doublereal *, doublereal *), dlamch_(char *); @@ -649,7 +649,7 @@ f"> */ /* Function Body */ if (*n <= 1) { *tau = 0.; - return 0; + return; } i__1 = *n - 1; @@ -705,7 +705,7 @@ f"> */ *alpha = beta; } - return 0; + return; /* End of DLARFG */ diff --git a/lapack-netlib/SRC/dlarfgp.c 
b/lapack-netlib/SRC/dlarfgp.c index 32223c0e3a..04073a5592 100644 --- a/lapack-netlib/SRC/dlarfgp.c +++ b/lapack-netlib/SRC/dlarfgp.c @@ -613,7 +613,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlarfgp_(integer *n, doublereal *alpha, doublereal *x, +/* Subroutine */ void dlarfgp_(integer *n, doublereal *alpha, doublereal *x, integer *incx, doublereal *tau) { /* System generated locals */ @@ -624,7 +624,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ doublereal beta; extern doublereal dnrm2_(integer *, doublereal *, integer *); integer j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal savealpha, xnorm; extern doublereal dlapy2_(doublereal *, doublereal *), dlamch_(char *); @@ -647,7 +647,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Function Body */ if (*n <= 0) { *tau = 0.; - return 0; + return; } i__1 = *n - 1; @@ -753,7 +753,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *alpha = beta; } - return 0; + return; /* End of DLARFGP */ diff --git a/lapack-netlib/SRC/dlarft.c b/lapack-netlib/SRC/dlarft.c index fc6dc02eac..b5e2c542cd 100644 --- a/lapack-netlib/SRC/dlarft.c +++ b/lapack-netlib/SRC/dlarft.c @@ -677,7 +677,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlarft_(char *direct, char *storev, integer *n, integer * +/* Subroutine */ void dlarft_(char *direct, char *storev, integer *n, integer * k, doublereal *v, integer *ldv, doublereal *tau, doublereal *t, integer *ldt) { @@ -688,14 +688,13 @@ f"> */ /* Local variables */ integer i__, j; extern logical 
lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer lastv; - extern /* Subroutine */ int dtrmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtrmv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *); integer prevlastv; - extern /* Subroutine */ int mecago_(); /* -- LAPACK auxiliary routine (version 3.7.0) -- */ @@ -720,7 +719,7 @@ f"> */ /* Function Body */ if (*n == 0) { - return 0; + return; } if (lsame_(direct, "F")) { @@ -881,7 +880,7 @@ f"> */ } } } - return 0; + return; /* End of DLARFT */ diff --git a/lapack-netlib/SRC/dlarfx.c b/lapack-netlib/SRC/dlarfx.c index b852ca535a..1d5400d10f 100644 --- a/lapack-netlib/SRC/dlarfx.c +++ b/lapack-netlib/SRC/dlarfx.c @@ -634,7 +634,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlarfx_(char *side, integer *m, integer *n, doublereal * +/* Subroutine */ void dlarfx_(char *side, integer *m, integer *n, doublereal * v, doublereal *tau, doublereal *c__, integer *ldc, doublereal *work) { /* System generated locals */ @@ -642,7 +642,7 @@ f"> */ /* Local variables */ integer j; - extern /* Subroutine */ int dlarf_(char *, integer *, integer *, + extern /* Subroutine */ void dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *); extern logical lsame_(char *, char *); @@ -668,7 +668,7 @@ f"> */ /* Function Body */ if (*tau == 0.) 
{ - return 0; + return; } if (lsame_(side, "L")) { @@ -1280,7 +1280,7 @@ f"> */ goto L410; } L410: - return 0; + return; /* End of DLARFX */ diff --git a/lapack-netlib/SRC/dlarfy.c b/lapack-netlib/SRC/dlarfy.c index 409ce9a2db..3101bd5dd7 100644 --- a/lapack-netlib/SRC/dlarfy.c +++ b/lapack-netlib/SRC/dlarfy.c @@ -620,7 +620,7 @@ static integer c__1 = 1; /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlarfy_(char *uplo, integer *n, doublereal *v, integer * +/* Subroutine */ void dlarfy_(char *uplo, integer *n, doublereal *v, integer * incv, doublereal *tau, doublereal *c__, integer *ldc, doublereal * work) { @@ -631,11 +631,11 @@ static integer c__1 = 1; /* Local variables */ extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *, integer *); - extern /* Subroutine */ int dsyr2_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsyr2_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); doublereal alpha; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), dsymv_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); @@ -659,7 +659,7 @@ static integer c__1 = 1; /* Function Body */ if (*tau == 0.) 
{ - return 0; + return; } /* Form w:= C * v */ @@ -675,7 +675,7 @@ static integer c__1 = 1; d__1 = -(*tau); dsyr2_(uplo, n, &d__1, &v[1], incv, &work[1], &c__1, &c__[c_offset], ldc); - return 0; + return; /* End of DLARFY */ diff --git a/lapack-netlib/SRC/dlargv.c b/lapack-netlib/SRC/dlargv.c index 70a41e2a71..944e3e241f 100644 --- a/lapack-netlib/SRC/dlargv.c +++ b/lapack-netlib/SRC/dlargv.c @@ -613,7 +613,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlargv_(integer *n, doublereal *x, integer *incx, +/* Subroutine */ void dlargv_(integer *n, doublereal *x, integer *incx, doublereal *y, integer *incy, doublereal *c__, integer *incc) { /* System generated locals */ @@ -673,7 +673,7 @@ f"> */ ix += *incx; /* L10: */ } - return 0; + return; /* End of DLARGV */ diff --git a/lapack-netlib/SRC/dlarmm.c b/lapack-netlib/SRC/dlarmm.c new file mode 100644 index 0000000000..eec5d143a5 --- /dev/null +++ b/lapack-netlib/SRC/dlarmm.c @@ -0,0 +1,605 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline 
_Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + 
+#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? 
*(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = 
"\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], 
x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b DLARMM */ + +/* Definition: */ +/* =========== */ + +/* DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) */ + +/* DOUBLE PRECISION ANORM, BNORM, CNORM */ + +/* > \par Purpose: */ +/* ======= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > DLARMM returns a factor s in (0, 1] such that the linear updates */ +/* > */ +/* > (s * C) - A * (s * B) and (s * C) - (s * A) * B */ +/* > */ +/* > cannot overflow, where A, B, and C are matrices of conforming */ +/* > 
dimensions. */ +/* > */ +/* > This is an auxiliary routine so there is no argument checking. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========= */ + +/* > \param[in] ANORM */ +/* > \verbatim */ +/* > ANORM is DOUBLE PRECISION */ +/* > The infinity norm of A. ANORM >= 0. */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] BNORM */ +/* > \verbatim */ +/* > BNORM is DOUBLE PRECISION */ +/* > The infinity norm of B. BNORM >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] CNORM */ +/* > \verbatim */ +/* > CNORM is DOUBLE PRECISION */ +/* > The infinity norm of C. CNORM >= 0. */ +/* > \endverbatim */ +/* > */ +/* > */ +/* ===================================================================== */ +/* > References: */ +/* > C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for */ +/* > Robust Solution of Triangular Linear Systems. In: International */ +/* > Conference on Parallel Processing and Applied Mathematics, pages */ +/* > 68--78. Springer, 2017. */ +/* > */ +/* > \ingroup OTHERauxiliary */ +/* ===================================================================== */ +doublereal dlarmm_(doublereal *anorm, doublereal *bnorm, doublereal *cnorm) +{ + /* System generated locals */ + doublereal ret_val; + + /* Local variables */ + extern doublereal dlamch_(char *); + doublereal bignum, smlnum; + + + +/* Determine machine dependent parameters to control overflow. */ + + smlnum = dlamch_("Safe minimum") / dlamch_("Precision"); + bignum = 1. / smlnum / 4.; + +/* Compute a scale factor. */ + + ret_val = 1.; + if (*bnorm <= 1.) 
{ + if (*anorm * *bnorm > bignum - *cnorm) { + ret_val = .5; + } + } else { + if (*anorm > (bignum - *cnorm) / *bnorm) { + ret_val = .5 / *bnorm; + } + } + return ret_val; + +/* ==== End of DLARMM ==== */ + +} /* dlarmm_ */ + diff --git a/lapack-netlib/SRC/dlarmm.f b/lapack-netlib/SRC/dlarmm.f new file mode 100644 index 0000000000..c360420092 --- /dev/null +++ b/lapack-netlib/SRC/dlarmm.f @@ -0,0 +1,99 @@ +*> \brief \b DLARMM +* +* Definition: +* =========== +* +* DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) +* +* .. Scalar Arguments .. +* DOUBLE PRECISION ANORM, BNORM, CNORM +* .. +* +*> \par Purpose: +* ======= +*> +*> \verbatim +*> +*> DLARMM returns a factor s in (0, 1] such that the linear updates +*> +*> (s * C) - A * (s * B) and (s * C) - (s * A) * B +*> +*> cannot overflow, where A, B, and C are matrices of conforming +*> dimensions. +*> +*> This is an auxiliary routine so there is no argument checking. +*> \endverbatim +* +* Arguments: +* ========= +* +*> \param[in] ANORM +*> \verbatim +*> ANORM is DOUBLE PRECISION +*> The infinity norm of A. ANORM >= 0. +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] BNORM +*> \verbatim +*> BNORM is DOUBLE PRECISION +*> The infinity norm of B. BNORM >= 0. +*> \endverbatim +*> +*> \param[in] CNORM +*> \verbatim +*> CNORM is DOUBLE PRECISION +*> The infinity norm of C. CNORM >= 0. +*> \endverbatim +*> +*> +* ===================================================================== +*> References: +*> C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for +*> Robust Solution of Triangular Linear Systems. In: International +*> Conference on Parallel Processing and Applied Mathematics, pages +*> 68--78. Springer, 2017. +*> +*> \ingroup OTHERauxiliary +* ===================================================================== + + DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) + IMPLICIT NONE +* .. Scalar Arguments .. + DOUBLE PRECISION ANORM, BNORM, CNORM +* .. 
Parameters .. + DOUBLE PRECISION ONE, HALF, FOUR + PARAMETER ( ONE = 1.0D0, HALF = 0.5D+0, FOUR = 4.0D0 ) +* .. +* .. Local Scalars .. + DOUBLE PRECISION BIGNUM, SMLNUM +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH + EXTERNAL DLAMCH +* .. +* .. Executable Statements .. +* +* +* Determine machine dependent parameters to control overflow. +* + SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) + BIGNUM = ( ONE / SMLNUM ) / FOUR +* +* Compute a scale factor. +* + DLARMM = ONE + IF( BNORM .LE. ONE ) THEN + IF( ANORM * BNORM .GT. BIGNUM - CNORM ) THEN + DLARMM = HALF + END IF + ELSE + IF( ANORM .GT. (BIGNUM - CNORM) / BNORM ) THEN + DLARMM = HALF / BNORM + END IF + END IF + RETURN +* +* ==== End of DLARMM ==== +* + END diff --git a/lapack-netlib/SRC/dlarnv.c b/lapack-netlib/SRC/dlarnv.c index 65f2cd0912..c5f3ca408d 100644 --- a/lapack-netlib/SRC/dlarnv.c +++ b/lapack-netlib/SRC/dlarnv.c @@ -606,7 +606,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlarnv_(integer *idist, integer *iseed, integer *n, +/* Subroutine */ void dlarnv_(integer *idist, integer *iseed, integer *n, doublereal *x) { /* System generated locals */ @@ -616,7 +616,7 @@ f"> */ integer i__; doublereal u[128]; integer il, iv; - extern /* Subroutine */ int dlaruv_(integer *, integer *, doublereal *); + extern /* Subroutine */ void dlaruv_(integer *, integer *, doublereal *); integer il2; @@ -681,7 +681,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of DLARNV */ diff --git a/lapack-netlib/SRC/dlarra.c b/lapack-netlib/SRC/dlarra.c index 771da614fd..9b41fec620 100644 --- a/lapack-netlib/SRC/dlarra.c +++ b/lapack-netlib/SRC/dlarra.c @@ -644,7 +644,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int dlarra_(integer *n, doublereal *d__, doublereal *e, +/* Subroutine */ 
void dlarra_(integer *n, doublereal *d__, doublereal *e, doublereal *e2, doublereal *spltol, doublereal *tnrm, integer *nsplit, integer *isplit, integer *info) { @@ -679,7 +679,7 @@ f"> */ /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } /* Compute splitting points */ @@ -714,7 +714,7 @@ f"> */ } } isplit[*nsplit] = *n; - return 0; + return; /* End of DLARRA */ diff --git a/lapack-netlib/SRC/dlarra.f b/lapack-netlib/SRC/dlarra.f index 2fb30cd762..a62a079da0 100644 --- a/lapack-netlib/SRC/dlarra.f +++ b/lapack-netlib/SRC/dlarra.f @@ -164,6 +164,7 @@ SUBROUTINE DLARRA( N, D, E, E2, SPLTOL, TNRM, * .. Executable Statements .. * INFO = 0 + NSPLIT = 1 * * Quick return if possible * @@ -172,7 +173,6 @@ SUBROUTINE DLARRA( N, D, E, E2, SPLTOL, TNRM, END IF * * Compute splitting points - NSPLIT = 1 IF(SPLTOL.LT.ZERO) THEN * Criterion based on absolute off-diagonal value TMP1 = ABS(SPLTOL)* TNRM diff --git a/lapack-netlib/SRC/dlarrb.c b/lapack-netlib/SRC/dlarrb.c index 8a046fdd86..cdf261b4c1 100644 --- a/lapack-netlib/SRC/dlarrb.c +++ b/lapack-netlib/SRC/dlarrb.c @@ -703,7 +703,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int dlarrb_(integer *n, doublereal *d__, doublereal *lld, +/* Subroutine */ void dlarrb_(integer *n, doublereal *d__, doublereal *lld, integer *ifirst, integer *ilast, doublereal *rtol1, doublereal *rtol2, integer *offset, doublereal *w, doublereal *wgap, doublereal *werr, doublereal *work, integer *iwork, doublereal *pivmin, doublereal * @@ -751,7 +751,7 @@ f"> */ /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } maxitr = (integer) ((log(*spdiam + *pivmin) - log(*pivmin)) / log(2.)) + @@ -933,7 +933,7 @@ f"> */ wgap[ii - 1] = f2cmax(d__1,d__2); /* L111: */ } - return 0; + return; /* End of DLARRB */ diff --git a/lapack-netlib/SRC/dlarrc.c b/lapack-netlib/SRC/dlarrc.c index 
804f2d361a..c2e3c4054c 100644 --- a/lapack-netlib/SRC/dlarrc.c +++ b/lapack-netlib/SRC/dlarrc.c @@ -645,7 +645,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int dlarrc_(char *jobt, integer *n, doublereal *vl, +/* Subroutine */ void dlarrc_(char *jobt, integer *n, doublereal *vl, doublereal *vu, doublereal *d__, doublereal *e, doublereal *pivmin, integer *eigcnt, integer *lcnt, integer *rcnt, integer *info) { @@ -679,7 +679,7 @@ f"> */ /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } *lcnt = 0; @@ -752,7 +752,7 @@ f"> */ } } *eigcnt = *rcnt - *lcnt; - return 0; + return; /* end of DLARRC */ diff --git a/lapack-netlib/SRC/dlarrc.f b/lapack-netlib/SRC/dlarrc.f index 55a17626ac..d3fea59666 100644 --- a/lapack-netlib/SRC/dlarrc.f +++ b/lapack-netlib/SRC/dlarrc.f @@ -167,6 +167,9 @@ SUBROUTINE DLARRC( JOBT, N, VL, VU, D, E, PIVMIN, * .. Executable Statements .. 
* INFO = 0 + LCNT = 0 + RCNT = 0 + EIGCNT = 0 * * Quick return if possible * @@ -174,9 +177,6 @@ SUBROUTINE DLARRC( JOBT, N, VL, VU, D, E, PIVMIN, RETURN END IF * - LCNT = 0 - RCNT = 0 - EIGCNT = 0 MATT = LSAME( JOBT, 'T' ) diff --git a/lapack-netlib/SRC/dlarrd.c b/lapack-netlib/SRC/dlarrd.c index aef42a9a00..89c51279f4 100644 --- a/lapack-netlib/SRC/dlarrd.c +++ b/lapack-netlib/SRC/dlarrd.c @@ -844,7 +844,7 @@ f"> */ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlarrd_(char *range, char *order, integer *n, doublereal +/* Subroutine */ void dlarrd_(char *range, char *order, integer *n, doublereal *vl, doublereal *vu, integer *il, integer *iu, doublereal *gers, doublereal *reltol, doublereal *d__, doublereal *e, doublereal *e2, doublereal *pivmin, integer *nsplit, integer *isplit, integer *m, @@ -869,7 +869,7 @@ f"> */ extern doublereal dlamch_(char *); doublereal gu; integer ibegin, iw; - extern /* Subroutine */ int dlaebz_(integer *, integer *, integer *, + extern /* Subroutine */ void dlaebz_(integer *, integer *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, @@ -915,7 +915,7 @@ f"> */ /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } /* Decode RANGE */ @@ -950,7 +950,7 @@ f"> */ } if (*info != 0) { - return 0; + return; } /* Initialize error flags */ *info = 0; @@ -959,7 +959,7 @@ f"> */ /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } /* Simplification: */ if (irange == 3 && *il == 1 && *iu == *n) { @@ -980,7 +980,7 @@ f"> */ iblock[1] = 1; indexw[1] = 1; } - return 0; + return; } /* NB is the minimum vector length for vector bisection, or 0 */ /* if only scalar is to be done. 
*/ @@ -1045,7 +1045,7 @@ f"> */ , &iout, &iwork[1], &w[1], &iblock[1], &iinfo); if (iinfo != 0) { *info = iinfo; - return 0; + return; } /* On exit, output intervals may not be ordered by ascending negcount */ if (iwork[6] == *iu) { @@ -1067,7 +1067,7 @@ f"> */ /* and [WUL, WU] contains a value with negcount NWU. */ if (nwl < 0 || nwl >= *n || nwu < 1 || nwu > *n) { *info = 4; - return 0; + return; } } else if (irange == 2) { *wl = *vl; @@ -1205,7 +1205,7 @@ f"> */ w[*m + 1], &iblock[*m + 1], &iinfo); if (iinfo != 0) { *info = iinfo; - return 0; + return; } nwl += iwork[1]; @@ -1220,7 +1220,7 @@ f"> */ &w[*m + 1], &iblock[*m + 1], &iinfo); if (iinfo != 0) { *info = iinfo; - return 0; + return; } /* Copy eigenvalues into W and IBLOCK */ @@ -1413,7 +1413,7 @@ f"> */ if (toofew) { *info += 2; } - return 0; + return; /* End of DLARRD */ diff --git a/lapack-netlib/SRC/dlarrd.f b/lapack-netlib/SRC/dlarrd.f index 08dfd02c30..ea1896adf2 100644 --- a/lapack-netlib/SRC/dlarrd.f +++ b/lapack-netlib/SRC/dlarrd.f @@ -381,6 +381,7 @@ SUBROUTINE DLARRD( RANGE, ORDER, N, VL, VU, IL, IU, GERS, * .. Executable Statements .. * INFO = 0 + M = 0 * * Quick return if possible * @@ -424,14 +425,9 @@ SUBROUTINE DLARRD( RANGE, ORDER, N, VL, VU, IL, IU, GERS, END IF * Initialize error flags - INFO = 0 NCNVRG = .FALSE. TOOFEW = .FALSE. -* Quick return if possible - M = 0 - IF( N.EQ.0 ) RETURN - * Simplification: IF( IRANGE.EQ.INDRNG .AND. IL.EQ.1 .AND. 
IU.EQ.N ) IRANGE = 1 diff --git a/lapack-netlib/SRC/dlarre.c b/lapack-netlib/SRC/dlarre.c index 6256604565..367cf1d0bb 100644 --- a/lapack-netlib/SRC/dlarre.c +++ b/lapack-netlib/SRC/dlarre.c @@ -817,7 +817,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA \n */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlarre_(char *range, integer *n, doublereal *vl, +/* Subroutine */ void dlarre_(char *range, integer *n, doublereal *vl, doublereal *vu, integer *il, integer *iu, doublereal *d__, doublereal *e, doublereal *e2, doublereal *rtol1, doublereal *rtol2, doublereal * spltol, integer *nsplit, integer *isplit, integer *m, doublereal *w, @@ -841,11 +841,11 @@ f"> */ doublereal avgap, sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); logical norep; doublereal s1, s2; - extern /* Subroutine */ int dlasq2_(integer *, doublereal *, integer *); + extern /* Subroutine */ void dlasq2_(integer *, doublereal *, integer *); integer mb; doublereal gl; integer in; @@ -856,7 +856,7 @@ f"> */ logical forceb; integer irange; doublereal sgndef; - extern /* Subroutine */ int dlarra_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlarra_(integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, integer *), dlarrb_(integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, integer *, @@ -866,7 +866,7 @@ f"> */ *, doublereal *, integer *, integer *, integer *, integer *); integer wbegin; doublereal safmin, spdiam; - extern /* Subroutine */ int dlarrd_(char *, char *, integer *, doublereal + extern /* Subroutine */ void dlarrd_(char *, char *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, 
doublereal *, doublereal *, doublereal *, doublereal *, integer * , integer *, integer *, doublereal *, doublereal *, doublereal *, @@ -877,7 +877,7 @@ f"> */ ; logical usedqd; doublereal clwdth, isleft; - extern /* Subroutine */ int dlarnv_(integer *, integer *, integer *, + extern /* Subroutine */ void dlarnv_(integer *, integer *, integer *, doublereal *); doublereal isrght, bsrtol, dpivot; integer cnt; @@ -915,7 +915,7 @@ f"> */ /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } /* Decode RANGE */ @@ -950,7 +950,7 @@ f"> */ } /* store the shift for the initial RRR, which is zero in this case */ e[1] = 0.; - return 0; + return; } /* General case: tridiagonal matrix of order > 1 */ @@ -1015,7 +1015,7 @@ f"> */ vl, vu, &iblock[1], &indexw[1], &work[1], &iwork[1], &iinfo); if (iinfo != 0) { *info = -1; - return 0; + return; } /* Make sure that the entries M+1 to N in W, WERR, IBLOCK, INDEXW are 0 */ i__1 = *n; @@ -1124,7 +1124,7 @@ f"> */ rtl, &tmp, &tmp1, &iinfo); if (iinfo != 0) { *info = -1; - return 0; + return; } /* Computing MAX */ d__2 = gl, d__3 = tmp - tmp1 - eps * 100. * (d__1 = tmp - tmp1, @@ -1134,7 +1134,7 @@ f"> */ rtl, &tmp, &tmp1, &iinfo); if (iinfo != 0) { *info = -1; - return 0; + return; } /* Computing MIN */ d__2 = gu, d__3 = tmp + tmp1 + eps * 100. * (d__1 = tmp + tmp1, @@ -1322,7 +1322,7 @@ f"> */ /* if the program reaches this point, no base representation could be */ /* found in MAXTRY iterations. */ *info = 2; - return 0; + return; L83: /* At this point, we have found an initial base representation */ /* T - SIGMA I = L D L^T with not too much element growth. */ @@ -1388,7 +1388,7 @@ f"> */ iinfo); if (iinfo != 0) { *info = -4; - return 0; + return; } /* DLARRB computes all gaps correctly except for the last one */ /* Record distance to VU/GU */ @@ -1431,14 +1431,14 @@ f"> */ /* and should be changed. 
The index is in IWORK(1) and the */ /* gap is in WORK(N+1) */ *info = -5; - return 0; + return; } else { /* Test that all eigenvalues are positive as expected */ i__2 = in; for (i__ = 1; i__ <= i__2; ++i__) { if (work[i__] < 0.) { *info = -6; - return 0; + return; } /* L149: */ } @@ -1488,7 +1488,7 @@ f"> */ ; } - return 0; + return; /* end of DLARRE */ diff --git a/lapack-netlib/SRC/dlarre.f b/lapack-netlib/SRC/dlarre.f index afbfe6379c..70f59b8295 100644 --- a/lapack-netlib/SRC/dlarre.f +++ b/lapack-netlib/SRC/dlarre.f @@ -367,6 +367,8 @@ SUBROUTINE DLARRE( RANGE, N, VL, VU, IL, IU, D, E, E2, * INFO = 0 + NSPLIT = 0 + M = 0 * * Quick return if possible * @@ -384,8 +386,6 @@ SUBROUTINE DLARRE( RANGE, N, VL, VU, IL, IU, D, E, E2, IRANGE = INDRNG END IF - M = 0 - * Get machine constants SAFMIN = DLAMCH( 'S' ) EPS = DLAMCH( 'P' ) diff --git a/lapack-netlib/SRC/dlarrf.c b/lapack-netlib/SRC/dlarrf.c index 11664d0855..82e631d54b 100644 --- a/lapack-netlib/SRC/dlarrf.c +++ b/lapack-netlib/SRC/dlarrf.c @@ -704,7 +704,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int dlarrf_(integer *n, doublereal *d__, doublereal *l, +/* Subroutine */ void dlarrf_(integer *n, doublereal *d__, doublereal *l, doublereal *ld, integer *clstrt, integer *clend, doublereal *w, doublereal *wgap, doublereal *werr, doublereal *spdiam, doublereal * clgapl, doublereal *clgapr, doublereal *pivmin, doublereal *sigma, @@ -723,7 +723,7 @@ f"> */ integer i__; doublereal s, avgap, ldmax, rdmax; integer shift; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); doublereal bestshift, smlgrowth; logical dorrr1; @@ -766,7 +766,7 @@ f"> */ /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } fact = 2.; @@ -1011,7 +1011,7 @@ f"> */ goto L5; } else { 
*info = 1; - return 0; + return; } } L100: @@ -1022,7 +1022,7 @@ f"> */ i__1 = *n - 1; dcopy_(&i__1, &work[*n + 1], &c__1, &lplus[1], &c__1); } - return 0; + return; /* End of DLARRF */ diff --git a/lapack-netlib/SRC/dlarrj.c b/lapack-netlib/SRC/dlarrj.c index cc5ad7a052..1a45ffda66 100644 --- a/lapack-netlib/SRC/dlarrj.c +++ b/lapack-netlib/SRC/dlarrj.c @@ -675,7 +675,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int dlarrj_(integer *n, doublereal *d__, doublereal *e2, +/* Subroutine */ void dlarrj_(integer *n, doublereal *d__, doublereal *e2, integer *ifirst, integer *ilast, doublereal *rtol, integer *offset, doublereal *w, doublereal *werr, doublereal *work, integer *iwork, doublereal *pivmin, doublereal *spdiam, integer *info) @@ -718,7 +718,7 @@ f"> */ /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } maxitr = (integer) ((log(*spdiam + *pivmin) - log(*pivmin)) / log(2.)) + @@ -910,7 +910,7 @@ f"> */ /* L110: */ } - return 0; + return; /* End of DLARRJ */ diff --git a/lapack-netlib/SRC/dlarrk.c b/lapack-netlib/SRC/dlarrk.c index 8e0bec8871..73258f5ebd 100644 --- a/lapack-netlib/SRC/dlarrk.c +++ b/lapack-netlib/SRC/dlarrk.c @@ -654,7 +654,7 @@ f"> */ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlarrk_(integer *n, integer *iw, doublereal *gl, +/* Subroutine */ void dlarrk_(integer *n, integer *iw, doublereal *gl, doublereal *gu, doublereal *d__, doublereal *e2, doublereal *pivmin, doublereal *reltol, doublereal *w, doublereal *werr, integer *info) { @@ -691,7 +691,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *info = 0; - return 0; + return; } /* Get machine constants */ @@ -760,7 +760,7 @@ f"> */ *w = (left + right) * .5; *werr = (d__1 = right - left, abs(d__1)) * .5; - return 0; + return; /* End of DLARRK */ diff --git 
a/lapack-netlib/SRC/dlarrr.c b/lapack-netlib/SRC/dlarrr.c index 5b60310659..f27afc8624 100644 --- a/lapack-netlib/SRC/dlarrr.c +++ b/lapack-netlib/SRC/dlarrr.c @@ -604,7 +604,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int dlarrr_(integer *n, doublereal *d__, doublereal *e, +/* Subroutine */ void dlarrr_(integer *n, doublereal *d__, doublereal *e, integer *info) { /* System generated locals */ @@ -639,7 +639,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *info = 0; - return 0; + return; } /* As a default, do NOT go for relative-accuracy preserving computations. */ @@ -697,7 +697,7 @@ f"> */ L11: if (yesrel) { *info = 0; - return 0; + return; } else { } @@ -716,7 +716,7 @@ f"> */ /* to the computed eigenvectors (and the support) */ - return 0; + return; /* END OF DLARRR */ diff --git a/lapack-netlib/SRC/dlarrv.c b/lapack-netlib/SRC/dlarrv.c index 095642d499..d9d6725542 100644 --- a/lapack-netlib/SRC/dlarrv.c +++ b/lapack-netlib/SRC/dlarrv.c @@ -804,7 +804,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int dlarrv_(integer *n, doublereal *vl, doublereal *vu, +/* Subroutine */ void dlarrv_(integer *n, doublereal *vl, doublereal *vu, doublereal *d__, doublereal *l, doublereal *pivmin, integer *isplit, integer *m, integer *dol, integer *dou, doublereal *minrgp, doublereal *rtol1, doublereal *rtol2, doublereal *w, doublereal *werr, @@ -825,7 +825,7 @@ f"> */ integer wend, iter; doublereal bstw; integer minwsize, itmp1, i__, j, k, p, q; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); integer indld; doublereal fudge; @@ -835,12 +835,12 @@ f"> */ doublereal resid; logical eskip; doublereal right; - extern /* 
Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer nclus, zfrom; doublereal rqtol; integer iindc1, iindc2; - extern /* Subroutine */ int dlar1v_(integer *, integer *, integer *, + extern /* Subroutine */ void dlar1v_(integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, logical *, integer *, doublereal *, doublereal *, integer *, integer *, @@ -857,7 +857,7 @@ f"> */ logical needbs; integer indlld; doublereal sgndef, mingma; - extern /* Subroutine */ int dlarrb_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlarrb_(integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); @@ -867,7 +867,7 @@ f"> */ doublereal savgap; integer ndepth; doublereal ssigma; - extern /* Subroutine */ int dlarrf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlarrf_(integer *, doublereal *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, @@ -875,7 +875,7 @@ f"> */ logical usedbs; integer iindwk, offset; doublereal gaptol; - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *); integer newcls, oldfst, indwrk, windex, oldlst; logical usedrq; @@ -921,7 +921,7 @@ f"> */ /* Quick return if possible */ if (*n <= 0 || *m <= 0) { - return 0; + return; } /* The first N entries of WORK are reserved for the eigenvalues */ @@ -1071,7 +1071,7 @@ f"> */ /* This is a crude protection against infinitely deep trees */ if 
(ndepth > *m) { *info = -2; - return 0; + return; } /* breadth first processing of the current level of the representation */ /* tree: OLDNCL = number of clusters on current level */ @@ -1151,7 +1151,7 @@ f"> */ iindwk], pivmin, &spdiam, &in, &iinfo); if (iinfo != 0) { *info = -1; - return 0; + return; } /* We also recompute the extremal gaps. W holds all eigenvalues */ /* of the unshifted matrix and must be used for computation */ @@ -1318,7 +1318,7 @@ f"> */ iwork[k] = newlst; } else { *info = -2; - return 0; + return; } } else { @@ -1422,7 +1422,7 @@ f"> */ iindwk], pivmin, &spdiam, &itmp1, &iinfo); if (iinfo != 0) { *info = -3; - return 0; + return; } lambda = work[windex]; /* Reset twist index from inaccurate LAMBDA to */ @@ -1517,7 +1517,7 @@ f"> */ goto L120; } else { *info = 5; - return 0; + return; } } else { stp2ii = FALSE_; @@ -1610,7 +1610,7 @@ f"> */ ; } - return 0; + return; /* End of DLARRV */ diff --git a/lapack-netlib/SRC/dlarscl2.c b/lapack-netlib/SRC/dlarscl2.c index 06995b9240..5a34dfaf07 100644 --- a/lapack-netlib/SRC/dlarscl2.c +++ b/lapack-netlib/SRC/dlarscl2.c @@ -599,7 +599,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dlarscl2_(integer *m, integer *n, doublereal *d__, +/* Subroutine */ void dlarscl2_(integer *m, integer *n, doublereal *d__, doublereal *x, integer *ldx) { /* System generated locals */ @@ -632,6 +632,6 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ x[i__ + j * x_dim1] /= d__[i__]; } } - return 0; + return; } /* dlarscl2_ */ diff --git a/lapack-netlib/SRC/dlarscl2.f b/lapack-netlib/SRC/dlarscl2.f index 2468e2702d..cc4b9aa3c1 100644 --- a/lapack-netlib/SRC/dlarscl2.f +++ b/lapack-netlib/SRC/dlarscl2.f @@ -1,4 +1,4 @@ -*> \brief \b DLARSCL2 performs reciprocal diagonal scaling on a vector. 
+*> \brief \b DLARSCL2 performs reciprocal diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -33,7 +33,7 @@ *> *> \verbatim *> -*> DLARSCL2 performs a reciprocal diagonal scaling on an vector: +*> DLARSCL2 performs a reciprocal diagonal scaling on a matrix: *> x <-- inv(D) * x *> where the diagonal matrix D is stored as a vector. *> @@ -65,14 +65,14 @@ *> \param[in,out] X *> \verbatim *> X is DOUBLE PRECISION array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/dlartg.c b/lapack-netlib/SRC/dlartg.c index 7c71036858..c4e3889b1e 100644 --- a/lapack-netlib/SRC/dlartg.c +++ b/lapack-netlib/SRC/dlartg.c @@ -608,7 +608,7 @@ f"> */ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlartg_(doublereal *f, doublereal *g, doublereal *cs, +/* Subroutine */ void dlartg_(doublereal *f, doublereal *g, doublereal *cs, doublereal *sn, doublereal *r__) { /* System generated locals */ @@ -722,7 +722,7 @@ f"> */ *r__ = -(*r__); } } - return 0; + return; /* End of DLARTG */ diff --git a/lapack-netlib/SRC/dlartg.f90 b/lapack-netlib/SRC/dlartg.f90 index ef8c6e3865..b7049c32f1 100644 --- a/lapack-netlib/SRC/dlartg.f90 +++ b/lapack-netlib/SRC/dlartg.f90 @@ -11,7 +11,7 @@ ! SUBROUTINE DLARTG( F, G, C, S, R ) ! ! .. Scalar Arguments .. -! REAL(wp) C, F, G, R, S +! REAL(wp) C, F, G, R, S ! .. ! !> \par Purpose: @@ -45,8 +45,6 @@ !> floating point operations (saves work in DBDSQR when !> there are zeros on the diagonal). !> -!> If F exceeds G in magnitude, C will be positive. 
-!> !> Below, wp=>dp stands for double precision from LA_CONSTANTS module. !> \endverbatim ! @@ -112,7 +110,7 @@ subroutine DLARTG( f, g, c, s, r ) use LA_CONSTANTS, & only: wp=>dp, zero=>dzero, half=>dhalf, one=>done, & - rtmin=>drtmin, rtmax=>drtmax, safmin=>dsafmin, safmax=>dsafmax + safmin=>dsafmin, safmax=>dsafmax ! ! -- LAPACK auxiliary routine -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -123,11 +121,15 @@ subroutine DLARTG( f, g, c, s, r ) real(wp) :: c, f, g, r, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, fs, g1, gs, p, u, uu + real(wp) :: d, f1, fs, g1, gs, u, rtmin, rtmax ! .. ! .. Intrinsic Functions .. intrinsic :: abs, sign, sqrt ! .. +! .. Constants .. + rtmin = sqrt( safmin ) + rtmax = sqrt( safmax/2 ) +! .. ! .. Executable Statements .. ! f1 = abs( f ) @@ -143,20 +145,18 @@ subroutine DLARTG( f, g, c, s, r ) else if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. g1 < rtmax ) then d = sqrt( f*f + g*g ) - p = one / d - c = f1*p - s = g*sign( p, f ) + c = f1 / d r = sign( d, f ) + s = g / r else u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - fs = f*uu - gs = g*uu + fs = f / u + gs = g / u d = sqrt( fs*fs + gs*gs ) - p = one / d - c = abs( fs )*p - s = gs*sign( p, f ) - r = sign( d, f )*u + c = abs( fs ) / d + r = sign( d, f ) + s = gs / r + r = r*u end if return end subroutine diff --git a/lapack-netlib/SRC/dlartgp.c b/lapack-netlib/SRC/dlartgp.c index 903229a481..6bdb30f957 100644 --- a/lapack-netlib/SRC/dlartgp.c +++ b/lapack-netlib/SRC/dlartgp.c @@ -610,7 +610,7 @@ static doublereal c_b6 = 1.; /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlartgp_(doublereal *f, doublereal *g, doublereal *cs, +/* Subroutine */ void dlartgp_(doublereal *f, doublereal *g, doublereal *cs, doublereal *sn, doublereal *r__) { /* System generated locals */ @@ -724,7 +724,7 @@ static doublereal c_b6 = 1.; *r__ = -(*r__); } } - return 0; 
+ return; /* End of DLARTGP */ diff --git a/lapack-netlib/SRC/dlartgs.c b/lapack-netlib/SRC/dlartgs.c index 878ddee3a0..e995262759 100644 --- a/lapack-netlib/SRC/dlartgs.c +++ b/lapack-netlib/SRC/dlartgs.c @@ -602,13 +602,13 @@ he bidiagonal SVD problem. */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dlartgs_(doublereal *x, doublereal *y, doublereal *sigma, +/* Subroutine */ void dlartgs_(doublereal *x, doublereal *y, doublereal *sigma, doublereal *cs, doublereal *sn) { doublereal r__, s, w, z__; extern doublereal dlamch_(char *); doublereal thresh; - extern /* Subroutine */ int dlartgp_(doublereal *, doublereal *, + extern /* Subroutine */ void dlartgp_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); @@ -657,7 +657,7 @@ he bidiagonal SVD problem. */ dlartgp_(&w, &z__, sn, cs, &r__); - return 0; + return; /* End DLARTGS */ diff --git a/lapack-netlib/SRC/dlartv.c b/lapack-netlib/SRC/dlartv.c index d72f5c6948..e5dc6e61b8 100644 --- a/lapack-netlib/SRC/dlartv.c +++ b/lapack-netlib/SRC/dlartv.c @@ -618,7 +618,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlartv_(integer *n, doublereal *x, integer *incx, +/* Subroutine */ void dlartv_(integer *n, doublereal *x, integer *incx, doublereal *y, integer *incy, doublereal *c__, doublereal *s, integer *incc) { @@ -660,7 +660,7 @@ f"> */ ic += *incc; /* L10: */ } - return 0; + return; /* End of DLARTV */ diff --git a/lapack-netlib/SRC/dlaruv.c b/lapack-netlib/SRC/dlaruv.c index bfdb82b206..c6a996a4b8 100644 --- a/lapack-netlib/SRC/dlaruv.c +++ b/lapack-netlib/SRC/dlaruv.c @@ -604,7 +604,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlaruv_(integer *iseed, integer *n, doublereal *x) +/* Subroutine */ void dlaruv_(integer 
*iseed, integer *n, doublereal *x) { /* Initialized data */ @@ -726,7 +726,7 @@ f"> */ iseed[2] = it2; iseed[3] = it3; iseed[4] = it4; - return 0; + return; /* End of DLARUV */ diff --git a/lapack-netlib/SRC/dlarz.c b/lapack-netlib/SRC/dlarz.c index cfd7e1dab3..611daefd82 100644 --- a/lapack-netlib/SRC/dlarz.c +++ b/lapack-netlib/SRC/dlarz.c @@ -659,7 +659,7 @@ static doublereal c_b5 = 1.; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlarz_(char *side, integer *m, integer *n, integer *l, +/* Subroutine */ void dlarz_(char *side, integer *m, integer *n, integer *l, doublereal *v, integer *incv, doublereal *tau, doublereal *c__, integer *ldc, doublereal *work) { @@ -668,11 +668,11 @@ static doublereal c_b5 = 1.; doublereal d__1; /* Local variables */ - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), daxpy_(integer @@ -756,7 +756,7 @@ static doublereal c_b5 = 1.; } - return 0; + return; /* End of DLARZ */ diff --git a/lapack-netlib/SRC/dlarzb.c b/lapack-netlib/SRC/dlarzb.c index 7565e5d021..61ca619927 100644 --- a/lapack-netlib/SRC/dlarzb.c +++ b/lapack-netlib/SRC/dlarzb.c @@ -697,7 +697,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlarzb_(char *side, char *trans, char *direct, char * +/* Subroutine */ void dlarzb_(char *side, char *trans, char *direct, char * storev, integer *m, integer 
*n, integer *k, integer *l, doublereal *v, integer *ldv, doublereal *t, integer *ldt, doublereal *c__, integer * ldc, doublereal *work, integer *ldwork) @@ -708,15 +708,15 @@ f"> */ /* Local variables */ integer info, i__, j; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dtrmm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, - doublereal *, integer *), xerbla_( - char *, integer *, ftnlen); + doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); char transt[1]; @@ -747,7 +747,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0) { - return 0; + return; } /* Check for currently supported options */ @@ -761,7 +761,7 @@ f"> */ if (info != 0) { i__1 = -info; xerbla_("DLARZB", &i__1, (ftnlen)6); - return 0; + return; } if (lsame_(trans, "N")) { @@ -867,7 +867,7 @@ f"> */ } - return 0; + return; /* End of DLARZB */ diff --git a/lapack-netlib/SRC/dlarzt.c b/lapack-netlib/SRC/dlarzt.c index 166d7a3f32..fd508cb5d1 100644 --- a/lapack-netlib/SRC/dlarzt.c +++ b/lapack-netlib/SRC/dlarzt.c @@ -699,7 +699,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlarzt_(char *direct, char *storev, integer *n, integer * +/* Subroutine */ void dlarzt_(char *direct, char *storev, integer *n, integer * k, doublereal *v, integer *ldv, doublereal *tau, doublereal *t, integer *ldt) { @@ -710,11 +710,12 @@ f"> */ /* Local variables */ integer info, i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine 
*/ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dtrmv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -747,7 +748,7 @@ f"> */ if (info != 0) { i__1 = -info; xerbla_("DLARZT", &i__1, (ftnlen)6); - return 0; + return; } for (i__ = *k; i__ >= 1; --i__) { @@ -785,7 +786,7 @@ f"> */ } /* L20: */ } - return 0; + return; /* End of DLARZT */ diff --git a/lapack-netlib/SRC/dlas2.c b/lapack-netlib/SRC/dlas2.c index a4fe5ef120..4090a4da76 100644 --- a/lapack-netlib/SRC/dlas2.c +++ b/lapack-netlib/SRC/dlas2.c @@ -618,7 +618,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlas2_(doublereal *f, doublereal *g, doublereal *h__, +/* Subroutine */ void dlas2_(doublereal *f, doublereal *g, doublereal *h__, doublereal *ssmin, doublereal *ssmax) { /* System generated locals */ @@ -685,7 +685,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } } - return 0; + return; /* End of DLAS2 */ diff --git a/lapack-netlib/SRC/dlascl.c b/lapack-netlib/SRC/dlascl.c index 42cab9b186..87d6903b72 100644 --- a/lapack-netlib/SRC/dlascl.c +++ b/lapack-netlib/SRC/dlascl.c @@ -652,7 +652,7 @@ f"> */ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlascl_(char *type__, integer *kl, integer *ku, +/* Subroutine */ void dlascl_(char *type__, integer *kl, integer *ku, doublereal *cfrom, doublereal *cto, integer *m, integer *n, doublereal *a, integer *lda, integer *info) 
{ @@ -743,13 +743,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLASCL", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *m == 0) { - return 0; + return; } /* Get machine parameters */ @@ -912,7 +912,7 @@ f"> */ goto L10; } - return 0; + return; /* End of DLASCL */ diff --git a/lapack-netlib/SRC/dlascl.f b/lapack-netlib/SRC/dlascl.f index 05ad1c4f3c..0a4bf21ce1 100644 --- a/lapack-netlib/SRC/dlascl.f +++ b/lapack-netlib/SRC/dlascl.f @@ -272,6 +272,8 @@ SUBROUTINE DLASCL( TYPE, KL, KU, CFROM, CTO, M, N, A, LDA, INFO ) ELSE MUL = CTOC / CFROMC DONE = .TRUE. + IF (MUL .EQ. ONE) + $ RETURN END IF END IF * diff --git a/lapack-netlib/SRC/dlascl2.c b/lapack-netlib/SRC/dlascl2.c index 2a63e73211..b677a534ec 100644 --- a/lapack-netlib/SRC/dlascl2.c +++ b/lapack-netlib/SRC/dlascl2.c @@ -599,7 +599,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dlascl2_(integer *m, integer *n, doublereal *d__, +/* Subroutine */ void dlascl2_(integer *m, integer *n, doublereal *d__, doublereal *x, integer *ldx) { /* System generated locals */ @@ -632,6 +632,6 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ x[i__ + j * x_dim1] *= d__[i__]; } } - return 0; + return; } /* dlascl2_ */ diff --git a/lapack-netlib/SRC/dlascl2.f b/lapack-netlib/SRC/dlascl2.f index 901e43c494..568e296ad0 100644 --- a/lapack-netlib/SRC/dlascl2.f +++ b/lapack-netlib/SRC/dlascl2.f @@ -1,4 +1,4 @@ -*> \brief \b DLASCL2 performs diagonal scaling on a vector. +*> \brief \b DLASCL2 performs diagonal scaling on a matrix. 
* * =========== DOCUMENTATION =========== * @@ -33,7 +33,7 @@ *> *> \verbatim *> -*> DLASCL2 performs a diagonal scaling on a vector: +*> DLASCL2 performs a diagonal scaling on a matrix: *> x <-- D * x *> where the diagonal matrix D is stored as a vector. *> @@ -65,14 +65,14 @@ *> \param[in,out] X *> \verbatim *> X is DOUBLE PRECISION array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/dlasd0.c b/lapack-netlib/SRC/dlasd0.c index c702665b03..fdc8be7d46 100644 --- a/lapack-netlib/SRC/dlasd0.c +++ b/lapack-netlib/SRC/dlasd0.c @@ -664,7 +664,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlasd0_(integer *n, integer *sqre, doublereal *d__, +/* Subroutine */ void dlasd0_(integer *n, integer *sqre, doublereal *d__, doublereal *e, doublereal *u, integer *ldu, doublereal *vt, integer * ldvt, integer *smlsiz, integer *iwork, doublereal *work, integer * info) @@ -677,17 +677,17 @@ f"> */ integer idxq, nlvl, i__, j, m; doublereal alpha; integer inode, ndiml, idxqc, ndimr, itemp, sqrei, i1; - extern /* Subroutine */ int dlasd1_(integer *, integer *, integer *, + extern /* Subroutine */ void dlasd1_(integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, doublereal *, integer *); integer ic, lf, nd, ll, nl, nr; - extern /* Subroutine */ int dlasdq_(char *, integer *, integer *, integer + extern /* Subroutine */ void dlasdq_(char *, integer *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, 
integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *), dlasdt_(integer *, integer *, - integer *, integer *, integer *, integer *, integer *), xerbla_( - char *, integer *, ftnlen); + integer *, integer *, integer *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer im1, ncc, nlf, nrf, iwk, lvl, ndb1, nlp1, nrp1; @@ -735,7 +735,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLASD0", &i__1, (ftnlen)6); - return 0; + return; } /* If the input matrix is too small, call DLASDQ to find the SVD. */ @@ -743,7 +743,7 @@ f"> */ if (*n <= *smlsiz) { dlasdq_("U", sqre, n, &m, n, &c__0, &d__[1], &e[1], &vt[vt_offset], ldvt, &u[u_offset], ldu, &u[u_offset], ldu, &work[1], info); - return 0; + return; } /* Set up the computation tree. */ @@ -783,7 +783,7 @@ f"> */ nlf + nlf * vt_dim1], ldvt, &u[nlf + nlf * u_dim1], ldu, &u[ nlf + nlf * u_dim1], ldu, &work[1], info); if (*info != 0) { - return 0; + return; } itemp = idxq + nlf - 2; i__2 = nl; @@ -801,7 +801,7 @@ f"> */ nrf + nrf * vt_dim1], ldvt, &u[nrf + nrf * u_dim1], ldu, &u[ nrf + nrf * u_dim1], ldu, &work[1], info); if (*info != 0) { - return 0; + return; } itemp = idxq + ic; i__2 = nr; @@ -824,7 +824,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; @@ -849,14 +849,14 @@ f"> */ /* Report the possible convergence failure. 
*/ if (*info != 0) { - return 0; + return; } /* L40: */ } /* L50: */ } - return 0; + return; /* End of DLASD0 */ diff --git a/lapack-netlib/SRC/dlasd1.c b/lapack-netlib/SRC/dlasd1.c index 8b0d7031cf..c45c2fe7d9 100644 --- a/lapack-netlib/SRC/dlasd1.c +++ b/lapack-netlib/SRC/dlasd1.c @@ -720,7 +720,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlasd1_(integer *nl, integer *nr, integer *sqre, +/* Subroutine */ void dlasd1_(integer *nl, integer *nr, integer *sqre, doublereal *d__, doublereal *alpha, doublereal *beta, doublereal *u, integer *ldu, doublereal *vt, integer *ldvt, integer *idxq, integer * iwork, doublereal *work, integer *info) @@ -731,7 +731,7 @@ f"> */ /* Local variables */ integer idxc, idxp, ldvt2, i__, k, m, n, n1, n2; - extern /* Subroutine */ int dlasd2_(integer *, integer *, integer *, + extern /* Subroutine */ void dlasd2_(integer *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, @@ -741,11 +741,11 @@ f"> */ doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, doublereal *, integer *); integer iq; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); integer iz; - extern /* Subroutine */ int dlamrg_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dlamrg_(integer *, integer *, doublereal *, integer *, integer *, integer *); integer isigma; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -790,7 +790,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLASD1", &i__1, (ftnlen)6); - return 0; + return; } n = *nl + *nr + 1; @@ 
-848,7 +848,7 @@ f"> */ /* Report the convergence failure. */ if (*info != 0) { - return 0; + return; } /* Unscale. */ @@ -861,7 +861,7 @@ f"> */ n2 = n - k; dlamrg_(&n1, &n2, &d__[1], &c__1, &c_n1, &idxq[1]); - return 0; + return; /* End of DLASD1 */ diff --git a/lapack-netlib/SRC/dlasd2.c b/lapack-netlib/SRC/dlasd2.c index 8e915c9493..457957a061 100644 --- a/lapack-netlib/SRC/dlasd2.c +++ b/lapack-netlib/SRC/dlasd2.c @@ -782,7 +782,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlasd2_(integer *nl, integer *nr, integer *sqre, integer +/* Subroutine */ void dlasd2_(integer *nl, integer *nr, integer *sqre, integer *k, doublereal *d__, doublereal *z__, doublereal *alpha, doublereal * beta, doublereal *u, integer *ldu, doublereal *vt, integer *ldvt, doublereal *dsigma, doublereal *u2, integer *ldu2, doublereal *vt2, @@ -796,14 +796,14 @@ f"> */ /* Local variables */ integer idxi, idxj; - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer ctot[4]; doublereal c__; integer i__, j, m, n; doublereal s; integer idxjp; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer jprev, k2; doublereal z1; @@ -811,11 +811,11 @@ f"> */ integer ct; extern doublereal dlamch_(char *); integer jp; - extern /* Subroutine */ int dlamrg_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dlamrg_(integer *, integer *, doublereal *, integer *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *), dlaset_(char *, integer *, integer *, doublereal *, - doublereal *, doublereal *, integer *), xerbla_(char *, - integer *, ftnlen); + doublereal 
*, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal hlftol, eps, tau, tol; integer psm[4], nlp1, nlp2; @@ -879,7 +879,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLASD2", &i__1, (ftnlen)6); - return 0; + return; } nlp1 = *nl + 1; @@ -1206,7 +1206,7 @@ f"> */ /* L190: */ } - return 0; + return; /* End of DLASD2 */ diff --git a/lapack-netlib/SRC/dlasd3.c b/lapack-netlib/SRC/dlasd3.c index d648ecbeae..d4b7472957 100644 --- a/lapack-netlib/SRC/dlasd3.c +++ b/lapack-netlib/SRC/dlasd3.c @@ -740,7 +740,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlasd3_(integer *nl, integer *nr, integer *sqre, integer +/* Subroutine */ void dlasd3_(integer *nl, integer *nr, integer *sqre, integer *k, doublereal *d__, doublereal *q, integer *ldq, doublereal *dsigma, doublereal *u, integer *ldu, doublereal *u2, integer *ldu2, doublereal *vt, integer *ldvt, doublereal *vt2, integer *ldvt2, @@ -755,23 +755,23 @@ f"> */ doublereal temp; extern doublereal dnrm2_(integer *, doublereal *, integer *); integer i__, j, m, n; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer ctemp; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer ktemp; extern doublereal dlamc3_(doublereal *, doublereal *); - extern /* Subroutine */ int dlasd4_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dlasd4_(integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *); integer jc; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, 
+ extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer - *, doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + *, doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal rho; integer nlp1, nlp2, nrp1; @@ -841,7 +841,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLASD3", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -858,7 +858,7 @@ f"> */ /* L10: */ } } - return 0; + return; } /* Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can */ @@ -904,7 +904,7 @@ f"> */ /* If the zero finder fails, report the convergence failure. */ if (*info != 0) { - return 0; + return; } /* L30: */ } @@ -1007,7 +1007,7 @@ f"> */ if (*k == 2) { dgemm_("N", "N", k, &m, k, &c_b13, &q[q_offset], ldq, &vt2[vt2_offset] , ldvt2, &c_b26, &vt[vt_offset], ldvt); - return 0; + return; } ktemp = ctot[1] + 1; dgemm_("N", "N", k, &nlp1, &ktemp, &c_b13, &q[q_dim1 + 1], ldq, &vt2[ @@ -1038,7 +1038,7 @@ f"> */ vt2[ktemp + nlp2 * vt2_dim1], ldvt2, &c_b26, &vt[nlp2 * vt_dim1 + 1], ldvt); - return 0; + return; /* End of DLASD3 */ diff --git a/lapack-netlib/SRC/dlasd4.c b/lapack-netlib/SRC/dlasd4.c index e0c3a23e18..a19c9d997b 100644 --- a/lapack-netlib/SRC/dlasd4.c +++ b/lapack-netlib/SRC/dlasd4.c @@ -663,7 +663,7 @@ f"> */ /* > at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlasd4_(integer *n, integer *i__, doublereal *d__, +/* Subroutine */ void dlasd4_(integer *n, integer *i__, doublereal *d__, doublereal *z__, doublereal *delta, doublereal *rho, doublereal * sigma, doublereal *work, integer *info) { @@ -681,7 +681,7 @@ f"> */ doublereal dtisq; logical swtch; doublereal dtnsq; - extern /* Subroutine */ int dlaed6_(integer *, logical *, doublereal 
*, + extern /* Subroutine */ void dlaed6_(integer *, logical *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *) , dlasd5_(integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); @@ -728,11 +728,11 @@ f"> */ *sigma = sqrt(d__[1] * d__[1] + *rho * z__[1] * z__[1]); delta[1] = 1.; work[1] = 1.; - return 0; + return; } if (*n == 2) { dlasd5_(i__, &d__[1], &z__[1], &delta[1], rho, sigma, &work[1]); - return 0; + return; } /* Compute machine epsilon */ @@ -1661,7 +1661,7 @@ f"> */ } L240: - return 0; + return; /* End of DLASD4 */ diff --git a/lapack-netlib/SRC/dlasd5.c b/lapack-netlib/SRC/dlasd5.c index efbb03073f..aad2836af2 100644 --- a/lapack-netlib/SRC/dlasd5.c +++ b/lapack-netlib/SRC/dlasd5.c @@ -626,7 +626,7 @@ f"> */ /* > at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlasd5_(integer *i__, doublereal *d__, doublereal *z__, +/* Subroutine */ void dlasd5_(integer *i__, doublereal *d__, doublereal *z__, doublereal *delta, doublereal *rho, doublereal *dsigma, doublereal * work) { @@ -733,7 +733,7 @@ f"> */ /* DELTA( 1 ) = DELTA( 1 ) / TEMP */ /* DELTA( 2 ) = DELTA( 2 ) / TEMP */ } - return 0; + return; /* End of DLASD5 */ diff --git a/lapack-netlib/SRC/dlasd6.c b/lapack-netlib/SRC/dlasd6.c index ba2465acf6..46b1c968ef 100644 --- a/lapack-netlib/SRC/dlasd6.c +++ b/lapack-netlib/SRC/dlasd6.c @@ -827,7 +827,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlasd6_(integer *icompq, integer *nl, integer *nr, +/* Subroutine */ void dlasd6_(integer *icompq, integer *nl, integer *nr, integer *sqre, doublereal *d__, doublereal *vf, doublereal *vl, doublereal *alpha, doublereal *beta, integer *idxq, integer *perm, integer *givptr, integer *givcol, integer *ldgcol, doublereal *givnum, @@ -842,10 +842,10 @@ f"> */ /* Local 
variables */ integer idxc, idxp, ivfw, ivlw, i__, m, n; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer n1, n2; - extern /* Subroutine */ int dlasd7_(integer *, integer *, integer *, + extern /* Subroutine */ void dlasd7_(integer *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, @@ -855,7 +855,7 @@ f"> */ doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer iw; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dlamrg_(integer *, integer *, doublereal *, integer *, integer *, integer *); @@ -918,7 +918,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLASD6", &i__1, (ftnlen)6); - return 0; + return; } /* The following values are for bookkeeping purposes only. They are */ @@ -967,7 +967,7 @@ f"> */ /* Report the possible convergence failure. */ if (*info != 0) { - return 0; + return; } /* Save the poles if ICOMPQ = 1. 
*/ @@ -987,7 +987,7 @@ f"> */ n2 = n - *k; dlamrg_(&n1, &n2, &d__[1], &c__1, &c_n1, &idxq[1]); - return 0; + return; /* End of DLASD6 */ diff --git a/lapack-netlib/SRC/dlasd7.c b/lapack-netlib/SRC/dlasd7.c index 799bcb003a..6ff2e9f111 100644 --- a/lapack-netlib/SRC/dlasd7.c +++ b/lapack-netlib/SRC/dlasd7.c @@ -791,7 +791,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlasd7_(integer *icompq, integer *nl, integer *nr, +/* Subroutine */ void dlasd7_(integer *icompq, integer *nl, integer *nr, integer *sqre, integer *k, doublereal *d__, doublereal *z__, doublereal *zw, doublereal *vf, doublereal *vfw, doublereal *vl, doublereal *vlw, doublereal *alpha, doublereal *beta, doublereal * @@ -805,17 +805,18 @@ f"> */ /* Local variables */ integer idxi, idxj; - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer i__, j, m, n, idxjp; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer jprev, k2; doublereal z1; extern doublereal dlapy2_(doublereal *, doublereal *), dlamch_(char *); integer jp; - extern /* Subroutine */ int dlamrg_(integer *, integer *, doublereal *, - integer *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlamrg_(integer *, integer *, doublereal *, + integer *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal hlftol, eps, tau, tol; integer nlp1, nlp2; @@ -873,7 +874,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLASD7", &i__1, (ftnlen)6); - return 0; + return; } nlp1 = *nl + 1; @@ -1128,7 +1129,7 @@ f"> */ i__1 = n - 1; dcopy_(&i__1, &vlw[2], &c__1, &vl[2], &c__1); - return 0; + return; /* 
End of DLASD7 */ diff --git a/lapack-netlib/SRC/dlasd8.c b/lapack-netlib/SRC/dlasd8.c index 415a93c4a0..e3e771c946 100644 --- a/lapack-netlib/SRC/dlasd8.c +++ b/lapack-netlib/SRC/dlasd8.c @@ -681,7 +681,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlasd8_(integer *icompq, integer *k, doublereal *d__, +/* Subroutine */ void dlasd8_(integer *icompq, integer *k, doublereal *d__, doublereal *z__, doublereal *vf, doublereal *vl, doublereal *difl, doublereal *difr, integer *lddifr, doublereal *dsigma, doublereal * work, integer *info) @@ -697,18 +697,18 @@ f"> */ extern doublereal dnrm2_(integer *, doublereal *, integer *); integer iwk2i, iwk3i, i__, j; doublereal diflj, difrj, dsigj; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); extern doublereal dlamc3_(doublereal *, doublereal *); - extern /* Subroutine */ int dlasd4_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dlasd4_(integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *); doublereal dj; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dlaset_(char *, integer *, integer - *, doublereal *, doublereal *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + *, doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal dsigjp, rho; integer iwk1, iwk2, iwk3; @@ -749,7 +749,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLASD8", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -761,7 +761,7 @@ f"> */ difl[2] = 1.; difr[(difr_dim1 << 
1) + 1] = 1.; } - return 0; + return; } /* Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can */ @@ -816,7 +816,7 @@ f"> */ /* If the root finder fails, report the convergence failure. */ if (*info != 0) { - return 0; + return; } work[iwk3i + j] = work[iwk3i + j] * work[j] * work[iwk2i + j]; difl[j] = -work[j]; @@ -883,7 +883,7 @@ f"> */ dcopy_(k, &work[iwk2], &c__1, &vf[1], &c__1); dcopy_(k, &work[iwk3], &c__1, &vl[1], &c__1); - return 0; + return; /* End of DLASD8 */ diff --git a/lapack-netlib/SRC/dlasda.c b/lapack-netlib/SRC/dlasda.c index 72f9d55f39..f9993150da 100644 --- a/lapack-netlib/SRC/dlasda.c +++ b/lapack-netlib/SRC/dlasda.c @@ -789,7 +789,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlasda_(integer *icompq, integer *smlsiz, integer *n, +/* Subroutine */ void dlasda_(integer *icompq, integer *smlsiz, integer *n, integer *sqre, doublereal *d__, doublereal *e, doublereal *u, integer *ldu, doublereal *vt, integer *k, doublereal *difl, doublereal *difr, doublereal *z__, doublereal *poles, integer *givptr, integer *givcol, @@ -807,23 +807,24 @@ f"> */ integer idxq, nlvl, i__, j, m; doublereal alpha; integer inode, ndiml, ndimr, idxqi, itemp; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer sqrei, i1; - extern /* Subroutine */ int dlasd6_(integer *, integer *, integer *, + extern /* Subroutine */ void dlasd6_(integer *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *); integer ic, nwork1, lf, nd, nwork2, ll, nl, vf, nr, vl; - extern /* Subroutine */ 
int dlasdq_(char *, integer *, integer *, integer + extern /* Subroutine */ void dlasdq_(char *, integer *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *), dlasdt_(integer *, integer *, integer *, integer *, integer *, integer *, integer *), dlaset_( char *, integer *, integer *, doublereal *, doublereal *, - doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer im1, smlszp, ncc, nlf, nrf, vfi, iwk, vli, lvl, nru, ndb1, nlp1, lvl2, nrp1; @@ -895,7 +896,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLASDA", &i__1, (ftnlen)6); - return 0; + return; } m = *n + *sqre; @@ -912,7 +913,7 @@ f"> */ , ldu, &u[u_offset], ldu, &u[u_offset], ldu, &work[1], info); } - return 0; + return; } /* Book-keeping and set up the computation tree. */ @@ -979,7 +980,7 @@ f"> */ ; } if (*info != 0) { - return 0; + return; } i__2 = nl; for (j = 1; j <= i__2; ++j) { @@ -1015,7 +1016,7 @@ f"> */ ; } if (*info != 0) { - return 0; + return; } i__2 = nr; for (j = 1; j <= i__2; ++j) { @@ -1027,7 +1028,7 @@ f"> */ /* Now conquer each subproblem bottom-up. 
*/ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); for (lvl = nlvl; lvl >= 1; --lvl) { lvl2 = (lvl << 1) - 1; @@ -1039,7 +1040,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; @@ -1080,14 +1081,14 @@ f"> */ &s[j], &work[nwork1], &iwork[iwk], info); } if (*info != 0) { - return 0; + return; } /* L40: */ } /* L50: */ } - return 0; + return; /* End of DLASDA */ diff --git a/lapack-netlib/SRC/dlasdq.c b/lapack-netlib/SRC/dlasdq.c index cee659c399..054dabbacc 100644 --- a/lapack-netlib/SRC/dlasdq.c +++ b/lapack-netlib/SRC/dlasdq.c @@ -724,7 +724,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlasdq_(char *uplo, integer *sqre, integer *n, integer * +/* Subroutine */ void dlasdq_(char *uplo, integer *sqre, integer *n, integer * ncvt, integer *nru, integer *ncc, doublereal *d__, doublereal *e, doublereal *vt, integer *ldvt, doublereal *u, integer *ldu, doublereal *c__, integer *ldc, doublereal *work, integer *info) @@ -739,14 +739,15 @@ f"> */ integer sqre1, i__, j; doublereal r__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dlasr_(char *, char *, char *, integer *, + extern /* Subroutine */ void dlasr_(char *, char *, char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *), dswap_(integer *, doublereal *, integer * , doublereal *, integer *); integer iuplo; doublereal cs, sn; - extern /* Subroutine */ int dlartg_(doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *), xerbla_(char *, - integer *, ftnlen), dbdsqr_(char *, integer *, integer *, integer + extern /* Subroutine */ void dlartg_(doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dbdsqr_(char *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, 
doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); @@ -810,10 +811,10 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLASDQ", &i__1, (ftnlen)6); - return 0; + return; } if (*n == 0) { - return 0; + return; } /* ROTATE is true if any singular vectors desired, false otherwise */ @@ -953,7 +954,7 @@ f"> */ /* L40: */ } - return 0; + return; /* End of DLASDQ */ diff --git a/lapack-netlib/SRC/dlasdt.c b/lapack-netlib/SRC/dlasdt.c index 54259a2d6c..d651ab090c 100644 --- a/lapack-netlib/SRC/dlasdt.c +++ b/lapack-netlib/SRC/dlasdt.c @@ -614,7 +614,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlasdt_(integer *n, integer *lvl, integer *nd, integer * +/* Subroutine */ void dlasdt_(integer *n, integer *lvl, integer *nd, integer * inode, integer *ndiml, integer *ndimr, integer *msub) { /* System generated locals */ @@ -678,7 +678,7 @@ f"> */ } *nd = (llst << 1) - 1; - return 0; + return; /* End of DLASDT */ diff --git a/lapack-netlib/SRC/dlaset.c b/lapack-netlib/SRC/dlaset.c index 060fefbee4..7777c30598 100644 --- a/lapack-netlib/SRC/dlaset.c +++ b/lapack-netlib/SRC/dlaset.c @@ -620,7 +620,7 @@ f"> */ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlaset_(char *uplo, integer *m, integer *n, doublereal * +/* Subroutine */ void dlaset_(char *uplo, integer *m, integer *n, doublereal * alpha, doublereal *beta, doublereal *a, integer *lda) { /* System generated locals */ @@ -701,7 +701,7 @@ f"> */ /* L70: */ } - return 0; + return; /* End of DLASET */ diff --git a/lapack-netlib/SRC/dlasq1.c b/lapack-netlib/SRC/dlasq1.c index aaafa28d94..d77f5653e9 100644 --- a/lapack-netlib/SRC/dlasq1.c +++ b/lapack-netlib/SRC/dlasq1.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* 
===================================================================== */ -/* Subroutine */ int dlasq1_(integer *n, doublereal *d__, doublereal *e, +/* Subroutine */ void dlasq1_(integer *n, doublereal *d__, doublereal *e, doublereal *work, integer *info) { /* System generated locals */ @@ -631,22 +631,23 @@ f"> */ doublereal d__1, d__2, d__3; /* Local variables */ - extern /* Subroutine */ int dlas2_(doublereal *, doublereal *, doublereal + extern /* Subroutine */ void dlas2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); integer i__; doublereal scale; integer iinfo; doublereal sigmn; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); doublereal sigmx; - extern /* Subroutine */ int dlasq2_(integer *, doublereal *, integer *); + extern /* Subroutine */ void dlasq2_(integer *, doublereal *, integer *); extern doublereal dlamch_(char *); - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); doublereal safmin; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dlasrt_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dlasrt_( char *, integer *, doublereal *, integer *); doublereal eps; @@ -671,17 +672,17 @@ f"> */ *info = -1; i__1 = -(*info); xerbla_("DLASQ1", &i__1, (ftnlen)6); - return 0; + return; } else if (*n == 0) { - return 0; + return; } else if (*n == 1) { d__[1] = abs(d__[1]); - return 0; + return; } else if (*n == 2) { dlas2_(&d__[1], &e[1], &d__[2], &sigmn, &sigmx); d__[1] = sigmx; d__[2] = sigmn; - return 0; + return; } /* Estimate the largest singular value. */ @@ -701,7 +702,7 @@ f"> */ if (sigmx == 0.) 
{ dlasrt_("D", n, &d__[1], &iinfo); - return 0; + return; } i__1 = *n; @@ -762,7 +763,7 @@ f"> */ dlascl_("G", &c__0, &c__0, &scale, &sigmx, n, &c__1, &e[1], n, &iinfo); } - return 0; + return; /* End of DLASQ1 */ diff --git a/lapack-netlib/SRC/dlasq2.c b/lapack-netlib/SRC/dlasq2.c index b349edd99a..4958580f27 100644 --- a/lapack-netlib/SRC/dlasq2.c +++ b/lapack-netlib/SRC/dlasq2.c @@ -631,7 +631,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlasq2_(integer *n, doublereal *z__, integer *info) +/* Subroutine */ void dlasq2_(integer *n, doublereal *z__, integer *info) { /* System generated locals */ integer i__1, i__2, i__3; @@ -652,7 +652,7 @@ f"> */ integer iinfo; doublereal tempe, tempq; integer i0, i1, i4, n0, n1, ttype; - extern /* Subroutine */ int dlasq3_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dlasq3_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, integer *, logical *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, @@ -666,7 +666,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int dlasrt_(char *, integer *, doublereal *, + extern /* Subroutine */ void dlasrt_(char *, integer *, doublereal *, integer *); doublereal dn1, dn2, dee, eps, tau, tol; integer ipn4; @@ -700,9 +700,9 @@ f"> */ if (*n < 0) { *info = -1; xerbla_("DLASQ2", &c__1, (ftnlen)6); - return 0; + return; } else if (*n == 0) { - return 0; + return; } else if (*n == 1) { /* 1-by-1 case. */ @@ -711,7 +711,7 @@ f"> */ *info = -201; xerbla_("DLASQ2", &c__2, (ftnlen)6); } - return 0; + return; } else if (*n == 2) { /* 2-by-2 case. */ @@ -719,15 +719,15 @@ f"> */ if (z__[1] < 0.) 
{ *info = -201; xerbla_("DLASQ2", &c__2, (ftnlen)6); - return 0; + return; } else if (z__[2] < 0.) { *info = -202; xerbla_("DLASQ2", &c__2, (ftnlen)6); - return 0; + return; } else if (z__[3] < 0.) { *info = -203; xerbla_("DLASQ2", &c__2, (ftnlen)6); - return 0; + return; } else if (z__[3] > z__[1]) { d__ = z__[3]; z__[3] = z__[1]; @@ -748,7 +748,7 @@ f"> */ } z__[2] = z__[3]; z__[6] = z__[2] + z__[1]; - return 0; + return; } /* Check for negative data and compute sums of q's and e's. */ @@ -765,11 +765,11 @@ f"> */ if (z__[k] < 0.) { *info = -(k + 200); xerbla_("DLASQ2", &c__2, (ftnlen)6); - return 0; + return; } else if (z__[k + 1] < 0.) { *info = -(k + 201); xerbla_("DLASQ2", &c__2, (ftnlen)6); - return 0; + return; } d__ += z__[k]; e += z__[k + 1]; @@ -787,7 +787,7 @@ f"> */ if (z__[(*n << 1) - 1] < 0.) { *info = -((*n << 1) + 199); xerbla_("DLASQ2", &c__2, (ftnlen)6); - return 0; + return; } d__ += z__[(*n << 1) - 1]; /* Computing MAX */ @@ -805,7 +805,7 @@ f"> */ } dlasrt_("D", n, &z__[1], &iinfo); z__[(*n << 1) - 1] = d__; - return 0; + return; } trace = d__ + e; @@ -814,7 +814,7 @@ f"> */ if (trace == 0.) { z__[(*n << 1) - 1] = 0.; - return 0; + return; } /* Check whether the machine is IEEE conformable. */ @@ -950,7 +950,7 @@ f"> */ } if (sigma < 0.) { *info = 1; - return 0; + return; } /* Find last unreduced submatrix's top index I0, find QMAX and */ @@ -1137,7 +1137,7 @@ f"> */ z__[k * 2] = 0.; } } - return 0; + return; /* end IWHILB */ @@ -1148,7 +1148,7 @@ f"> */ } *info = 3; - return 0; + return; /* end IWHILA */ @@ -1181,7 +1181,7 @@ f"> */ i__1 = *n; z__[(*n << 1) + 4] = (doublereal) ndiv / (doublereal) (i__1 * i__1); z__[(*n << 1) + 5] = nfail * 100. 
/ (doublereal) iter; - return 0; + return; /* End of DLASQ2 */ diff --git a/lapack-netlib/SRC/dlasq3.c b/lapack-netlib/SRC/dlasq3.c index c30f1faedb..cb65b63e77 100644 --- a/lapack-netlib/SRC/dlasq3.c +++ b/lapack-netlib/SRC/dlasq3.c @@ -689,7 +689,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dlasq3_(integer *i0, integer *n0, doublereal *z__, +/* Subroutine */ void dlasq3_(integer *i0, integer *n0, doublereal *z__, integer *pp, doublereal *dmin__, doublereal *sigma, doublereal *desig, doublereal *qmax, integer *nfail, integer *iter, integer *ndiv, logical *ieee, integer *ttype, doublereal *dmin1, doublereal *dmin2, @@ -703,7 +703,7 @@ f"> */ /* Local variables */ doublereal temp, s, t; integer j4; - extern /* Subroutine */ int dlasq4_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dlasq4_(integer *, integer *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *), dlasq5_(integer *, integer *, doublereal *, @@ -745,7 +745,7 @@ f"> */ L10: if (*n0 < *i0) { - return 0; + return; } if (*n0 == *i0) { goto L20; @@ -941,7 +941,7 @@ f"> */ } *sigma = t; - return 0; + return; /* End of DLASQ3 */ diff --git a/lapack-netlib/SRC/dlasq4.c b/lapack-netlib/SRC/dlasq4.c index cab77310da..ce0322f0ba 100644 --- a/lapack-netlib/SRC/dlasq4.c +++ b/lapack-netlib/SRC/dlasq4.c @@ -660,7 +660,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlasq4_(integer *i0, integer *n0, doublereal *z__, +/* Subroutine */ void dlasq4_(integer *i0, integer *n0, doublereal *z__, integer *pp, integer *n0in, doublereal *dmin__, doublereal *dmin1, doublereal *dmin2, doublereal *dn, doublereal *dn1, doublereal *dn2, doublereal *tau, integer *ttype, doublereal *g) @@ -694,7 +694,7 @@ f"> */ if 
(*dmin__ <= 0.) { *tau = -(*dmin__); *ttype = -1; - return 0; + return; } nn = (*n0 << 2) + *pp; @@ -747,7 +747,7 @@ f"> */ gam = *dn; a2 = 0.; if (z__[nn - 5] > z__[nn - 7]) { - return 0; + return; } b2 = z__[nn - 5] / z__[nn - 7]; np = nn - 9; @@ -755,11 +755,11 @@ f"> */ np = nn - (*pp << 1); gam = *dn1; if (z__[np - 4] > z__[np - 2]) { - return 0; + return; } a2 = z__[np - 4] / z__[np - 2]; if (z__[nn - 9] > z__[nn - 11]) { - return 0; + return; } b2 = z__[nn - 9] / z__[nn - 11]; np = nn - 13; @@ -775,7 +775,7 @@ f"> */ } b1 = b2; if (z__[i4] > z__[i4 - 2]) { - return 0; + return; } b2 *= z__[i4] / z__[i4 - 2]; a2 += b2; @@ -807,7 +807,7 @@ f"> */ b2 = z__[np - 6]; gam = *dn2; if (z__[np - 8] > b2 || z__[np - 4] > b1) { - return 0; + return; } a2 = z__[np - 8] / b2 * (z__[np - 4] / b1 + 1.); @@ -823,7 +823,7 @@ f"> */ } b1 = b2; if (z__[i4] > z__[i4 - 2]) { - return 0; + return; } b2 *= z__[i4] / z__[i4 - 2]; a2 += b2; @@ -865,7 +865,7 @@ f"> */ *ttype = -7; s = *dmin1 * .333; if (z__[nn - 5] > z__[nn - 7]) { - return 0; + return; } b1 = z__[nn - 5] / z__[nn - 7]; b2 = b1; @@ -876,7 +876,7 @@ f"> */ for (i4 = (*n0 << 2) - 9 + *pp; i4 >= i__1; i4 += -4) { a2 = b1; if (z__[i4] > z__[i4 - 2]) { - return 0; + return; } b1 *= z__[i4] / z__[i4 - 2]; b2 += b1; @@ -922,7 +922,7 @@ f"> */ *ttype = -10; s = *dmin2 * .333; if (z__[nn - 5] > z__[nn - 7]) { - return 0; + return; } b1 = z__[nn - 5] / z__[nn - 7]; b2 = b1; @@ -932,7 +932,7 @@ f"> */ i__1 = (*i0 << 2) - 1 + *pp; for (i4 = (*n0 << 2) - 9 + *pp; i4 >= i__1; i4 += -4) { if (z__[i4] > z__[i4 - 2]) { - return 0; + return; } b1 *= z__[i4] / z__[i4 - 2]; b2 += b1; @@ -970,7 +970,7 @@ f"> */ } *tau = s; - return 0; + return; /* End of DLASQ4 */ diff --git a/lapack-netlib/SRC/dlasq5.c b/lapack-netlib/SRC/dlasq5.c index 38b713c90d..cbf6af7e73 100644 --- a/lapack-netlib/SRC/dlasq5.c +++ b/lapack-netlib/SRC/dlasq5.c @@ -652,7 +652,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* 
===================================================================== */ -/* Subroutine */ int dlasq5_(integer *i0, integer *n0, doublereal *z__, +/* Subroutine */ void dlasq5_(integer *i0, integer *n0, doublereal *z__, integer *pp, doublereal *tau, doublereal *sigma, doublereal *dmin__, doublereal *dmin1, doublereal *dmin2, doublereal *dn, doublereal * dnm1, doublereal *dnm2, logical *ieee, doublereal *eps) @@ -681,7 +681,7 @@ f"> */ /* Function Body */ if (*n0 - *i0 - 1 <= 0) { - return 0; + return; } dthresh = *eps * (*sigma + *tau); @@ -755,7 +755,7 @@ f"> */ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) { z__[j4 - 2] = d__ + z__[j4 - 1]; if (d__ < 0.) { - return 0; + return; } else { z__[j4] = z__[j4 + 1] * (z__[j4 - 1] / z__[j4 - 2]); d__ = z__[j4 + 1] * (d__ / z__[j4 - 2]) - *tau; @@ -771,7 +771,7 @@ f"> */ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) { z__[j4 - 3] = d__ + z__[j4]; if (d__ < 0.) { - return 0; + return; } else { z__[j4 - 1] = z__[j4 + 2] * (z__[j4] / z__[j4 - 3]); d__ = z__[j4 + 2] * (d__ / z__[j4 - 3]) - *tau; @@ -792,7 +792,7 @@ f"> */ j4p2 = j4 + (*pp << 1) - 1; z__[j4 - 2] = *dnm2 + z__[j4p2]; if (*dnm2 < 0.) { - return 0; + return; } else { z__[j4] = z__[j4p2 + 2] * (z__[j4p2] / z__[j4 - 2]); *dnm1 = z__[j4p2 + 2] * (*dnm2 / z__[j4 - 2]) - *tau; @@ -804,7 +804,7 @@ f"> */ j4p2 = j4 + (*pp << 1) - 1; z__[j4 - 2] = *dnm1 + z__[j4p2]; if (*dnm1 < 0.) { - return 0; + return; } else { z__[j4] = z__[j4p2 + 2] * (z__[j4p2] / z__[j4 - 2]); *dn = z__[j4p2 + 2] * (*dnm1 / z__[j4 - 2]) - *tau; @@ -885,7 +885,7 @@ f"> */ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) { z__[j4 - 2] = d__ + z__[j4 - 1]; if (d__ < 0.) { - return 0; + return; } else { z__[j4] = z__[j4 + 1] * (z__[j4 - 1] / z__[j4 - 2]); d__ = z__[j4 + 1] * (d__ / z__[j4 - 2]) - *tau; @@ -904,7 +904,7 @@ f"> */ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) { z__[j4 - 3] = d__ + z__[j4]; if (d__ < 0.) 
{ - return 0; + return; } else { z__[j4 - 1] = z__[j4 + 2] * (z__[j4] / z__[j4 - 3]); d__ = z__[j4 + 2] * (d__ / z__[j4 - 3]) - *tau; @@ -928,7 +928,7 @@ f"> */ j4p2 = j4 + (*pp << 1) - 1; z__[j4 - 2] = *dnm2 + z__[j4p2]; if (*dnm2 < 0.) { - return 0; + return; } else { z__[j4] = z__[j4p2 + 2] * (z__[j4p2] / z__[j4 - 2]); *dnm1 = z__[j4p2 + 2] * (*dnm2 / z__[j4 - 2]) - *tau; @@ -940,7 +940,7 @@ f"> */ j4p2 = j4 + (*pp << 1) - 1; z__[j4 - 2] = *dnm1 + z__[j4p2]; if (*dnm1 < 0.) { - return 0; + return; } else { z__[j4] = z__[j4p2 + 2] * (z__[j4p2] / z__[j4 - 2]); *dn = z__[j4p2 + 2] * (*dnm1 / z__[j4 - 2]) - *tau; @@ -952,7 +952,7 @@ f"> */ z__[j4 + 2] = *dn; z__[(*n0 << 2) - *pp] = emin; - return 0; + return; /* End of DLASQ5 */ diff --git a/lapack-netlib/SRC/dlasq6.c b/lapack-netlib/SRC/dlasq6.c index 6f26df3dad..3dc8252115 100644 --- a/lapack-netlib/SRC/dlasq6.c +++ b/lapack-netlib/SRC/dlasq6.c @@ -627,7 +627,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dlasq6_(integer *i0, integer *n0, doublereal *z__, +/* Subroutine */ void dlasq6_(integer *i0, integer *n0, doublereal *z__, integer *pp, doublereal *dmin__, doublereal *dmin1, doublereal *dmin2, doublereal *dn, doublereal *dnm1, doublereal *dnm2) { @@ -657,7 +657,7 @@ f"> */ /* Function Body */ if (*n0 - *i0 - 1 <= 0) { - return 0; + return; } safmin = dlamch_("Safe minimum"); @@ -761,7 +761,7 @@ f"> */ z__[j4 + 2] = *dn; z__[(*n0 << 2) - *pp] = emin; - return 0; + return; /* End of DLASQ6 */ diff --git a/lapack-netlib/SRC/dlasr.c b/lapack-netlib/SRC/dlasr.c index 2b24ecb32f..f4fc9cf2ba 100644 --- a/lapack-netlib/SRC/dlasr.c +++ b/lapack-netlib/SRC/dlasr.c @@ -708,7 +708,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlasr_(char *side, char 
*pivot, char *direct, integer *m, +/* Subroutine */ void dlasr_(char *side, char *pivot, char *direct, integer *m, integer *n, doublereal *c__, doublereal *s, doublereal *a, integer * lda) { @@ -761,13 +761,13 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } if (info != 0) { xerbla_("DLASR ", &info, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } if (lsame_(side, "L")) { @@ -1003,7 +1003,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } - return 0; + return; /* End of DLASR */ diff --git a/lapack-netlib/SRC/dlasrt.c b/lapack-netlib/SRC/dlasrt.c index 784820de82..2567377092 100644 --- a/lapack-netlib/SRC/dlasrt.c +++ b/lapack-netlib/SRC/dlasrt.c @@ -597,7 +597,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dlasrt_(char *id, integer *n, doublereal *d__, integer * +/* Subroutine */ void dlasrt_(char *id, integer *n, doublereal *d__, integer * info) { /* System generated locals */ @@ -644,13 +644,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLASRT", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 1) { - return 0; + return; } stkpnt = 1; @@ -817,7 +817,7 @@ f"> */ if (stkpnt > 0) { goto L10; } - return 0; + return; /* End of DLASRT */ diff --git a/lapack-netlib/SRC/dlassq.c b/lapack-netlib/SRC/dlassq.c index a88d377cf1..53f30d4ecc 100644 --- a/lapack-netlib/SRC/dlassq.c +++ b/lapack-netlib/SRC/dlassq.c @@ -612,7 +612,7 @@ f"> */ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlassq_(integer *n, doublereal *x, integer *incx, +/* Subroutine */ void dlassq_(integer *n, doublereal *x, integer *incx, doublereal *scale, doublereal *sumsq) { /* System generated locals */ @@ -658,7 +658,7 @@ f"> */ /* L10: 
*/ } } - return 0; + return; /* End of DLASSQ */ diff --git a/lapack-netlib/SRC/dlasv2.c b/lapack-netlib/SRC/dlasv2.c index 92fb8a02f7..570f2a1114 100644 --- a/lapack-netlib/SRC/dlasv2.c +++ b/lapack-netlib/SRC/dlasv2.c @@ -654,7 +654,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlasv2_(doublereal *f, doublereal *g, doublereal *h__, +/* Subroutine */ void dlasv2_(doublereal *f, doublereal *g, doublereal *h__, doublereal *ssmin, doublereal *ssmax, doublereal *snr, doublereal * csr, doublereal *snl, doublereal *csl) { @@ -827,7 +827,7 @@ f"> */ *ssmax = d_sign(ssmax, &tsign); d__1 = tsign * d_sign(&c_b4, f) * d_sign(&c_b4, h__); *ssmin = d_sign(ssmin, &d__1); - return 0; + return; /* End of DLASV2 */ diff --git a/lapack-netlib/SRC/dlaswlq.c b/lapack-netlib/SRC/dlaswlq.c index a75055a008..83c2a08708 100644 --- a/lapack-netlib/SRC/dlaswlq.c +++ b/lapack-netlib/SRC/dlaswlq.c @@ -671,7 +671,7 @@ static integer c__0 = 0; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlaswlq_(integer *m, integer *n, integer *mb, integer * +/* Subroutine */ void dlaswlq_(integer *m, integer *n, integer *mb, integer * nb, doublereal *a, integer *lda, doublereal *t, integer *ldt, doublereal *work, integer *lwork, integer *info) { @@ -680,7 +680,8 @@ static integer c__0 = 0; /* Local variables */ integer i__, ii, kk; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dgelqt_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dgelqt_( integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *), dtplqt_( integer *, integer *, integer *, integer *, doublereal *, integer @@ -737,15 +738,15 @@ static integer c__0 = 0; if (*info != 0) { i__1 = -(*info); xerbla_("DLASWLQ", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - 
return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* The LQ Decomposition */ @@ -753,7 +754,7 @@ static integer c__0 = 0; if (*m >= *n || *nb <= *m || *nb >= *n) { dgelqt_(m, n, mb, &a[a_offset], lda, &t[t_offset], ldt, &work[1], info); - return 0; + return; } kk = (*n - *m) % (*nb - *m); @@ -785,7 +786,7 @@ static integer c__0 = 0; } work[1] = (doublereal) (*m * *mb); - return 0; + return; /* End of DLASWLQ */ diff --git a/lapack-netlib/SRC/dlaswp.c b/lapack-netlib/SRC/dlaswp.c index ab2eb43ac9..06ce8e244c 100644 --- a/lapack-netlib/SRC/dlaswp.c +++ b/lapack-netlib/SRC/dlaswp.c @@ -624,7 +624,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlaswp_(integer *n, doublereal *a, integer *lda, integer +/* Subroutine */ void dlaswp_(integer *n, doublereal *a, integer *lda, integer *k1, integer *k2, integer *ipiv, integer *incx) { /* System generated locals */ @@ -665,7 +665,7 @@ f"> */ i2 = *k1; inc = -1; } else { - return 0; + return; } n32 = *n / 32 << 5; @@ -714,7 +714,7 @@ f"> */ } } - return 0; + return; /* End of DLASWP */ diff --git a/lapack-netlib/SRC/dlasy2.c b/lapack-netlib/SRC/dlasy2.c index b3e9062ee8..8d48d4d4f6 100644 --- a/lapack-netlib/SRC/dlasy2.c +++ b/lapack-netlib/SRC/dlasy2.c @@ -689,7 +689,7 @@ f"> */ /* > \ingroup doubleSYauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlasy2_(logical *ltranl, logical *ltranr, integer *isgn, +/* Subroutine */ void dlasy2_(logical *ltranl, logical *ltranr, integer *isgn, integer *n1, integer *n2, doublereal *tl, integer *ldtl, doublereal * tr, integer *ldtr, doublereal *b, integer *ldb, doublereal *scale, doublereal *x, integer *ldx, doublereal *xnorm, integer *info) @@ -715,7 +715,7 @@ f"> */ doublereal xmax; integer ipsv, jpsv, i__, j, k; logical bswap; - extern /* Subroutine */ int dcopy_(integer *, doublereal 
*, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); logical xswap; @@ -758,7 +758,7 @@ f"> */ /* Quick return if possible */ if (*n1 == 0 || *n2 == 0) { - return 0; + return; } /* Set constants to control overflow */ @@ -794,7 +794,7 @@ f"> */ x[x_dim1 + 1] = b[b_dim1 + 1] * *scale / tau1; *xnorm = (d__1 = x[x_dim1 + 1], abs(d__1)); - return 0; + return; /* 1 by 2: */ /* TL11*[X11 X12] + ISGN*[X11 X12]*op[TR11 TR12] = [B11 B12] */ @@ -904,7 +904,7 @@ f"> */ , abs(d__2)); *xnorm = f2cmax(d__3,d__4); } - return 0; + return; /* 2 by 2: */ /* op[TL11 TL12]*[X11 X12] +ISGN* [X11 X12]*op[TR11 TR12] = [B11 B12] */ @@ -1047,7 +1047,7 @@ f"> */ /* Computing MAX */ d__1 = abs(tmp[0]) + abs(tmp[2]), d__2 = abs(tmp[1]) + abs(tmp[3]); *xnorm = f2cmax(d__1,d__2); - return 0; + return; /* End of DLASY2 */ diff --git a/lapack-netlib/SRC/dlasyf.c b/lapack-netlib/SRC/dlasyf.c index 11c66c40d5..b63353a0cf 100644 --- a/lapack-netlib/SRC/dlasyf.c +++ b/lapack-netlib/SRC/dlasyf.c @@ -692,7 +692,7 @@ f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dlasyf_(char *uplo, integer *n, integer *nb, integer *kb, +/* Subroutine */ void dlasyf_(char *uplo, integer *n, integer *nb, integer *kb, doublereal *a, integer *lda, integer *ipiv, doublereal *w, integer * ldw, integer *info) { @@ -703,12 +703,12 @@ f"> */ /* Local variables */ integer imax, jmax, j, k; doublereal t, alpha; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dgemm_(char *, char *, integer *, integer *, integer * , doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, 
integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer @@ -1434,7 +1434,7 @@ f"> */ *kb = k - 1; } - return 0; + return; /* End of DLASYF */ diff --git a/lapack-netlib/SRC/dlasyf_aa.c b/lapack-netlib/SRC/dlasyf_aa.c index 5d8aea1f0b..2f4d34f3df 100644 --- a/lapack-netlib/SRC/dlasyf_aa.c +++ b/lapack-netlib/SRC/dlasyf_aa.c @@ -659,7 +659,7 @@ aa.f"> */ /* > \ingroup doubleSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int dlasyf_aa_(char *uplo, integer *j1, integer *m, integer +/* Subroutine */ void dlasyf_aa_(char *uplo, integer *j1, integer *m, integer *nb, doublereal *a, integer *lda, integer *ipiv, doublereal *h__, integer *ldh, doublereal *work) { @@ -669,10 +669,10 @@ aa.f"> */ /* Local variables */ integer j, k; doublereal alpha; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer @@ -681,7 +681,7 @@ aa.f"> */ integer *); integer i1, k1, i2, mj; extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *); doublereal piv; @@ -1045,7 +1045,7 @@ aa.f"> */ L40: ; } - return 0; + return; /* End of DLASYF_AA */ diff --git 
a/lapack-netlib/SRC/dlasyf_rk.c b/lapack-netlib/SRC/dlasyf_rk.c index 8a8682f474..c38bc94314 100644 --- a/lapack-netlib/SRC/dlasyf_rk.c +++ b/lapack-netlib/SRC/dlasyf_rk.c @@ -777,7 +777,7 @@ rk.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dlasyf_rk_(char *uplo, integer *n, integer *nb, integer +/* Subroutine */ void dlasyf_rk_(char *uplo, integer *n, integer *nb, integer *kb, doublereal *a, integer *lda, doublereal *e, integer *ipiv, doublereal *w, integer *ldw, integer *info) { @@ -789,17 +789,17 @@ rk.f"> */ logical done; integer imax, jmax, j, k, p; doublereal t, alpha; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dgemm_(char *, char *, integer *, integer *, integer * , doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); doublereal dtemp, sfmin; integer itemp; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); integer kstep; @@ -1585,7 +1585,7 @@ rk.f"> */ } - return 0; + return; /* End of DLASYF_RK */ diff --git a/lapack-netlib/SRC/dlasyf_rook.c b/lapack-netlib/SRC/dlasyf_rook.c index 65e77e3714..172e0a7b45 100644 --- a/lapack-netlib/SRC/dlasyf_rook.c +++ b/lapack-netlib/SRC/dlasyf_rook.c @@ -699,7 +699,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int 
dlasyf_rook_(char *uplo, integer *n, integer *nb, +/* Subroutine */ void dlasyf_rook_(char *uplo, integer *n, integer *nb, integer *kb, doublereal *a, integer *lda, integer *ipiv, doublereal * w, integer *ldw, integer *info) { @@ -711,17 +711,17 @@ rook.f"> */ logical done; integer imax, jmax, j, k, p; doublereal t, alpha; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dgemm_(char *, char *, integer *, integer *, integer * , doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); doublereal dtemp, sfmin; integer itemp; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); integer kstep; @@ -1520,7 +1520,7 @@ rook.f"> */ *kb = k - 1; } - return 0; + return; /* End of DLASYF_ROOK */ diff --git a/lapack-netlib/SRC/dlat2s.c b/lapack-netlib/SRC/dlat2s.c index 6337247544..27fe21280f 100644 --- a/lapack-netlib/SRC/dlat2s.c +++ b/lapack-netlib/SRC/dlat2s.c @@ -621,7 +621,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlat2s_(char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ void dlat2s_(char *uplo, integer *n, doublereal *a, integer * lda, real *sa, integer *ldsa, integer *info) { /* System generated locals */ @@ -688,7 +688,7 @@ f"> */ } L50: - return 0; + return; /* End of DLAT2S */ diff --git a/lapack-netlib/SRC/dlat2s.f 
b/lapack-netlib/SRC/dlat2s.f index 3d00fe0a37..c926e99307 100644 --- a/lapack-netlib/SRC/dlat2s.f +++ b/lapack-netlib/SRC/dlat2s.f @@ -134,6 +134,9 @@ SUBROUTINE DLAT2S( UPLO, N, A, LDA, SA, LDSA, INFO ) LOGICAL LSAME EXTERNAL SLAMCH, LSAME * .. +* .. Intrinsic Functions .. + INTRINSIC REAL +* .. * .. Executable Statements .. * RMAX = SLAMCH( 'O' ) @@ -146,7 +149,7 @@ SUBROUTINE DLAT2S( UPLO, N, A, LDA, SA, LDSA, INFO ) INFO = 1 GO TO 50 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = REAL( A( I, J ) ) 10 CONTINUE 20 CONTINUE ELSE @@ -157,7 +160,7 @@ SUBROUTINE DLAT2S( UPLO, N, A, LDA, SA, LDSA, INFO ) INFO = 1 GO TO 50 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = REAL( A( I, J ) ) 30 CONTINUE 40 CONTINUE END IF diff --git a/lapack-netlib/SRC/dlatbs.c b/lapack-netlib/SRC/dlatbs.c index a8ebed6dfe..98e64489fb 100644 --- a/lapack-netlib/SRC/dlatbs.c +++ b/lapack-netlib/SRC/dlatbs.c @@ -755,7 +755,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlatbs_(char *uplo, char *trans, char *diag, char * +/* Subroutine */ void dlatbs_(char *uplo, char *trans, char *diag, char * normin, integer *n, integer *kd, doublereal *ab, integer *ldab, doublereal *x, doublereal *scale, doublereal *cnorm, integer *info) { @@ -771,14 +771,14 @@ f"> */ integer imax; doublereal tmax, tjjs, xmax, grow, sumj; integer i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); integer maind; extern logical lsame_(char *, char *); doublereal tscal, uscal; extern doublereal dasum_(integer *, doublereal *, integer *); integer jlast; - extern /* Subroutine */ int dtbsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtbsv_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *), daxpy_(integer *, doublereal *, doublereal *, integer *, 
doublereal *, integer *); logical upper; @@ -838,13 +838,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLATBS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine machine dependent parameters to control overflow. */ @@ -1414,7 +1414,7 @@ f"> */ dscal_(n, &d__1, &cnorm[1], &c__1); } - return 0; + return; /* End of DLATBS */ diff --git a/lapack-netlib/SRC/dlatbs.f b/lapack-netlib/SRC/dlatbs.f index 4b71d53994..6a812743b0 100644 --- a/lapack-netlib/SRC/dlatbs.f +++ b/lapack-netlib/SRC/dlatbs.f @@ -310,6 +310,7 @@ SUBROUTINE DLATBS( UPLO, TRANS, DIAG, NORMIN, N, KD, AB, LDAB, X, * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * @@ -317,7 +318,6 @@ SUBROUTINE DLATBS( UPLO, TRANS, DIAG, NORMIN, N, KD, AB, LDAB, X, * SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * diff --git a/lapack-netlib/SRC/dlatdf.c b/lapack-netlib/SRC/dlatdf.c index f1c5901fd5..ef0d5c9ebb 100644 --- a/lapack-netlib/SRC/dlatdf.c +++ b/lapack-netlib/SRC/dlatdf.c @@ -687,7 +687,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlatdf_(integer *ijob, integer *n, doublereal *z__, +/* Subroutine */ void dlatdf_(integer *ijob, integer *n, doublereal *z__, integer *ldz, doublereal *rhs, doublereal *rdsum, doublereal *rdscal, integer *ipiv, integer *jpiv) { @@ -701,25 +701,26 @@ f"> */ integer info; doublereal temp, work[32]; integer i__, j, k; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern doublereal dasum_(integer *, doublereal *, integer *); doublereal pmone; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, 
doublereal *, integer *), daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); doublereal sminu; integer iwork[8]; doublereal splus; - extern /* Subroutine */ int dgesc2_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dgesc2_(integer *, doublereal *, integer *, doublereal *, integer *, integer *, doublereal *); doublereal bm, bp; - extern /* Subroutine */ int dgecon_(char *, integer *, doublereal *, + extern /* Subroutine */ void dgecon_(char *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *); doublereal xm[8], xp[8]; - extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *, - doublereal *, doublereal *), dlaswp_(integer *, doublereal *, + extern /* Subroutine */ void dlassq_(integer *, doublereal *, integer *, + doublereal *, doublereal *); + extern int dlaswp_(integer *, doublereal *, integer *, integer *, integer *, integer *, integer *); @@ -861,7 +862,7 @@ f"> */ } - return 0; + return; /* End of DLATDF */ diff --git a/lapack-netlib/SRC/dlatps.c b/lapack-netlib/SRC/dlatps.c index 34bf5e8234..95446856ac 100644 --- a/lapack-netlib/SRC/dlatps.c +++ b/lapack-netlib/SRC/dlatps.c @@ -742,7 +742,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlatps_(char *uplo, char *trans, char *diag, char * +/* Subroutine */ void dlatps_(char *uplo, char *trans, char *diag, char * normin, integer *n, doublereal *ap, doublereal *x, doublereal *scale, doublereal *cnorm, integer *info) { @@ -758,16 +758,16 @@ f"> */ integer imax; doublereal tmax, tjjs, xmax, grow, sumj; integer i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); doublereal tscal, uscal; extern doublereal dasum_(integer *, doublereal *, integer *); integer 
jlast; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical upper; - extern /* Subroutine */ int dtpsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtpsv_(char *, char *, char *, integer *, doublereal *, doublereal *, integer *); extern doublereal dlamch_(char *); integer ip; @@ -820,13 +820,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLATPS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine machine dependent parameters to control overflow. */ @@ -1383,7 +1383,7 @@ f"> */ dscal_(n, &d__1, &cnorm[1], &c__1); } - return 0; + return; /* End of DLATPS */ diff --git a/lapack-netlib/SRC/dlatrd.c b/lapack-netlib/SRC/dlatrd.c index 9af43ac543..d1bf854d2c 100644 --- a/lapack-netlib/SRC/dlatrd.c +++ b/lapack-netlib/SRC/dlatrd.c @@ -715,7 +715,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlatrd_(char *uplo, integer *n, integer *nb, doublereal * +/* Subroutine */ void dlatrd_(char *uplo, integer *n, integer *nb, doublereal * a, integer *lda, doublereal *e, doublereal *tau, doublereal *w, integer *ldw) { @@ -727,10 +727,10 @@ f"> */ integer *); integer i__; doublereal alpha; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), @@ -763,7 +763,7 @@ f"> */ /* Function Body 
*/ if (*n <= 0) { - return 0; + return; } if (lsame_(uplo, "U")) { @@ -909,7 +909,7 @@ f"> */ } } - return 0; + return; /* End of DLATRD */ diff --git a/lapack-netlib/SRC/dlatrs.c b/lapack-netlib/SRC/dlatrs.c index c5d0ea9fcc..13adcdfedf 100644 --- a/lapack-netlib/SRC/dlatrs.c +++ b/lapack-netlib/SRC/dlatrs.c @@ -752,7 +752,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlatrs_(char *uplo, char *trans, char *diag, char * +/* Subroutine */ void dlatrs_(char *uplo, char *trans, char *diag, char * normin, integer *n, doublereal *a, integer *lda, doublereal *x, doublereal *scale, doublereal *cnorm, integer *info) { @@ -768,16 +768,16 @@ f"> */ integer imax; doublereal tmax, tjjs, xmax, grow, sumj; integer i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); doublereal tscal, uscal; extern doublereal dasum_(integer *, doublereal *, integer *); integer jlast; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical upper; - extern /* Subroutine */ int dtrsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtrsv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *); extern doublereal dlamch_(char *); doublereal xj; @@ -833,13 +833,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLATRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine machine dependent parameters to control overflow. 
*/ @@ -1377,7 +1377,7 @@ f"> */ dscal_(n, &d__1, &cnorm[1], &c__1); } - return 0; + return; /* End of DLATRS */ diff --git a/lapack-netlib/SRC/dlatrs.f b/lapack-netlib/SRC/dlatrs.f index 43f92911d7..be156bee20 100644 --- a/lapack-netlib/SRC/dlatrs.f +++ b/lapack-netlib/SRC/dlatrs.f @@ -264,8 +264,8 @@ SUBROUTINE DLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X, SCALE, * .. External Functions .. LOGICAL LSAME INTEGER IDAMAX - DOUBLE PRECISION DASUM, DDOT, DLAMCH - EXTERNAL LSAME, IDAMAX, DASUM, DDOT, DLAMCH + DOUBLE PRECISION DASUM, DDOT, DLAMCH, DLANGE + EXTERNAL LSAME, IDAMAX, DASUM, DDOT, DLAMCH, DLANGE * .. * .. External Subroutines .. EXTERNAL DAXPY, DSCAL, DTRSV, XERBLA @@ -304,6 +304,7 @@ SUBROUTINE DLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X, SCALE, * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * @@ -311,7 +312,6 @@ SUBROUTINE DLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X, SCALE, * SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * @@ -343,8 +343,67 @@ SUBROUTINE DLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X, SCALE, IF( TMAX.LE.BIGNUM ) THEN TSCAL = ONE ELSE - TSCAL = ONE / ( SMLNUM*TMAX ) - CALL DSCAL( N, TSCAL, CNORM, 1 ) +* +* Avoid NaN generation if entries in CNORM exceed the +* overflow threshold +* + IF( TMAX.LE.DLAMCH('Overflow') ) THEN +* Case 1: All entries in CNORM are valid floating-point numbers + TSCAL = ONE / ( SMLNUM*TMAX ) + CALL DSCAL( N, TSCAL, CNORM, 1 ) + ELSE +* Case 2: At least one column norm of A cannot be represented +* as floating-point number. Find the offdiagonal entry A( I, J ) +* with the largest absolute value. If this entry is not +/- Infinity, +* use this value as TSCAL. + TMAX = ZERO + IF( UPPER ) THEN +* +* A is upper triangular. +* + DO J = 2, N + TMAX = MAX( DLANGE( 'M', J-1, 1, A( 1, J ), 1, SUMJ ), + $ TMAX ) + END DO + ELSE +* +* A is lower triangular. 
+* + DO J = 1, N - 1 + TMAX = MAX( DLANGE( 'M', N-J, 1, A( J+1, J ), 1, + $ SUMJ ), TMAX ) + END DO + END IF +* + IF( TMAX.LE.DLAMCH('Overflow') ) THEN + TSCAL = ONE / ( SMLNUM*TMAX ) + DO J = 1, N + IF( CNORM( J ).LE.DLAMCH('Overflow') ) THEN + CNORM( J ) = CNORM( J )*TSCAL + ELSE +* Recompute the 1-norm without introducing Infinity +* in the summation + CNORM( J ) = ZERO + IF( UPPER ) THEN + DO I = 1, J - 1 + CNORM( J ) = CNORM( J ) + + $ TSCAL * ABS( A( I, J ) ) + END DO + ELSE + DO I = J + 1, N + CNORM( J ) = CNORM( J ) + + $ TSCAL * ABS( A( I, J ) ) + END DO + END IF + END IF + END DO + ELSE +* At least one entry of A is not a valid floating-point entry. +* Rely on TRSV to propagate Inf and NaN. + CALL DTRSV( UPLO, TRANS, DIAG, N, A, LDA, X, 1 ) + RETURN + END IF + END IF END IF * * Compute a bound on the computed solution vector to see if the diff --git a/lapack-netlib/SRC/dlatrs3.c b/lapack-netlib/SRC/dlatrs3.c new file mode 100644 index 0000000000..4de1f53c37 --- /dev/null +++ b/lapack-netlib/SRC/dlatrs3.c @@ -0,0 +1,1265 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline 
_Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + 
+#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? 
*(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = 
"\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], 
x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b DLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. 
+ */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, */ +/* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) */ + +/* CHARACTER DIAG, NORMIN, TRANS, UPLO */ +/* INTEGER INFO, LDA, LWORK, LDX, N, NRHS */ +/* DOUBLE PRECISION A( LDA, * ), CNORM( * ), SCALE( * ), */ +/* WORK( * ), X( LDX, * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > DLATRS3 solves one of the triangular systems */ +/* > */ +/* > A * X = B * diag(scale) or A**T * X = B * diag(scale) */ +/* > */ +/* > with scaling to prevent overflow. Here A is an upper or lower */ +/* > triangular matrix, A**T denotes the transpose of A. X and B are */ +/* > n by nrhs matrices and scale is an nrhs element vector of scaling */ +/* > factors. A scaling factor scale(j) is usually less than or equal */ +/* > to 1, chosen such that X(:,j) is less than the overflow threshold. */ +/* > If the matrix A is singular (A(j,j) = 0 for some j), then */ +/* > a non-trivial solution to A*X = 0 is returned. If the system is */ +/* > so badly scaled that the solution cannot be represented as */ +/* > (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. */ +/* > */ +/* > This is a BLAS-3 version of LATRS for solving several right */ +/* > hand sides simultaneously. */ +/* > */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] UPLO */ +/* > \verbatim */ +/* > UPLO is CHARACTER*1 */ +/* > Specifies whether the matrix A is upper or lower triangular. */ +/* > = 'U': Upper triangular */ +/* > = 'L': Lower triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > Specifies the operation applied to A. 
*/ +/* > = 'N': Solve A * x = s*b (No transpose) */ +/* > = 'T': Solve A**T* x = s*b (Transpose) */ +/* > = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] DIAG */ +/* > \verbatim */ +/* > DIAG is CHARACTER*1 */ +/* > Specifies whether or not the matrix A is unit triangular. */ +/* > = 'N': Non-unit triangular */ +/* > = 'U': Unit triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NORMIN */ +/* > \verbatim */ +/* > NORMIN is CHARACTER*1 */ +/* > Specifies whether CNORM has been set or not. */ +/* > = 'Y': CNORM contains the column norms on entry */ +/* > = 'N': CNORM is not set on entry. On exit, the norms will */ +/* > be computed and stored in CNORM. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of X. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is DOUBLE PRECISION array, dimension (LDA,N) */ +/* > The triangular matrix A. If UPLO = 'U', the leading n by n */ +/* > upper triangular part of the array A contains the upper */ +/* > triangular matrix, and the strictly lower triangular part of */ +/* > A is not referenced. If UPLO = 'L', the leading n by n lower */ +/* > triangular part of the array A contains the lower triangular */ +/* > matrix, and the strictly upper triangular part of A is not */ +/* > referenced. If DIAG = 'U', the diagonal elements of A are */ +/* > also not referenced and are assumed to be 1. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax (1,N). 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is DOUBLE PRECISION array, dimension (LDX,NRHS) */ +/* > On entry, the right hand side B of the triangular system. */ +/* > On exit, X is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDX */ +/* > \verbatim */ +/* > LDX is INTEGER */ +/* > The leading dimension of the array X. LDX >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is DOUBLE PRECISION array, dimension (NRHS) */ +/* > The scaling factor s(k) is for the triangular system */ +/* > A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). */ +/* > If SCALE = 0, the matrix A is singular or badly scaled. */ +/* > If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) */ +/* > that is an exact or approximate solution to A*x(:,k) = 0 */ +/* > is returned. If the system so badly scaled that solution */ +/* > cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 */ +/* > is returned. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] CNORM */ +/* > \verbatim */ +/* > CNORM is DOUBLE PRECISION array, dimension (N) */ +/* > */ +/* > If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */ +/* > contains the norm of the off-diagonal part of the j-th column */ +/* > of A. If TRANS = 'N', CNORM(j) must be greater than or equal */ +/* > to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */ +/* > must be greater than or equal to the 1-norm. */ +/* > */ +/* > If NORMIN = 'N', CNORM is an output argument and CNORM(j) */ +/* > returns the 1-norm of the offdiagonal part of the j-th column */ +/* > of A. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is DOUBLE PRECISION array, dimension (LWORK). */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal size of */ +/* > WORK. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > LWORK is INTEGER */ +/* > LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where */ +/* > NBA = (N + NB - 1)/NB and NB is the optimal block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -k, the k-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleOTHERauxiliary */ +/* > \par Further Details: */ +/* ===================== */ +/* \verbatim */ +/* The algorithm follows the structure of a block triangular solve. */ +/* The diagonal block is solved with a call to the robust the triangular */ +/* solver LATRS for every right-hand side RHS = 1, ..., NRHS */ +/* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), */ +/* where op( A ) = A or op( A ) = A**T. */ +/* The linear block updates operate on block columns of X, */ +/* B( I, K ) - op(A( I, J )) * X( J, K ) */ +/* and use GEMM. To avoid overflow in the linear block update, the worst case */ +/* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed */ +/* such that */ +/* || s * B( I, RHS )||_oo */ +/* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold */ + +/* Once all columns of a block column have been rescaled (BLAS-1), the linear */ +/* update is executed with GEMM without overflow. */ + +/* To limit rescaling, local scale factors track the scaling of column segments. 
*/ +/* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA */ +/* per right-hand side column RHS = 1, ..., NRHS. The global scale factor */ +/* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) */ +/* I = 1, ..., NBA. */ +/* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) */ +/* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The */ +/* linear update of potentially inconsistently scaled vector segments */ +/* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) */ +/* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, */ +/* if necessary, rescales the blocks prior to calling GEMM. */ + +/* \endverbatim */ +/* ===================================================================== */ +/* References: */ +/* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). */ +/* Parallel robust solution of triangular linear systems. Concurrency */ +/* and Computation: Practice and Experience, 31(19), e5064. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
*/ + +/* ===================================================================== */ +/* Subroutine */ void dlatrs3_(char *uplo, char *trans, char *diag, char * + normin, integer *n, integer *nrhs, doublereal *a, integer *lda, + doublereal *x, integer *ldx, doublereal *scale, doublereal *cnorm, + doublereal *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, + i__6, i__7, i__8; + doublereal d__1, d__2; + + /* Local variables */ + integer iinc, jinc; + doublereal scal, anrm, bnrm; + integer awrk; + doublereal tmax, xnrm[32]; + integer i__, j, k; + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, + integer *); + doublereal w[64]; + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, + integer *, doublereal *, doublereal *, integer *, doublereal *, + integer *, doublereal *, doublereal *, integer *); + extern logical lsame_(char *, char *); + doublereal rscal; + integer lanrm, ilast, jlast, i1; + logical upper; + integer i2, j1, j2, k1, k2, nb, ii, kk; + extern doublereal dlamch_(char *), dlange_(char *, integer *, + integer *, doublereal *, integer *, doublereal *); + integer lscale; + doublereal scaloc, scamin; + extern doublereal dlarmm_(doublereal *, doublereal *, doublereal *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen ); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + doublereal bignum; + extern /* Subroutine */ void dlatrs_(char *, char *, char *, char *, + integer *, doublereal *, integer *, doublereal *, doublereal *, + doublereal *, integer *); + integer ifirst; + logical notran; + integer jfirst; + doublereal smlnum; + logical nounit, lquery; + integer nba, lds, nbx, rhs; + + + +/* ===================================================================== */ + + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 
1; + a -= a_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + --scale; + --cnorm; + --work; + + /* Function Body */ + *info = 0; + upper = lsame_(uplo, "U"); + notran = lsame_(trans, "N"); + nounit = lsame_(diag, "N"); + lquery = *lwork == -1; + +/* Partition A and X into blocks */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "DLATRS", "", n, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + nb = f2cmin(64,nb); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*nrhs + 31) / 32; + nbx = f2cmax(i__1,i__2); + +/* Compute the workspace */ + +/* The workspace comprises two parts. */ +/* The first part stores the local scale factors. Each simultaneously */ +/* computed right-hand side requires one local scale factor per block */ +/* row. WORK( I+KK*LDS ) is the scale factor of the vector */ +/* segment associated with the I-th block row and the KK-th vector */ +/* in the block column. */ +/* Computing MAX */ + i__1 = nba, i__2 = f2cmin(*nrhs,32); + lscale = nba * f2cmax(i__1,i__2); + lds = nba; +/* The second part stores upper bounds of the triangular A. There are */ +/* a total of NBA x NBA blocks, of which only the upper triangular */ +/* part or the lower triangular part is referenced. The upper bound of */ +/* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). */ + lanrm = nba * nba; + awrk = lscale; + work[1] = (doublereal) (lscale + lanrm); + +/* Test the input parameters */ + + if (! upper && ! lsame_(uplo, "L")) { + *info = -1; + } else if (! notran && ! lsame_(trans, "T") && ! + lsame_(trans, "C")) { + *info = -2; + } else if (! nounit && ! lsame_(diag, "U")) { + *info = -3; + } else if (! lsame_(normin, "Y") && ! 
lsame_(normin, + "N")) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*nrhs < 0) { + *info = -6; + } else if (*lda < f2cmax(1,*n)) { + *info = -8; + } else if (*ldx < f2cmax(1,*n)) { + *info = -10; + } else if (! lquery && (doublereal) (*lwork) < work[1]) { + *info = -14; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("DLATRS3", &i__1, 7); + return; + } else if (lquery) { + return; + } + +/* Initialize scaling factors */ + + i__1 = *nrhs; + for (kk = 1; kk <= i__1; ++kk) { + scale[kk] = 1.; + } + +/* Quick return if possible */ + + if (f2cmin(*n,*nrhs) == 0) { + return; + } + +/* Determine machine dependent constant to control overflow. */ + + bignum = dlamch_("Overflow"); + smlnum = dlamch_("Safe Minimum"); + +/* Use unblocked code for small problems */ + + if (*nrhs < 2) { + dlatrs_(uplo, trans, diag, normin, n, &a[a_offset], lda, &x[x_dim1 + + 1], &scale[1], &cnorm[1], info); + i__1 = *nrhs; + for (k = 2; k <= i__1; ++k) { + dlatrs_(uplo, trans, diag, "Y", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return; + } + +/* Compute norms of blocks of A excluding diagonal blocks and find */ +/* the block with the largest norm TMAX. */ + + tmax = 0.; + i__1 = nba; + for (j = 1; j <= i__1; ++j) { + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + if (upper) { + ifirst = 1; + ilast = j - 1; + } else { + ifirst = j + 1; + ilast = nba; + } + i__2 = ilast; + for (i__ = ifirst; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*n) + 1; + +/* Compute upper bound of A( I1:I2-1, J1:J2-1 ). 
*/ + + if (notran) { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = dlange_("I", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + i__ + (j - 1) * nba] = anrm; + } else { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = dlange_("1", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + j + (i__ - 1) * nba] = anrm; + } + tmax = f2cmax(tmax,anrm); + } + } + + if (! (tmax <= dlamch_("Overflow"))) { + +/* Some matrix entries have huge absolute value. At least one upper */ +/* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point */ +/* number, either due to overflow in LANGE or due to Inf in A. */ +/* Fall back to LATRS. Set normin = 'N' for every right-hand side to */ +/* force computation of TSCAL in LATRS to avoid the likely overflow */ +/* in the computation of the column norms CNORM. */ + + i__1 = *nrhs; + for (k = 1; k <= i__1; ++k) { + dlatrs_(uplo, trans, diag, "N", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return; + } + +/* Every right-hand side requires workspace to store NBA local scale */ +/* factors. To save workspace, X is computed successively in block columns */ +/* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient */ +/* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. */ + i__1 = nbx; + for (k = 1; k <= i__1; ++k) { +/* Loop over block columns (index = K) of X and, for column-wise scalings, */ +/* over individual columns (index = KK). 
*/ +/* K1: column index of the first column in X( J, K ) */ +/* K2: column index of the first column in X( J, K+1 ) */ +/* so the K2 - K1 is the column count of the block X( J, K ) */ + k1 = (k - 1 << 5) + 1; +/* Computing MIN */ + i__2 = k << 5; + k2 = f2cmin(i__2,*nrhs) + 1; + +/* Initialize local scaling factors of current block column X( J, K ) */ + + i__2 = k2 - k1; + for (kk = 1; kk <= i__2; ++kk) { + i__3 = nba; + for (i__ = 1; i__ <= i__3; ++i__) { + work[i__ + kk * lds] = 1.; + } + } + + if (notran) { + +/* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = nba; + jlast = 1; + jinc = -1; + } else { + jfirst = 1; + jlast = nba; + jinc = 1; + } + } else { + +/* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = 1; + jlast = nba; + jinc = 1; + } else { + jfirst = nba; + jlast = 1; + jinc = -1; + } + } + + i__2 = jlast; + i__3 = jinc; + for (j = jfirst; i__3 < 0 ? j >= i__2 : j <= i__2; j += i__3) { +/* J1: row index of the first row in A( J, J ) */ +/* J2: row index of the first row in A( J+1, J+1 ) */ +/* so that J2 - J1 is the row count of the block A( J, J ) */ + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) */ +/* for all right-hand sides in the current block column, */ +/* one RHS at a time. */ + + i__4 = k2 - k1; + for (kk = 1; kk <= i__4; ++kk) { + rhs = k1 + kk - 1; + if (kk == 1) { + i__5 = j2 - j1; + dlatrs_(uplo, trans, diag, "N", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } else { + i__5 = j2 - j1; + dlatrs_(uplo, trans, diag, "Y", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } +/* Find largest absolute value entry in the vector segment */ +/* X( J1:J2-1, RHS ) as an upper bound for the worst case */ +/* growth in the linear updates. 
*/ + i__5 = j2 - j1; + xnrm[kk - 1] = dlange_("I", &i__5, &c__1, &x[j1 + rhs * + x_dim1], ldx, w); + + if (scaloc == 0.) { +/* LATRS found that A is singular through A(j,j) = 0. */ +/* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 */ +/* and compute A*x = 0 (or A**T*x = 0). Note that */ +/* X(J1:J2-1, KK) is set by LATRS. */ + scale[rhs] = 0.; + i__5 = j1 - 1; + for (ii = 1; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.; + } + i__5 = *n; + for (ii = j2; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.; + } + scaloc = 1.; + } else if (scaloc * work[j + kk * lds] == 0.) { +/* LATRS computed a valid scale factor, but combined with */ +/* the current scaling the solution does not have a */ +/* scale factor > 0. */ + +/* Set WORK( J+KK*LDS ) to smallest valid scale */ +/* factor and increase SCALOC accordingly. */ + scal = work[j + kk * lds] / smlnum; + scaloc *= scal; + work[j + kk * lds] = smlnum; +/* If LATRS overestimated the growth, x may be */ +/* rescaled to preserve a valid combined scale */ +/* factor WORK( J, KK ) > 0. */ + rscal = 1. / scaloc; + if (xnrm[kk - 1] * rscal <= bignum) { + xnrm[kk - 1] *= rscal; + i__5 = j2 - j1; + dscal_(&i__5, &rscal, &x[j1 + rhs * x_dim1], &c__1); + scaloc = 1.; + } else { +/* The system op(A) * x = b is badly scaled and its */ +/* solution cannot be represented as (1/scale) * x. */ +/* Set x to zero. This approach deviates from LATRS */ +/* where a completely meaningless non-zero vector */ +/* is returned that is not a solution to op(A) * x = b. */ + scale[rhs] = 0.; + i__5 = *n; + for (ii = 1; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.; + } +/* Discard the local scale factors. 
*/ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.; + } + scaloc = 1.; + } + } + scaloc *= work[j + kk * lds]; + work[j + kk * lds] = scaloc; + } + +/* Linear block updates */ + + if (notran) { + if (upper) { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } else { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } + } else { + if (upper) { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } else { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } + } + + i__4 = ilast; + i__5 = iinc; + for (i__ = ifirst; i__5 < 0 ? i__ >= i__4 : i__ <= i__4; i__ += + i__5) { +/* I1: row index of the first column in X( I, K ) */ +/* I2: row index of the first column in X( I+1, K ) */ +/* so the I2 - I1 is the row count of the block X( I, K ) */ + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__6 = i__ * nb; + i2 = f2cmin(i__6,*n) + 1; + +/* Prepare the linear update to be executed with GEMM. */ +/* For each column, compute a consistent scaling, a */ +/* scaling factor to survive the linear update, and */ +/* rescale the column segments, if necesssary. Then */ +/* the linear update is safely executed. */ + + i__6 = k2 - k1; + for (kk = 1; kk <= i__6; ++kk) { + rhs = k1 + kk - 1; +/* Compute consistent scaling */ +/* Computing MIN */ + d__1 = work[i__ + kk * lds], d__2 = work[j + kk * lds]; + scamin = f2cmin(d__1,d__2); + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__7 = i2 - i1; + bnrm = dlange_("I", &i__7, &c__1, &x[i1 + rhs * x_dim1], + ldx, w); + bnrm *= scamin / work[i__ + kk * lds]; + xnrm[kk - 1] *= scamin / work[j + kk * lds]; + anrm = work[awrk + i__ + (j - 1) * nba]; + scaloc = dlarmm_(&anrm, &xnrm[kk - 1], &bnrm); + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to B( I, KK ) and B( J, KK ). */ + + scal = scamin / work[i__ + kk * lds] * scaloc; + if (scal != 1.) 
{ + i__7 = i2 - i1; + dscal_(&i__7, &scal, &x[i1 + rhs * x_dim1], &c__1); + work[i__ + kk * lds] = scamin * scaloc; + } + + scal = scamin / work[j + kk * lds] * scaloc; + if (scal != 1.) { + i__7 = j2 - j1; + dscal_(&i__7, &scal, &x[j1 + rhs * x_dim1], &c__1); + work[j + kk * lds] = scamin * scaloc; + } + } + + if (notran) { + +/* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + dgemm_("N", "N", &i__6, &i__7, &i__8, &c_b35, &a[i1 + j1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b36, + &x[i1 + k1 * x_dim1], ldx); + } else { + +/* B( I, K ) := B( I, K ) - A( J, I )**T * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + dgemm_("T", "N", &i__6, &i__7, &i__8, &c_b35, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b36, + &x[i1 + k1 * x_dim1], ldx); + } + } + } + +/* Reduce local scaling factors */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { +/* Computing MIN */ + d__1 = scale[rhs], d__2 = work[i__ + kk * lds]; + scale[rhs] = f2cmin(d__1,d__2); + } + } + +/* Realize consistent scaling */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + if (scale[rhs] != 1. && scale[rhs] != 0.) { + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__5 = i__ * nb; + i2 = f2cmin(i__5,*n) + 1; + scal = scale[rhs] / work[i__ + kk * lds]; + if (scal != 1.) { + i__5 = i2 - i1; + dscal_(&i__5, &scal, &x[i1 + rhs * x_dim1], &c__1); + } + } + } + } + } + return; + +/* End of DLATRS3 */ + +} /* dlatrs3_ */ + diff --git a/lapack-netlib/SRC/dlatrs3.f b/lapack-netlib/SRC/dlatrs3.f new file mode 100644 index 0000000000..b4a98bc78e --- /dev/null +++ b/lapack-netlib/SRC/dlatrs3.f @@ -0,0 +1,656 @@ +*> \brief \b DLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. 
+* +* Definition: +* =========== +* +* SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), CNORM( * ), SCALE( * ), +* WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale) or A**T * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A. X and B are +*> n by nrhs matrices and scale is an nrhs element vector of scaling +*> factors. A scaling factor scale(j) is usually less than or equal +*> to 1, chosen such that X(:,j) is less than the overflow threshold. +*> If the matrix A is singular (A(j,j) = 0 for some j), then +*> a non-trivial solution to A*X = 0 is returned. If the system is +*> so badly scaled that the solution cannot be represented as +*> (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. +*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. 
+*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is DOUBLE PRECISION array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. 
+*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is DOUBLE PRECISION array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. 
+* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. 
+* +* ===================================================================== + SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), CNORM( * ), X( LDX, * ), + $ SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + DOUBLE PRECISION W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLANGE, DLARMM + EXTERNAL DLAMCH, DLANGE, DLARMM, ILAENV, LSAME +* .. +* .. External Subroutines .. + EXTERNAL DLATRS, DSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks +* + NB = MAX( 8, ILAENV( 1, 'DLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. 
WORK( I+KK*LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. +* + BIGNUM = DLAMCH( 'Overflow' ) + SMLNUM = DLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL DLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL DLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. 
+* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = DLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = DLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1)*NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.DLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. +* + DO K = 1, NRHS + CALL DLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). 
+* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2-K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF +* + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* for all right-hand sides in the current block column, +* one RHS at a time. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL DLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL DLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = DLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute A*x = 0 (or A**T*x = 0). Note that +* X(J1:J2-1, KK) is set by LATRS. 
+ SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = ZERO + END DO + DO II = J2, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC * WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK ) * RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL DSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. 
+* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I + KK*LDS), WORK( J + KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = DLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*(SCAMIN / WORK( J+KK*LDS )) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = DLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to B( I, KK ) and B( J, KK ). +* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL DSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL DSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL DGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( J, I )**T * X( J, K ) +* + CALL DGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO +* +* Reduce local scaling factors +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. 
ZERO ) THEN + DO I = 1, NBA + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL DSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of DLATRS3 +* + END diff --git a/lapack-netlib/SRC/dlatrz.c b/lapack-netlib/SRC/dlatrz.c index c9c9032455..496021ba1a 100644 --- a/lapack-netlib/SRC/dlatrz.c +++ b/lapack-netlib/SRC/dlatrz.c @@ -649,7 +649,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlatrz_(integer *m, integer *n, integer *l, doublereal * +/* Subroutine */ void dlatrz_(integer *m, integer *n, integer *l, doublereal * a, integer *lda, doublereal *tau, doublereal *work) { /* System generated locals */ @@ -657,7 +657,7 @@ f"> */ /* Local variables */ integer i__; - extern /* Subroutine */ int dlarz_(char *, integer *, integer *, integer * + extern /* Subroutine */ void dlarz_(char *, integer *, integer *, integer * , doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *), dlarfg_(integer *, doublereal *, doublereal *, integer *, doublereal *); @@ -685,14 +685,14 @@ f"> */ /* Function Body */ if (*m == 0) { - return 0; + return; } else if (*m == *n) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { tau[i__] = 0.; /* L10: */ } - return 0; + return; } for (i__ = *m; i__ >= 1; --i__) { @@ -714,7 +714,7 @@ f"> */ /* L20: */ } - return 0; + return; /* End of DLATRZ */ diff --git a/lapack-netlib/SRC/dlatsqr.c b/lapack-netlib/SRC/dlatsqr.c index 158974214b..52ab4b6ec7 100644 --- a/lapack-netlib/SRC/dlatsqr.c +++ b/lapack-netlib/SRC/dlatsqr.c @@ -673,7 +673,7 @@ static integer c__0 = 0; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlatsqr_(integer *m, integer *n, integer *mb, integer * +/* Subroutine */ void dlatsqr_(integer *m, integer *n, integer *mb, integer * nb, doublereal *a, 
integer *lda, doublereal *t, integer *ldt, doublereal *work, integer *lwork, integer *info) { @@ -682,7 +682,8 @@ static integer c__0 = 0; /* Local variables */ integer i__, ii, kk; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dgeqrt_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dgeqrt_( integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *), dtpqrt_( integer *, integer *, integer *, integer *, doublereal *, integer @@ -738,15 +739,15 @@ static integer c__0 = 0; if (*info != 0) { i__1 = -(*info); xerbla_("DLATSQR", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* The QR Decomposition */ @@ -754,7 +755,7 @@ static integer c__0 = 0; if (*mb <= *n || *mb >= *m) { dgeqrt_(m, n, nb, &a[a_offset], lda, &t[t_offset], ldt, &work[1], info); - return 0; + return; } kk = (*m - *n) % (*mb - *n); @@ -786,7 +787,7 @@ static integer c__0 = 0; } work[1] = (doublereal) (*n * *nb); - return 0; + return; /* End of DLATSQR */ diff --git a/lapack-netlib/SRC/dlauu2.c b/lapack-netlib/SRC/dlauu2.c index ff83d379aa..a688bac1d1 100644 --- a/lapack-netlib/SRC/dlauu2.c +++ b/lapack-netlib/SRC/dlauu2.c @@ -617,7 +617,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlauu2_(char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ void dlauu2_(char *uplo, integer *n, doublereal *a, integer * lda, integer *info) { /* System generated locals */ @@ -627,10 +627,10 @@ f"> */ extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *, integer *); integer i__; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, 
char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); logical upper; @@ -667,13 +667,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DLAUU2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -721,7 +721,7 @@ f"> */ } } - return 0; + return; /* End of DLAUU2 */ diff --git a/lapack-netlib/SRC/dlauum.c b/lapack-netlib/SRC/dlauum.c index cc1af80e1a..e94f929ff5 100644 --- a/lapack-netlib/SRC/dlauum.c +++ b/lapack-netlib/SRC/dlauum.c @@ -618,7 +618,7 @@ f"> */ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dlauum_(char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ void dlauum_(char *uplo, integer *n, doublereal *a, integer * lda, integer *info) { /* System generated locals */ @@ -626,15 +626,15 @@ f"> */ /* Local variables */ integer i__; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrmm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical upper; - extern /* Subroutine */ int dsyrk_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dsyrk_(char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *), dlauu2_(char *, integer *, doublereal *, integer *, integer *); @@ -673,13 +673,13 @@ f"> */ 
if (*info != 0) { i__1 = -(*info); xerbla_("DLAUUM", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine the block size for this environment. */ @@ -757,7 +757,7 @@ f"> */ } } - return 0; + return; /* End of DLAUUM */ diff --git a/lapack-netlib/SRC/dopgtr.c b/lapack-netlib/SRC/dopgtr.c index c95dfcd99f..e59efa17d2 100644 --- a/lapack-netlib/SRC/dopgtr.c +++ b/lapack-netlib/SRC/dopgtr.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dopgtr_(char *uplo, integer *n, doublereal *ap, +/* Subroutine */ void dopgtr_(char *uplo, integer *n, doublereal *ap, doublereal *tau, doublereal *q, integer *ldq, doublereal *work, integer *info) { @@ -635,7 +635,7 @@ f"> */ extern logical lsame_(char *, char *); integer iinfo; logical upper; - extern /* Subroutine */ int dorg2l_(integer *, integer *, integer *, + extern /* Subroutine */ void dorg2l_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dorg2r_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); @@ -675,13 +675,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DOPGTR", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -758,7 +758,7 @@ f"> */ &work[1], &iinfo); } } - return 0; + return; /* End of DOPGTR */ diff --git a/lapack-netlib/SRC/dopmtr.c b/lapack-netlib/SRC/dopmtr.c index 65df69df01..3497632f82 100644 --- a/lapack-netlib/SRC/dopmtr.c +++ b/lapack-netlib/SRC/dopmtr.c @@ -662,7 +662,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dopmtr_(char *side, char *uplo, char *trans, integer *m, +/* Subroutine */ void dopmtr_(char *side, char *uplo, char 
*trans, integer *m, integer *n, doublereal *ap, doublereal *tau, doublereal *c__, integer *ldc, doublereal *work, integer *info) { @@ -672,7 +672,7 @@ f"> */ /* Local variables */ logical left; integer i__; - extern /* Subroutine */ int dlarf_(char *, integer *, integer *, + extern /* Subroutine */ void dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *); extern logical lsame_(char *, char *); @@ -732,13 +732,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DOPMTR", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } if (upper) { @@ -854,7 +854,7 @@ f"> */ /* L20: */ } } - return 0; + return; /* End of DOPMTR */ diff --git a/lapack-netlib/SRC/dorbdb.c b/lapack-netlib/SRC/dorbdb.c index fc84ad7d5d..b59b35b4f0 100644 --- a/lapack-netlib/SRC/dorbdb.c +++ b/lapack-netlib/SRC/dorbdb.c @@ -798,7 +798,7 @@ f"> */ /* > Algorithms, 50(1):33-65, 2009. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dorbdb_(char *trans, char *signs, integer *m, integer *p, +/* Subroutine */ void dorbdb_(char *trans, char *signs, integer *m, integer *p, integer *q, doublereal *x11, integer *ldx11, doublereal *x12, integer *ldx12, doublereal *x21, integer *ldx21, doublereal *x22, integer *ldx22, doublereal *theta, doublereal *phi, doublereal *taup1, @@ -815,16 +815,16 @@ f"> */ integer lworkmin; extern doublereal dnrm2_(integer *, doublereal *, integer *); integer lworkopt, i__; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ 
void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); doublereal z1, z2, z3, z4; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical lquery; - extern /* Subroutine */ int dlarfgp_(integer *, doublereal *, doublereal * + extern /* Subroutine */ void dlarfgp_(integer *, doublereal *, doublereal * , integer *, doublereal *); @@ -929,9 +929,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("xORBDB", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Handle column-major and row-major separately */ @@ -1386,7 +1386,7 @@ f"> */ } - return 0; + return; /* End of DORBDB */ diff --git a/lapack-netlib/SRC/dorbdb1.c b/lapack-netlib/SRC/dorbdb1.c index 07b91220e4..dbca828dad 100644 --- a/lapack-netlib/SRC/dorbdb1.c +++ b/lapack-netlib/SRC/dorbdb1.c @@ -715,7 +715,7 @@ static integer c__1 = 1; /* > Algorithms, 50(1):33-65, 2009. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dorbdb1_(integer *m, integer *p, integer *q, doublereal * +/* Subroutine */ void dorbdb1_(integer *m, integer *p, integer *q, doublereal * x11, integer *ldx11, doublereal *x21, integer *ldx21, doublereal * theta, doublereal *phi, doublereal *taup1, doublereal *taup2, doublereal *tauq1, doublereal *work, integer *lwork, integer *info) @@ -726,7 +726,7 @@ static integer c__1 = 1; doublereal d__1, d__2; /* Local variables */ - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer lworkmin; extern doublereal dnrm2_(integer *, doublereal *, integer *); @@ -734,18 +734,18 @@ static integer c__1 = 1; doublereal c__; integer i__; doublereal s; - extern /* Subroutine */ int dlarf_(char *, integer *, integer *, + extern /* Subroutine */ void dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal 
*, doublereal *, integer *, doublereal *); integer ilarf, llarf, childinfo; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical lquery; - extern /* Subroutine */ int dorbdb5_(integer *, integer *, integer *, + extern /* Subroutine */ void dorbdb5_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer iorbdb5, lorbdb5; - extern /* Subroutine */ int dlarfgp_(integer *, doublereal *, doublereal * + extern /* Subroutine */ void dlarfgp_(integer *, doublereal *, doublereal * , integer *, doublereal *); @@ -816,9 +816,9 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("DORBDB1", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Reduce columns 1, ..., Q of X11 and X21 */ @@ -886,7 +886,7 @@ static integer c__1 = 1; } - return 0; + return; /* End of DORBDB1 */ diff --git a/lapack-netlib/SRC/dorbdb2.c b/lapack-netlib/SRC/dorbdb2.c index 6c1a7d0c0c..3ebea93b26 100644 --- a/lapack-netlib/SRC/dorbdb2.c +++ b/lapack-netlib/SRC/dorbdb2.c @@ -715,7 +715,7 @@ static doublereal c_b9 = -1.; /* > Algorithms, 50(1):33-65, 2009. 
*/ /* > */ /* ===================================================================== */ -/* Subroutine */ int dorbdb2_(integer *m, integer *p, integer *q, doublereal * +/* Subroutine */ void dorbdb2_(integer *m, integer *p, integer *q, doublereal * x11, integer *ldx11, doublereal *x21, integer *ldx21, doublereal * theta, doublereal *phi, doublereal *taup1, doublereal *taup2, doublereal *tauq1, doublereal *work, integer *lwork, integer *info) @@ -726,7 +726,7 @@ static doublereal c_b9 = -1.; doublereal d__1, d__2; /* Local variables */ - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer lworkmin; extern doublereal dnrm2_(integer *, doublereal *, integer *); @@ -734,18 +734,18 @@ static doublereal c_b9 = -1.; doublereal c__; integer i__; doublereal s; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *); integer ilarf, llarf, childinfo; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical lquery; - extern /* Subroutine */ int dorbdb5_(integer *, integer *, integer *, + extern /* Subroutine */ void dorbdb5_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer iorbdb5, lorbdb5; - extern /* Subroutine */ int dlarfgp_(integer *, doublereal *, doublereal * + extern /* Subroutine */ void dlarfgp_(integer *, doublereal *, doublereal * , integer *, doublereal *); @@ -815,9 +815,9 @@ static doublereal c_b9 = -1.; if (*info != 0) { i__1 = -(*info); xerbla_("DORBDB2", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Reduce 
rows 1, ..., P of X11 and X21 */ @@ -901,7 +901,7 @@ static doublereal c_b9 = -1.; i__], &x21[i__ + (i__ + 1) * x21_dim1], ldx21, &work[ilarf]); } - return 0; + return; /* End of DORBDB2 */ diff --git a/lapack-netlib/SRC/dorbdb2.f b/lapack-netlib/SRC/dorbdb2.f index 64e4645bcc..a0dacbb16d 100644 --- a/lapack-netlib/SRC/dorbdb2.f +++ b/lapack-netlib/SRC/dorbdb2.f @@ -122,14 +122,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is DOUBLE PRECISION array, dimension (P) +*> TAUP1 is DOUBLE PRECISION array, dimension (P-1) *> The scalar factors of the elementary reflectors that define *> P1. *> \endverbatim *> *> \param[out] TAUP2 *> \verbatim -*> TAUP2 is DOUBLE PRECISION array, dimension (M-P) +*> TAUP2 is DOUBLE PRECISION array, dimension (Q) *> The scalar factors of the elementary reflectors that define *> P2. *> \endverbatim diff --git a/lapack-netlib/SRC/dorbdb3.c b/lapack-netlib/SRC/dorbdb3.c index 84629c6516..a22ac718f4 100644 --- a/lapack-netlib/SRC/dorbdb3.c +++ b/lapack-netlib/SRC/dorbdb3.c @@ -713,7 +713,7 @@ static integer c__1 = 1; /* > Algorithms, 50(1):33-65, 2009. 
*/ /* > */ /* ===================================================================== */ -/* Subroutine */ int dorbdb3_(integer *m, integer *p, integer *q, doublereal * +/* Subroutine */ void dorbdb3_(integer *m, integer *p, integer *q, doublereal * x11, integer *ldx11, doublereal *x21, integer *ldx21, doublereal * theta, doublereal *phi, doublereal *taup1, doublereal *taup2, doublereal *tauq1, doublereal *work, integer *lwork, integer *info) @@ -724,7 +724,7 @@ static integer c__1 = 1; doublereal d__1, d__2; /* Local variables */ - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer lworkmin; extern doublereal dnrm2_(integer *, doublereal *, integer *); @@ -732,18 +732,18 @@ static integer c__1 = 1; doublereal c__; integer i__; doublereal s; - extern /* Subroutine */ int dlarf_(char *, integer *, integer *, + extern /* Subroutine */ void dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *); integer ilarf, llarf, childinfo; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical lquery; - extern /* Subroutine */ int dorbdb5_(integer *, integer *, integer *, + extern /* Subroutine */ void dorbdb5_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer iorbdb5, lorbdb5; - extern /* Subroutine */ int dlarfgp_(integer *, doublereal *, doublereal * + extern /* Subroutine */ void dlarfgp_(integer *, doublereal *, doublereal * , integer *, doublereal *); @@ -813,9 +813,9 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("DORBDB3", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Reduce rows 1, ..., M-P of X11 and X21 */ @@ -898,7 +898,7 @@ static integer c__1 = 1; 
i__], &x11[i__ + (i__ + 1) * x11_dim1], ldx11, &work[ilarf]); } - return 0; + return; /* End of DORBDB3 */ diff --git a/lapack-netlib/SRC/dorbdb4.c b/lapack-netlib/SRC/dorbdb4.c index 743a020bb2..d53ff709c6 100644 --- a/lapack-netlib/SRC/dorbdb4.c +++ b/lapack-netlib/SRC/dorbdb4.c @@ -725,7 +725,7 @@ static doublereal c_b5 = -1.; /* > Algorithms, 50(1):33-65, 2009. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dorbdb4_(integer *m, integer *p, integer *q, doublereal * +/* Subroutine */ void dorbdb4_(integer *m, integer *p, integer *q, doublereal * x11, integer *ldx11, doublereal *x21, integer *ldx21, doublereal * theta, doublereal *phi, doublereal *taup1, doublereal *taup2, doublereal *tauq1, doublereal *phantom, doublereal *work, integer * @@ -737,7 +737,7 @@ static doublereal c_b5 = -1.; doublereal d__1, d__2; /* Local variables */ - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer lworkmin; extern doublereal dnrm2_(integer *, doublereal *, integer *); @@ -745,18 +745,18 @@ static doublereal c_b5 = -1.; doublereal c__; integer i__, j; doublereal s; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *); integer ilarf, llarf, childinfo; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical lquery; - extern /* Subroutine */ int dorbdb5_(integer *, integer *, integer *, + extern /* Subroutine */ void dorbdb5_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer iorbdb5, lorbdb5; - extern /* 
Subroutine */ int dlarfgp_(integer *, doublereal *, doublereal * + extern /* Subroutine */ void dlarfgp_(integer *, doublereal *, doublereal * , integer *, doublereal *); @@ -829,9 +829,9 @@ static doublereal c_b5 = -1.; if (*info != 0) { i__1 = -(*info); xerbla_("DORBDB4", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Reduce columns 1, ..., M-Q of X11 and X21 */ @@ -960,7 +960,7 @@ static doublereal c_b5 = -1.; x21_dim1], ldx21, &work[ilarf]); } - return 0; + return; /* End of DORBDB4 */ diff --git a/lapack-netlib/SRC/dorbdb4.f b/lapack-netlib/SRC/dorbdb4.f index a09568415d..08604be452 100644 --- a/lapack-netlib/SRC/dorbdb4.f +++ b/lapack-netlib/SRC/dorbdb4.f @@ -124,14 +124,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is DOUBLE PRECISION array, dimension (P) +*> TAUP1 is DOUBLE PRECISION array, dimension (M-Q) *> The scalar factors of the elementary reflectors that define *> P1. *> \endverbatim *> *> \param[out] TAUP2 *> \verbatim -*> TAUP2 is DOUBLE PRECISION array, dimension (M-P) +*> TAUP2 is DOUBLE PRECISION array, dimension (M-Q) *> The scalar factors of the elementary reflectors that define *> P2. 
*> \endverbatim diff --git a/lapack-netlib/SRC/dorbdb5.c b/lapack-netlib/SRC/dorbdb5.c index ae2b26bd8d..891153fccf 100644 --- a/lapack-netlib/SRC/dorbdb5.c +++ b/lapack-netlib/SRC/dorbdb5.c @@ -664,7 +664,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dorbdb5_(integer *m1, integer *m2, integer *n, +/* Subroutine */ void dorbdb5_(integer *m1, integer *m2, integer *n, doublereal *x1, integer *incx1, doublereal *x2, integer *incx2, doublereal *q1, integer *ldq1, doublereal *q2, integer *ldq2, doublereal *work, integer *lwork, integer *info) @@ -675,7 +675,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ extern doublereal dnrm2_(integer *, doublereal *, integer *); integer i__, j, childinfo; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dorbdb6_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dorbdb6_( integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); @@ -726,7 +727,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("DORBDB5", &i__1, (ftnlen)7); - return 0; + return; } /* Project X onto the orthogonal complement of Q */ @@ -737,7 +738,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* If the projection is nonzero, then return */ if (dnrm2_(m1, &x1[1], incx1) != 0. || dnrm2_(m2, &x2[1], incx2) != 0.) 
{ - return 0; + return; } /* Project each standard basis vector e_1,...,e_M1 in turn, stopping */ @@ -758,7 +759,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ ldq1, &q2[q2_offset], ldq2, &work[1], lwork, &childinfo); if (dnrm2_(m1, &x1[1], incx1) != 0. || dnrm2_(m2, &x2[1], incx2) != 0.) { - return 0; + return; } } @@ -780,11 +781,11 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ ldq1, &q2[q2_offset], ldq2, &work[1], lwork, &childinfo); if (dnrm2_(m1, &x1[1], incx1) != 0. || dnrm2_(m2, &x2[1], incx2) != 0.) { - return 0; + return; } } - return 0; + return; /* End of DORBDB5 */ diff --git a/lapack-netlib/SRC/dorbdb6.c b/lapack-netlib/SRC/dorbdb6.c index a7ac674ba0..70f1b46386 100644 --- a/lapack-netlib/SRC/dorbdb6.c +++ b/lapack-netlib/SRC/dorbdb6.c @@ -669,7 +669,7 @@ static doublereal c_b12 = -1.; /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dorbdb6_(integer *m1, integer *m2, integer *n, +/* Subroutine */ void dorbdb6_(integer *m1, integer *m2, integer *n, doublereal *x1, integer *incx1, doublereal *x2, integer *incx2, doublereal *q1, integer *ldq1, doublereal *q2, integer *ldq2, doublereal *work, integer *lwork, integer *info) @@ -680,10 +680,11 @@ static doublereal c_b12 = -1.; /* Local variables */ integer i__; - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, - doublereal *, doublereal *, integer *), xerbla_(char *, - integer *, ftnlen), dlassq_(integer *, doublereal *, integer *, + doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); doublereal normsq1, normsq2, scl1, scl2, ssq1, ssq2; @@ -733,7 +734,7 @@ static doublereal 
c_b12 = -1.; if (*info != 0) { i__1 = -(*info); xerbla_("DORBDB6", &i__1, (ftnlen)7); - return 0; + return; } /* First, project X onto the orthogonal complement of Q's column */ @@ -786,11 +787,11 @@ static doublereal c_b12 = -1.; /* Otherwise, project again. */ if (normsq2 >= normsq1 * .01) { - return 0; + return; } if (normsq2 == 0.) { - return 0; + return; } normsq1 = normsq2; @@ -845,7 +846,7 @@ static doublereal c_b12 = -1.; } } - return 0; + return; /* End of DORBDB6 */ diff --git a/lapack-netlib/SRC/dorbdb6.f b/lapack-netlib/SRC/dorbdb6.f index fac52f760d..45c8ba8a28 100644 --- a/lapack-netlib/SRC/dorbdb6.f +++ b/lapack-netlib/SRC/dorbdb6.f @@ -41,10 +41,16 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The columns of Q must be orthonormal. +*> The Euclidean norm of X must be one and the columns of Q must be +*> orthonormal. The orthogonalized vector will be zero if and only if it +*> lies entirely in the range of Q. *> -*> If the projection is zero according to Kahan's "twice is enough" -*> criterion, then the zero vector is returned. +*> The projection is computed with at most two iterations of the +*> classical Gram-Schmidt algorithm, see +*> * L. Giraud, J. Langou, M. Rozložník. "On the round-off error +*> analysis of the Gram-Schmidt algorithm with reorthogonalization." +*> 2002. CERFACS Technical Report No. TR/PA/02/33. URL: +*> https://www.cerfacs.fr/algor/reports/2002/TR_PA_02_33.pdf *> *>\endverbatim * @@ -167,15 +173,18 @@ SUBROUTINE DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, * ===================================================================== * * .. Parameters .. - DOUBLE PRECISION ALPHASQ, REALONE, REALZERO - PARAMETER ( ALPHASQ = 0.01D0, REALONE = 1.0D0, + DOUBLE PRECISION ALPHA, REALONE, REALZERO + PARAMETER ( ALPHA = 0.01D0, REALONE = 1.0D0, $ REALZERO = 0.0D0 ) DOUBLE PRECISION NEGONE, ONE, ZERO PARAMETER ( NEGONE = -1.0D0, ONE = 1.0D0, ZERO = 0.0D0 ) * .. * .. Local Scalars .. 
- INTEGER I - DOUBLE PRECISION NORMSQ1, NORMSQ2, SCL1, SCL2, SSQ1, SSQ2 + INTEGER I, IX + DOUBLE PRECISION EPS, NORM, NORM_NEW, SCL, SSQ +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH * .. * .. External Subroutines .. EXTERNAL DGEMV, DLASSQ, XERBLA @@ -210,17 +219,17 @@ SUBROUTINE DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, CALL XERBLA( 'DORBDB6', -INFO ) RETURN END IF +* + EPS = DLAMCH( 'Precision' ) * * First, project X onto the orthogonal complement of Q's column * space * - SCL1 = REALZERO - SSQ1 = REALONE - CALL DLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL DLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ1 = SCL1**2*SSQ1 + SCL2**2*SSQ2 +* Christoph Conrads: In debugging mode the norm should be computed +* and an assertion added comparing the norm with one. Alas, Fortran +* never made it into 1989 when assert() was introduced into the C +* programming language. + NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N @@ -238,27 +247,31 @@ SUBROUTINE DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, CALL DGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL DLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL DLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL DLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL DLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If projection is sufficiently large in norm, then stop. * If projection is zero, then stop. * Otherwise, project again. * - IF( NORMSQ2 .GE. ALPHASQ*NORMSQ1 ) THEN + IF( NORM_NEW .GE. ALPHA * NORM ) THEN RETURN END IF * - IF( NORMSQ2 .EQ. ZERO ) THEN + IF( NORM_NEW .LE. 
N * EPS * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1( IX ) = ZERO + END DO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2( IX ) = ZERO + END DO RETURN END IF * - NORMSQ1 = NORMSQ2 + NORM = NORM_NEW * DO I = 1, N WORK(I) = ZERO @@ -280,24 +293,22 @@ SUBROUTINE DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, CALL DGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL DLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL DLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL DLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL DLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If second projection is sufficiently large in norm, then do * nothing more. Alternatively, if it shrunk significantly, then * truncate it to zero. * - IF( NORMSQ2 .LT. ALPHASQ*NORMSQ1 ) THEN - DO I = 1, M1 - X1(I) = ZERO + IF( NORM_NEW .LT. ALPHA * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1(IX) = ZERO END DO - DO I = 1, M2 - X2(I) = ZERO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2(IX) = ZERO END DO END IF * @@ -306,4 +317,3 @@ SUBROUTINE DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, * End of DORBDB6 * END - diff --git a/lapack-netlib/SRC/dorcsd.c b/lapack-netlib/SRC/dorcsd.c index 49c1e96020..a25001fb1a 100644 --- a/lapack-netlib/SRC/dorcsd.c +++ b/lapack-netlib/SRC/dorcsd.c @@ -810,7 +810,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dorcsd_(char *jobu1, char *jobu2, char *jobv1t, char * +/* Subroutine */ void dorcsd_(char *jobu1, char *jobu2, char *jobv1t, char * jobv2t, char *trans, char *signs, integer *m, integer *p, integer *q, doublereal *x11, integer *ldx11, doublereal *x12, integer *ldx12, doublereal *x21, integer *ldx21, doublereal *x22, integer *ldx22, @@ -835,30 +835,31 @@ f"> */ integer childinfo, 
lbbcsdworkmin, itaup1, itaup2, itauq1, itauq2, lorbdbworkmin, lbbcsdworkopt; logical wantu1, wantu2; - extern /* Subroutine */ int dbbcsd_(char *, char *, char *, char *, char * + extern /* Subroutine */ void dbbcsd_(char *, char *, char *, char *, char * , integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *); integer ibbcsd, lorbdbworkopt; - extern /* Subroutine */ int dorbdb_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dorbdb_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *); integer iorbdb, lorglqworkmin, lorgqrworkmin; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen), dlapmr_(logical *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dlapmr_(logical *, integer *, integer *, doublereal *, integer *, integer *), dlapmt_(logical *, integer *, integer *, doublereal *, integer *, integer *); integer lorglqworkopt; - extern /* Subroutine */ int dorglq_(integer *, integer *, integer *, + extern /* Subroutine */ void dorglq_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); integer lorgqrworkopt, iorglq; - extern /* Subroutine */ int dorgqr_(integer *, integer *, integer *, + extern /* Subroutine */ void dorgqr_(integer *, integer *, integer *, doublereal *, integer 
*, doublereal *, doublereal *, integer *, integer *); integer iorgqr; @@ -988,7 +989,7 @@ f"> */ ldx12, &x22[x22_offset], ldx22, &theta[1], &v1t[v1t_offset], ldv1t, &v2t[v2t_offset], ldv2t, &u1[u1_offset], ldu1, &u2[ u2_offset], ldu2, &work[1], lwork, &iwork[1], info); - return 0; + return; } /* Work with permutation [ 0 I; I 0 ] * X * [ 0 I; I 0 ] if */ @@ -1008,7 +1009,7 @@ f"> */ u2_offset], ldu2, &u1[u1_offset], ldu1, &v2t[v2t_offset], ldv2t, &v1t[v1t_offset], ldv1t, &work[1], lwork, &iwork[1], info); - return 0; + return; } /* Compute workspace */ @@ -1118,9 +1119,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORCSD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Transform to bidiagonal block form */ @@ -1284,7 +1285,7 @@ f"> */ } } - return 0; + return; /* End DORCSD */ diff --git a/lapack-netlib/SRC/dorcsd2by1.c b/lapack-netlib/SRC/dorcsd2by1.c index eb3e3a9e4a..fc44e083b0 100644 --- a/lapack-netlib/SRC/dorcsd2by1.c +++ b/lapack-netlib/SRC/dorcsd2by1.c @@ -746,7 +746,7 @@ by1.f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dorcsd2by1_(char *jobu1, char *jobu2, char *jobv1t, +/* Subroutine */ void dorcsd2by1_(char *jobu1, char *jobu2, char *jobv1t, integer *m, integer *p, integer *q, doublereal *x11, integer *ldx11, doublereal *x21, integer *ldx21, doublereal *theta, doublereal *u1, integer *ldu1, doublereal *u2, integer *ldu2, doublereal *v1t, @@ -762,31 +762,32 @@ by1.f"> */ lworkmin, lworkopt, i__, j, r__; extern logical lsame_(char *, char *); integer childinfo; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer lorglqmin, lorgqrmin, lorglqopt, itaup1, itaup2, itauq1, lorgqropt; logical wantu1, wantu2; - extern /* Subroutine */ int dbbcsd_(char *, char *, char *, char *, char * 
+ extern /* Subroutine */ void dbbcsd_(char *, char *, char *, char *, char * , integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *); integer ibbcsd, lbbcsd, iorbdb, lorbdb; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen), dlapmr_(logical *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dlapmr_(logical *, integer *, integer *, doublereal *, integer *, integer *), dlapmt_(logical *, integer *, integer *, doublereal *, integer *, integer *), dorglq_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); integer iorglq; - extern /* Subroutine */ int dorgqr_(integer *, integer *, integer *, + extern /* Subroutine */ void dorgqr_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); integer lorglq, iorgqr, lorgqr; - extern /* Subroutine */ int dorbdb1_(integer *, integer *, integer *, + extern /* Subroutine */ void dorbdb1_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *), dorbdb2_(integer *, integer * @@ -1115,9 +1116,9 @@ by1.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORCSD2BY1", &i__1, (ftnlen)10); - return 0; + return; } else if (lquery) { - return 0; + return; } lorgqr = *lwork - iorgqr + 1; lorglq = *lwork - iorglq + 1; @@ -1427,7 +1428,7 @@ by1.f"> */ } } - return 0; + return; /* End of DORCSD2BY1 */ 
diff --git a/lapack-netlib/SRC/dorcsd2by1.f b/lapack-netlib/SRC/dorcsd2by1.f index 06bf53db16..25fab0f33c 100644 --- a/lapack-netlib/SRC/dorcsd2by1.f +++ b/lapack-netlib/SRC/dorcsd2by1.f @@ -580,7 +580,7 @@ SUBROUTINE DORCSD2BY1( JOBU1, JOBU2, JOBV1T, M, P, Q, X11, LDX11, * Simultaneously diagonalize X11 and X21. * CALL DBBCSD( JOBV1T, 'N', JOBU1, JOBU2, 'T', M, Q, P, THETA, - $ WORK(IPHI), V1T, LDV1T, DUM2, 1, U1, LDU1, U2, + $ WORK(IPHI), V1T, LDV1T, DUM1, 1, U1, LDU1, U2, $ LDU2, WORK(IB11D), WORK(IB11E), WORK(IB12D), $ WORK(IB12E), WORK(IB21D), WORK(IB21E), $ WORK(IB22D), WORK(IB22E), WORK(IBBCSD), LBBCSD, @@ -635,7 +635,7 @@ SUBROUTINE DORCSD2BY1( JOBU1, JOBU2, JOBV1T, M, P, Q, X11, LDX11, * Simultaneously diagonalize X11 and X21. * CALL DBBCSD( 'N', JOBV1T, JOBU2, JOBU1, 'T', M, M-Q, M-P, - $ THETA, WORK(IPHI), DUM2, 1, V1T, LDV1T, U2, + $ THETA, WORK(IPHI), DUM1, 1, V1T, LDV1T, U2, $ LDU2, U1, LDU1, WORK(IB11D), WORK(IB11E), $ WORK(IB12D), WORK(IB12E), WORK(IB21D), $ WORK(IB21E), WORK(IB22D), WORK(IB22E), @@ -706,7 +706,7 @@ SUBROUTINE DORCSD2BY1( JOBU1, JOBU2, JOBV1T, M, P, Q, X11, LDX11, * Simultaneously diagonalize X11 and X21. 
* CALL DBBCSD( JOBU2, JOBU1, 'N', JOBV1T, 'N', M, M-P, M-Q, - $ THETA, WORK(IPHI), U2, LDU2, U1, LDU1, DUM2, + $ THETA, WORK(IPHI), U2, LDU2, U1, LDU1, DUM1, $ 1, V1T, LDV1T, WORK(IB11D), WORK(IB11E), $ WORK(IB12D), WORK(IB12E), WORK(IB21D), $ WORK(IB21E), WORK(IB22D), WORK(IB22E), diff --git a/lapack-netlib/SRC/dorg2l.c b/lapack-netlib/SRC/dorg2l.c index 65699489c8..d10fb0023d 100644 --- a/lapack-netlib/SRC/dorg2l.c +++ b/lapack-netlib/SRC/dorg2l.c @@ -628,7 +628,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dorg2l_(integer *m, integer *n, integer *k, doublereal * +/* Subroutine */ void dorg2l_(integer *m, integer *n, integer *k, doublereal * a, integer *lda, doublereal *tau, doublereal *work, integer *info) { /* System generated locals */ @@ -637,7 +637,7 @@ f"> */ /* Local variables */ integer i__, j, l; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *); integer ii; @@ -676,13 +676,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORG2L", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } /* Initialise columns 1:n-k to columns of the unit matrix */ @@ -723,7 +723,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of DORG2L */ diff --git a/lapack-netlib/SRC/dorg2r.c b/lapack-netlib/SRC/dorg2r.c index 79ca4fb835..af2e984523 100644 --- a/lapack-netlib/SRC/dorg2r.c +++ b/lapack-netlib/SRC/dorg2r.c @@ -628,7 +628,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dorg2r_(integer *m, integer *n, integer *k, doublereal * +/* Subroutine */ void dorg2r_(integer *m, 
integer *n, integer *k, doublereal * a, integer *lda, doublereal *tau, doublereal *work, integer *info) { /* System generated locals */ @@ -637,9 +637,10 @@ f"> */ /* Local variables */ integer i__, j, l; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dlarf_(char *, integer *, integer *, doublereal *, - integer *, doublereal *, doublereal *, integer *, doublereal *), xerbla_(char *, integer *, ftnlen); + integer *, doublereal *, doublereal *, integer *, doublereal *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -674,13 +675,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORG2R", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } /* Initialise columns k+1:n to columns of the unit matrix */ @@ -723,7 +724,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of DORG2R */ diff --git a/lapack-netlib/SRC/dorgbr.c b/lapack-netlib/SRC/dorgbr.c index 54c3b2e9e4..35697354bf 100644 --- a/lapack-netlib/SRC/dorgbr.c +++ b/lapack-netlib/SRC/dorgbr.c @@ -670,7 +670,7 @@ f"> */ /* > \ingroup doubleGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int dorgbr_(char *vect, integer *m, integer *n, integer *k, +/* Subroutine */ void dorgbr_(char *vect, integer *m, integer *n, integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal *work, integer *lwork, integer *info) { @@ -683,7 +683,8 @@ f"> */ integer iinfo; logical wantq; integer mn; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dorglq_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dorglq_( integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dorgqr_( integer *, integer *, integer *, doublereal 
*, integer *, @@ -766,17 +767,17 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORGBR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { work[1] = (doublereal) lwkopt; - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { work[1] = 1.; - return 0; + return; } if (wantq) { @@ -873,7 +874,7 @@ f"> */ } } work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DORGBR */ diff --git a/lapack-netlib/SRC/dorgbr.f b/lapack-netlib/SRC/dorgbr.f index 1b242ff97f..7dfd03961e 100644 --- a/lapack-netlib/SRC/dorgbr.f +++ b/lapack-netlib/SRC/dorgbr.f @@ -232,7 +232,7 @@ SUBROUTINE DORGBR( VECT, M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF END IF END IF - LWKOPT = WORK( 1 ) + LWKOPT = INT( WORK( 1 ) ) LWKOPT = MAX (LWKOPT, MN) END IF * diff --git a/lapack-netlib/SRC/dorghr.c b/lapack-netlib/SRC/dorghr.c index 93b4f9cecc..b95d45875e 100644 --- a/lapack-netlib/SRC/dorghr.c +++ b/lapack-netlib/SRC/dorghr.c @@ -640,7 +640,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dorghr_(integer *n, integer *ilo, integer *ihi, +/* Subroutine */ void dorghr_(integer *n, integer *ilo, integer *ihi, doublereal *a, integer *lda, doublereal *tau, doublereal *work, integer *lwork, integer *info) { @@ -652,7 +652,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int dorgqr_(integer *, integer *, integer *, + extern /* Subroutine */ void dorgqr_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); integer lwkopt; @@ -703,16 +703,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORGHR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { 
work[1] = 1.; - return 0; + return; } /* Shift the vectors which define the elementary reflectors one */ @@ -767,7 +767,7 @@ f"> */ ilo], &work[1], lwork, &iinfo); } work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DORGHR */ diff --git a/lapack-netlib/SRC/dorgl2.c b/lapack-netlib/SRC/dorgl2.c index 377727c47e..6059146e61 100644 --- a/lapack-netlib/SRC/dorgl2.c +++ b/lapack-netlib/SRC/dorgl2.c @@ -622,7 +622,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dorgl2_(integer *m, integer *n, integer *k, doublereal * +/* Subroutine */ void dorgl2_(integer *m, integer *n, integer *k, doublereal * a, integer *lda, doublereal *tau, doublereal *work, integer *info) { /* System generated locals */ @@ -631,9 +631,10 @@ f"> */ /* Local variables */ integer i__, j, l; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dlarf_(char *, integer *, integer *, doublereal *, - integer *, doublereal *, doublereal *, integer *, doublereal *), xerbla_(char *, integer *, ftnlen); + integer *, doublereal *, doublereal *, integer *, doublereal *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -668,13 +669,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORGL2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m <= 0) { - return 0; + return; } if (*k < *m) { @@ -722,7 +723,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of DORGL2 */ diff --git a/lapack-netlib/SRC/dorglq.c b/lapack-netlib/SRC/dorglq.c index 8829a8e6c3..09b943728b 100644 --- a/lapack-netlib/SRC/dorglq.c +++ b/lapack-netlib/SRC/dorglq.c @@ -643,7 +643,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine 
*/ int dorglq_(integer *m, integer *n, integer *k, doublereal * +/* Subroutine */ void dorglq_(integer *m, integer *n, integer *k, doublereal * a, integer *lda, doublereal *tau, doublereal *work, integer *lwork, integer *info) { @@ -652,16 +652,17 @@ f"> */ /* Local variables */ integer i__, j, l, nbmin, iinfo; - extern /* Subroutine */ int dorgl2_(integer *, integer *, integer *, + extern /* Subroutine */ void dorgl2_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer ib, nb, ki, kk; - extern /* Subroutine */ int dlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void dlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer nx; - extern /* Subroutine */ int dlarft_(char *, char *, integer *, integer *, - doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlarft_(char *, char *, integer *, integer *, + doublereal *, integer *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; @@ -707,16 +708,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORGLQ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m <= 0) { work[1] = 1.; - return 0; + return; } nbmin = 2; @@ -835,7 +836,7 @@ f"> */ } work[1] = (doublereal) iws; - return 0; + return; /* End of DORGLQ */ diff --git a/lapack-netlib/SRC/dorgql.c b/lapack-netlib/SRC/dorgql.c index 435fe5da0e..374ee65a83 100644 --- a/lapack-netlib/SRC/dorgql.c +++ b/lapack-netlib/SRC/dorgql.c @@ -644,7 +644,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* 
===================================================================== */ -/* Subroutine */ int dorgql_(integer *m, integer *n, integer *k, doublereal * +/* Subroutine */ void dorgql_(integer *m, integer *n, integer *k, doublereal * a, integer *lda, doublereal *tau, doublereal *work, integer *lwork, integer *info) { @@ -653,16 +653,17 @@ f"> */ /* Local variables */ integer i__, j, l, nbmin, iinfo; - extern /* Subroutine */ int dorg2l_(integer *, integer *, integer *, + extern /* Subroutine */ void dorg2l_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer ib, nb, kk; - extern /* Subroutine */ int dlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void dlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer nx; - extern /* Subroutine */ int dlarft_(char *, char *, integer *, integer *, - doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlarft_(char *, char *, integer *, integer *, + doublereal *, integer *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; @@ -719,15 +720,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORGQL", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } nbmin = 2; @@ -845,7 +846,7 @@ f"> */ } work[1] = (doublereal) iws; - return 0; + return; /* End of DORGQL */ diff --git a/lapack-netlib/SRC/dorgqr.c b/lapack-netlib/SRC/dorgqr.c index 5f8d68b75d..518d410b3d 100644 --- a/lapack-netlib/SRC/dorgqr.c +++ b/lapack-netlib/SRC/dorgqr.c @@ -644,7 +644,7 @@ f"> */ /* > \ingroup 
doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dorgqr_(integer *m, integer *n, integer *k, doublereal * +/* Subroutine */ void dorgqr_(integer *m, integer *n, integer *k, doublereal * a, integer *lda, doublereal *tau, doublereal *work, integer *lwork, integer *info) { @@ -653,16 +653,17 @@ f"> */ /* Local variables */ integer i__, j, l, nbmin, iinfo; - extern /* Subroutine */ int dorg2r_(integer *, integer *, integer *, + extern /* Subroutine */ void dorg2r_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer ib, nb, ki, kk; - extern /* Subroutine */ int dlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void dlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer nx; - extern /* Subroutine */ int dlarft_(char *, char *, integer *, integer *, - doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlarft_(char *, char *, integer *, integer *, + doublereal *, integer *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; @@ -708,16 +709,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORGQR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { work[1] = 1.; - return 0; + return; } nbmin = 2; @@ -836,7 +837,7 @@ f"> */ } work[1] = (doublereal) iws; - return 0; + return; /* End of DORGQR */ diff --git a/lapack-netlib/SRC/dorgr2.c b/lapack-netlib/SRC/dorgr2.c index ca626ff31b..afc3fa7ba3 100644 --- a/lapack-netlib/SRC/dorgr2.c +++ b/lapack-netlib/SRC/dorgr2.c @@ 
-624,7 +624,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dorgr2_(integer *m, integer *n, integer *k, doublereal * +/* Subroutine */ void dorgr2_(integer *m, integer *n, integer *k, doublereal * a, integer *lda, doublereal *tau, doublereal *work, integer *info) { /* System generated locals */ @@ -633,7 +633,7 @@ f"> */ /* Local variables */ integer i__, j, l; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *); integer ii; @@ -672,13 +672,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORGR2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m <= 0) { - return 0; + return; } if (*k < *m) { @@ -724,7 +724,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of DORGR2 */ diff --git a/lapack-netlib/SRC/dorgrq.c b/lapack-netlib/SRC/dorgrq.c index 267f883aa5..fa3a054b66 100644 --- a/lapack-netlib/SRC/dorgrq.c +++ b/lapack-netlib/SRC/dorgrq.c @@ -644,7 +644,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dorgrq_(integer *m, integer *n, integer *k, doublereal * +/* Subroutine */ void dorgrq_(integer *m, integer *n, integer *k, doublereal * a, integer *lda, doublereal *tau, doublereal *work, integer *lwork, integer *info) { @@ -653,16 +653,17 @@ f"> */ /* Local variables */ integer i__, j, l, nbmin, iinfo; - extern /* Subroutine */ int dorgr2_(integer *, integer *, integer *, + extern /* Subroutine */ void dorgr2_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer ib, nb, ii, kk; - extern /* Subroutine */ int dlarfb_(char *, char *, char *, char 
*, + extern /* Subroutine */ void dlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer nx; - extern /* Subroutine */ int dlarft_(char *, char *, integer *, integer *, - doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlarft_(char *, char *, integer *, integer *, + doublereal *, integer *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; @@ -719,15 +720,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORGRQ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m <= 0) { - return 0; + return; } nbmin = 2; @@ -845,7 +846,7 @@ f"> */ } work[1] = (doublereal) iws; - return 0; + return; /* End of DORGRQ */ diff --git a/lapack-netlib/SRC/dorgtr.c b/lapack-netlib/SRC/dorgtr.c index 65035cafe6..d2cdcee027 100644 --- a/lapack-netlib/SRC/dorgtr.c +++ b/lapack-netlib/SRC/dorgtr.c @@ -637,7 +637,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dorgtr_(char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ void dorgtr_(char *uplo, integer *n, doublereal *a, integer * lda, doublereal *tau, doublereal *work, integer *lwork, integer *info) { /* System generated locals */ @@ -652,7 +652,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int dorgql_(integer *, integer *, integer *, + extern /* Subroutine */ void dorgql_(integer *, integer *, integer *, 
doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dorgqr_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); @@ -719,16 +719,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORGTR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { work[1] = 1.; - return 0; + return; } if (upper) { @@ -799,7 +799,7 @@ f"> */ } } work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DORGTR */ diff --git a/lapack-netlib/SRC/dorgtsqr.c b/lapack-netlib/SRC/dorgtsqr.c index ab35ac4a99..e0d43d9f87 100644 --- a/lapack-netlib/SRC/dorgtsqr.c +++ b/lapack-netlib/SRC/dorgtsqr.c @@ -688,7 +688,7 @@ r.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dorgtsqr_(integer *m, integer *n, integer *mb, integer * +/* Subroutine */ void dorgtsqr_(integer *m, integer *n, integer *mb, integer * nb, doublereal *a, integer *lda, doublereal *t, integer *ldt, doublereal *work, integer *lwork, integer *info) { @@ -696,17 +696,17 @@ r.f"> */ integer a_dim1, a_offset, t_dim1, t_offset, i__1, i__2; /* Local variables */ - extern /* Subroutine */ int dlamtsqr_(char *, char *, integer *, integer * + extern /* Subroutine */ void dlamtsqr_(char *, char *, integer *, integer * , integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer lworkopt, j, iinfo; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer lc, lw; - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, - doublereal *, doublereal *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, + 
doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical lquery; integer ldc, nblocal; @@ -786,17 +786,17 @@ r.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORGTSQR", &i__1, (ftnlen)8); - return 0; + return; } else if (lquery) { work[1] = (doublereal) lworkopt; - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { work[1] = (doublereal) lworkopt; - return 0; + return; } /* (1) Form explicitly the tall-skinny M-by-N left submatrix Q1_in */ @@ -830,7 +830,7 @@ r.f"> */ } work[1] = (doublereal) lworkopt; - return 0; + return; /* End of DORGTSQR */ diff --git a/lapack-netlib/SRC/dorgtsqr_row.c b/lapack-netlib/SRC/dorgtsqr_row.c index dfaee56c79..6bb13f718f 100644 --- a/lapack-netlib/SRC/dorgtsqr_row.c +++ b/lapack-netlib/SRC/dorgtsqr_row.c @@ -701,7 +701,7 @@ r_row.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dorgtsqr_row_(integer *m, integer *n, integer *mb, +/* Subroutine */ void dorgtsqr_row_(integer *m, integer *n, integer *mb, integer *nb, doublereal *a, integer *lda, doublereal *t, integer *ldt, doublereal *work, integer *lwork, integer *info) { @@ -712,13 +712,13 @@ r_row.f"> */ integer jb_t__, itmp, lworkopt; doublereal dummy[1] /* was [1][1] */; integer ib_bottom__, ib, kb; - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, - doublereal *, doublereal *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, + doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer mb1, mb2, m_plus_one__; logical lquery; integer num_all_row_blocks__, imb, knb, nblocal, kb_last__; - extern /* Subroutine */ int dlarfb_gett_(char *, integer *, integer *, + extern /* Subroutine */ void dlarfb_gett_(char *, integer *, integer *, integer *, doublereal *, integer 
*, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); @@ -780,17 +780,17 @@ r_row.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORGTSQR_ROW", &i__1, (ftnlen)12); - return 0; + return; } else if (lquery) { work[1] = (doublereal) lworkopt; - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { work[1] = (doublereal) lworkopt; - return 0; + return; } /* (0) Set the upper-triangular part of the matrix A to zero and */ @@ -912,7 +912,7 @@ r_row.f"> */ } work[1] = (doublereal) lworkopt; - return 0; + return; /* End of DORGTSQR_ROW */ diff --git a/lapack-netlib/SRC/dorhr_col.c b/lapack-netlib/SRC/dorhr_col.c index 67ab4d40e3..03f00b7895 100644 --- a/lapack-netlib/SRC/dorhr_col.c +++ b/lapack-netlib/SRC/dorhr_col.c @@ -773,7 +773,7 @@ ol.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dorhr_col_(integer *m, integer *n, integer *nb, +/* Subroutine */ void dorhr_col_(integer *m, integer *n, integer *nb, doublereal *a, integer *lda, doublereal *t, integer *ldt, doublereal * d__, integer *info) { @@ -781,13 +781,13 @@ ol.f"> */ integer a_dim1, a_offset, t_dim1, t_offset, i__1, i__2, i__3, i__4; /* Local variables */ - extern /* Subroutine */ int dlaorhr_col_getrfnp_(integer *, integer *, + extern /* Subroutine */ void dlaorhr_col_getrfnp_(integer *, integer *, doublereal *, integer *, doublereal *, integer *); integer nplusone, i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); integer iinfo; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); @@ -839,13 +839,13 @@ ol.f"> */ if 
(*info != 0) { i__1 = -(*info); xerbla_("DORHR_COL", &i__1, (ftnlen)9); - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* On input, the M-by-N matrix A contains the orthogonal */ @@ -974,7 +974,7 @@ ol.f"> */ } - return 0; + return; /* End of DORHR_COL */ diff --git a/lapack-netlib/SRC/dorm22.c b/lapack-netlib/SRC/dorm22.c index af913ce7f0..9f3b9bf320 100644 --- a/lapack-netlib/SRC/dorm22.c +++ b/lapack-netlib/SRC/dorm22.c @@ -675,7 +675,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dorm22_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void dorm22_(char *side, char *trans, integer *m, integer *n, integer *n1, integer *n2, doublereal *q, integer *ldq, doublereal * c__, integer *ldc, doublereal *work, integer *lwork, integer *info) { @@ -685,17 +685,17 @@ f"> */ /* Local variables */ logical left; integer i__; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrmm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer nb, nq, nw; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical notran; integer ldwork, lwkopt; logical lquery; @@ -771,16 +771,16 @@ f"> */ if (*info != 0) { i__1 = 
-(*info); xerbla_("DORM22", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { work[1] = 1.; - return 0; + return; } /* Degenerate cases (N1 = 0 or N2 = 0) are handled using DTRMM. */ @@ -789,12 +789,12 @@ f"> */ dtrmm_(side, "Upper", trans, "Non-Unit", m, n, &c_b10, &q[q_offset], ldq, &c__[c_offset], ldc); work[1] = 1.; - return 0; + return; } else if (*n2 == 0) { dtrmm_(side, "Lower", trans, "Non-Unit", m, n, &c_b10, &q[q_offset], ldq, &c__[c_offset], ldc); work[1] = 1.; - return 0; + return; } /* Compute the largest chunk size available from the workspace. */ @@ -979,7 +979,7 @@ f"> */ } work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DORM22 */ diff --git a/lapack-netlib/SRC/dorm2l.c b/lapack-netlib/SRC/dorm2l.c index 51df94f32e..9d62a36d5f 100644 --- a/lapack-netlib/SRC/dorm2l.c +++ b/lapack-netlib/SRC/dorm2l.c @@ -672,7 +672,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dorm2l_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void dorm2l_(char *side, char *trans, integer *m, integer *n, integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal * c__, integer *ldc, doublereal *work, integer *info) { @@ -682,7 +682,7 @@ f"> */ /* Local variables */ logical left; integer i__; - extern /* Subroutine */ int dlarf_(char *, integer *, integer *, + extern /* Subroutine */ void dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *); extern logical lsame_(char *, char *); @@ -743,13 +743,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORM2L", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && notran || ! left && ! 
notran) { @@ -792,7 +792,7 @@ f"> */ a[nq - *k + i__ + i__ * a_dim1] = aii; /* L10: */ } - return 0; + return; /* End of DORM2L */ diff --git a/lapack-netlib/SRC/dorm2r.c b/lapack-netlib/SRC/dorm2r.c index d3f67f1f8f..7ad7af7df9 100644 --- a/lapack-netlib/SRC/dorm2r.c +++ b/lapack-netlib/SRC/dorm2r.c @@ -672,7 +672,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dorm2r_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void dorm2r_(char *side, char *trans, integer *m, integer *n, integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal * c__, integer *ldc, doublereal *work, integer *info) { @@ -682,7 +682,7 @@ f"> */ /* Local variables */ logical left; integer i__; - extern /* Subroutine */ int dlarf_(char *, integer *, integer *, + extern /* Subroutine */ void dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *); extern logical lsame_(char *, char *); @@ -743,13 +743,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORM2R", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && ! notran || ! 
left && notran) { @@ -796,7 +796,7 @@ f"> */ a[i__ + i__ * a_dim1] = aii; /* L10: */ } - return 0; + return; /* End of DORM2R */ diff --git a/lapack-netlib/SRC/dormbr.c b/lapack-netlib/SRC/dormbr.c index 1dc15e573c..ba3a607dad 100644 --- a/lapack-netlib/SRC/dormbr.c +++ b/lapack-netlib/SRC/dormbr.c @@ -709,7 +709,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dormbr_(char *vect, char *side, char *trans, integer *m, +/* Subroutine */ void dormbr_(char *vect, char *side, char *trans, integer *m, integer *n, integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal *c__, integer *ldc, doublereal *work, integer *lwork, integer *info) @@ -726,11 +726,11 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int dormlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormlq_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); logical notran; - extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); logical applyq; @@ -849,16 +849,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORMBR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ work[1] = 1.; if (*m == 0 || *n == 0) { - return 0; + return; } if (applyq) { @@ -927,7 +927,7 @@ f"> */ } } work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DORMBR */ diff --git a/lapack-netlib/SRC/dormhr.c b/lapack-netlib/SRC/dormhr.c index 3ee75ebef3..25f949cb99 
100644 --- a/lapack-netlib/SRC/dormhr.c +++ b/lapack-netlib/SRC/dormhr.c @@ -692,7 +692,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dormhr_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void dormhr_(char *side, char *trans, integer *m, integer *n, integer *ilo, integer *ihi, doublereal *a, integer *lda, doublereal * tau, doublereal *c__, integer *ldc, doublereal *work, integer *lwork, integer *info) @@ -709,7 +709,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int dormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormqr_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer lwkopt; @@ -796,16 +796,16 @@ f"> */ if (*info != 0) { i__2 = -(*info); xerbla_("DORMHR", &i__2, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || nh == 0) { work[1] = 1.; - return 0; + return; } if (left) { @@ -824,7 +824,7 @@ f"> */ tau[*ilo], &c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &iinfo); work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DORMHR */ diff --git a/lapack-netlib/SRC/dorml2.c b/lapack-netlib/SRC/dorml2.c index 7160eac6ab..a9882d6270 100644 --- a/lapack-netlib/SRC/dorml2.c +++ b/lapack-netlib/SRC/dorml2.c @@ -668,7 +668,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dorml2_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void dorml2_(char *side, char *trans, integer *m, integer *n, integer *k, doublereal *a, integer *lda, doublereal *tau, 
doublereal * c__, integer *ldc, doublereal *work, integer *info) { @@ -678,7 +678,7 @@ f"> */ /* Local variables */ logical left; integer i__; - extern /* Subroutine */ int dlarf_(char *, integer *, integer *, + extern /* Subroutine */ void dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *); extern logical lsame_(char *, char *); @@ -739,13 +739,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORML2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && notran || ! left && ! notran) { @@ -792,7 +792,7 @@ f"> */ a[i__ + i__ * a_dim1] = aii; /* L10: */ } - return 0; + return; /* End of DORML2 */ diff --git a/lapack-netlib/SRC/dormlq.c b/lapack-netlib/SRC/dormlq.c index 175860e487..ac319e054e 100644 --- a/lapack-netlib/SRC/dormlq.c +++ b/lapack-netlib/SRC/dormlq.c @@ -682,7 +682,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dormlq_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void dormlq_(char *side, char *trans, integer *m, integer *n, integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal * c__, integer *ldc, doublereal *work, integer *lwork, integer *info) { @@ -697,17 +697,18 @@ f"> */ integer i__; extern logical lsame_(char *, char *); integer nbmin, iinfo, i1, i2, i3; - extern /* Subroutine */ int dorml2_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dorml2_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer ib, ic, jc, nb, mi, ni; - extern /* Subroutine */ int dlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void dlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, 
integer *, doublereal *, integer *, doublereal *, integer *); integer nq, nw; - extern /* Subroutine */ int dlarft_(char *, char *, integer *, integer *, - doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlarft_(char *, char *, integer *, integer *, + doublereal *, integer *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); logical notran; @@ -791,16 +792,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORMLQ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { work[1] = 1.; - return 0; + return; } nbmin = 2; @@ -890,7 +891,7 @@ f"> */ } } work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DORMLQ */ diff --git a/lapack-netlib/SRC/dormql.c b/lapack-netlib/SRC/dormql.c index 3ccc9681a4..3d421e212b 100644 --- a/lapack-netlib/SRC/dormql.c +++ b/lapack-netlib/SRC/dormql.c @@ -682,7 +682,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dormql_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void dormql_(char *side, char *trans, integer *m, integer *n, integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal * c__, integer *ldc, doublereal *work, integer *lwork, integer *info) { @@ -697,17 +697,18 @@ f"> */ integer i__; extern logical lsame_(char *, char *); integer nbmin, iinfo, i1, i2, i3; - extern /* Subroutine */ int dorm2l_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dorm2l_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer ib, nb, mi, ni; - extern /* Subroutine 
*/ int dlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void dlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer nq, nw; - extern /* Subroutine */ int dlarft_(char *, char *, integer *, integer *, - doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlarft_(char *, char *, integer *, integer *, + doublereal *, integer *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); logical notran; @@ -793,15 +794,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORMQL", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } nbmin = 2; @@ -881,7 +882,7 @@ f"> */ } } work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DORMQL */ diff --git a/lapack-netlib/SRC/dormqr.c b/lapack-netlib/SRC/dormqr.c index 3278bd7578..45281d7768 100644 --- a/lapack-netlib/SRC/dormqr.c +++ b/lapack-netlib/SRC/dormqr.c @@ -682,7 +682,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dormqr_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void dormqr_(char *side, char *trans, integer *m, integer *n, integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal * c__, integer *ldc, doublereal *work, integer *lwork, integer *info) { @@ -697,17 +697,18 @@ f"> */ integer i__; extern logical lsame_(char *, char *); integer nbmin, iinfo, i1, i2, i3; - extern /* Subroutine */ int dorm2r_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dorm2r_(char *, char *, 
integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer ib, ic, jc, nb, mi, ni; - extern /* Subroutine */ int dlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void dlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer nq, nw; - extern /* Subroutine */ int dlarft_(char *, char *, integer *, integer *, - doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlarft_(char *, char *, integer *, integer *, + doublereal *, integer *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); logical notran; @@ -789,16 +790,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORMQR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { work[1] = 1.; - return 0; + return; } nbmin = 2; @@ -882,7 +883,7 @@ f"> */ } } work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DORMQR */ diff --git a/lapack-netlib/SRC/dormr2.c b/lapack-netlib/SRC/dormr2.c index 65c1f5695d..dbb930ac03 100644 --- a/lapack-netlib/SRC/dormr2.c +++ b/lapack-netlib/SRC/dormr2.c @@ -668,7 +668,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dormr2_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void dormr2_(char *side, char *trans, integer *m, integer *n, integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal * c__, integer *ldc, doublereal *work, integer *info) { @@ -678,7 +678,7 @@ f"> */ /* Local variables */ logical left; 
integer i__; - extern /* Subroutine */ int dlarf_(char *, integer *, integer *, + extern /* Subroutine */ void dlarf_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *); extern logical lsame_(char *, char *); @@ -739,13 +739,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORMR2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && ! notran || ! left && notran) { @@ -788,7 +788,7 @@ f"> */ a[i__ + (nq - *k + i__) * a_dim1] = aii; /* L10: */ } - return 0; + return; /* End of DORMR2 */ diff --git a/lapack-netlib/SRC/dormr3.c b/lapack-netlib/SRC/dormr3.c index 5a581e06e0..ba19bfd26e 100644 --- a/lapack-netlib/SRC/dormr3.c +++ b/lapack-netlib/SRC/dormr3.c @@ -687,7 +687,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dormr3_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void dormr3_(char *side, char *trans, integer *m, integer *n, integer *k, integer *l, doublereal *a, integer *lda, doublereal *tau, doublereal *c__, integer *ldc, doublereal *work, integer *info) { @@ -698,7 +698,7 @@ f"> */ logical left; integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dlarz_(char *, integer *, integer *, integer * + extern /* Subroutine */ void dlarz_(char *, integer *, integer *, integer * , doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *); integer i1, i2, i3, ja, ic, jc, mi, ni, nq; @@ -759,13 +759,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORMR3", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && ! notran || ! 
left && notran) { @@ -813,7 +813,7 @@ f"> */ /* L10: */ } - return 0; + return; /* End of DORMR3 */ diff --git a/lapack-netlib/SRC/dormrq.c b/lapack-netlib/SRC/dormrq.c index b399d9ce90..09f8dc535d 100644 --- a/lapack-netlib/SRC/dormrq.c +++ b/lapack-netlib/SRC/dormrq.c @@ -682,7 +682,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dormrq_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void dormrq_(char *side, char *trans, integer *m, integer *n, integer *k, doublereal *a, integer *lda, doublereal *tau, doublereal * c__, integer *ldc, doublereal *work, integer *lwork, integer *info) { @@ -697,17 +697,18 @@ f"> */ integer i__; extern logical lsame_(char *, char *); integer nbmin, iinfo, i1, i2, i3; - extern /* Subroutine */ int dormr2_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormr2_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer ib, nb, mi, ni; - extern /* Subroutine */ int dlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void dlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer nq, nw; - extern /* Subroutine */ int dlarft_(char *, char *, integer *, integer *, - doublereal *, integer *, doublereal *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlarft_(char *, char *, integer *, integer *, + doublereal *, integer *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); logical notran; @@ -795,15 +796,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORMRQ", &i__1, (ftnlen)6); 
- return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } nbmin = 2; @@ -889,7 +890,7 @@ f"> */ } } work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DORMRQ */ diff --git a/lapack-netlib/SRC/dormrz.c b/lapack-netlib/SRC/dormrz.c index ed38ebe7ec..ae00e8c9b4 100644 --- a/lapack-netlib/SRC/dormrz.c +++ b/lapack-netlib/SRC/dormrz.c @@ -702,7 +702,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dormrz_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void dormrz_(char *side, char *trans, integer *m, integer *n, integer *k, integer *l, doublereal *a, integer *lda, doublereal *tau, doublereal *c__, integer *ldc, doublereal *work, integer *lwork, integer *info) @@ -718,14 +718,14 @@ f"> */ integer i__; extern logical lsame_(char *, char *); integer nbmin, iinfo, i1, i2, i3; - extern /* Subroutine */ int dormr3_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormr3_(char *, char *, integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer ib, ic, ja, jc, nb, mi, ni, nq, nw; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int dlarzb_(char *, char *, char *, char *, + extern /* Subroutine */ void dlarzb_(char *, char *, char *, char *, integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *), dlarzt_(char *, char @@ -818,16 +818,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DORMRZ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n 
== 0) { work[1] = 1.; - return 0; + return; } nbmin = 2; @@ -922,7 +922,7 @@ f"> */ work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DORMRZ */ diff --git a/lapack-netlib/SRC/dormtr.c b/lapack-netlib/SRC/dormtr.c index 29c830c51a..6d9b2ef796 100644 --- a/lapack-netlib/SRC/dormtr.c +++ b/lapack-netlib/SRC/dormtr.c @@ -685,7 +685,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dormtr_(char *side, char *uplo, char *trans, integer *m, +/* Subroutine */ void dormtr_(char *side, char *uplo, char *trans, integer *m, integer *n, doublereal *a, integer *lda, doublereal *tau, doublereal * c__, integer *ldc, doublereal *work, integer *lwork, integer *info) { @@ -703,7 +703,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int dormql_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dormql_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *), dormqr_(char *, char *, integer *, integer *, integer *, @@ -817,16 +817,16 @@ f"> */ if (*info != 0) { i__2 = -(*info); xerbla_("DORMTR", &i__2, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || nq == 1) { work[1] = 1.; - return 0; + return; } if (left) { @@ -860,7 +860,7 @@ f"> */ c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &iinfo); } work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DORMTR */ diff --git a/lapack-netlib/SRC/dpbcon.c b/lapack-netlib/SRC/dpbcon.c index 0272bf665d..09fd9b8621 100644 --- a/lapack-netlib/SRC/dpbcon.c +++ b/lapack-netlib/SRC/dpbcon.c @@ -644,7 +644,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* 
===================================================================== */ -/* Subroutine */ int dpbcon_(char *uplo, integer *n, integer *kd, doublereal * +/* Subroutine */ void dpbcon_(char *uplo, integer *n, integer *kd, doublereal * ab, integer *ldab, doublereal *anorm, doublereal *rcond, doublereal * work, integer *iwork, integer *info) { @@ -657,16 +657,16 @@ f"> */ doublereal scale; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int drscl_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void drscl_(integer *, doublereal *, doublereal *, integer *); logical upper; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *); integer ix; doublereal scalel; extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int dlatbs_(char *, char *, char *, char *, + extern /* Subroutine */ void dlatbs_(char *, char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *); doublereal scaleu; @@ -711,7 +711,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPBCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -719,9 +719,9 @@ f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm == 0.) 
{ - return 0; + return; } smlnum = dlamch_("Safe minimum"); @@ -785,7 +785,7 @@ f"> */ L20: - return 0; + return; /* End of DPBCON */ diff --git a/lapack-netlib/SRC/dpbequ.c b/lapack-netlib/SRC/dpbequ.c index 2b09d7254f..97fc2189a0 100644 --- a/lapack-netlib/SRC/dpbequ.c +++ b/lapack-netlib/SRC/dpbequ.c @@ -638,7 +638,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dpbequ_(char *uplo, integer *n, integer *kd, doublereal * +/* Subroutine */ void dpbequ_(char *uplo, integer *n, integer *kd, doublereal * ab, integer *ldab, doublereal *s, doublereal *scond, doublereal *amax, integer *info) { @@ -686,7 +686,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPBEQU", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -694,7 +694,7 @@ f"> */ if (*n == 0) { *scond = 1.; *amax = 0.; - return 0; + return; } if (upper) { @@ -731,7 +731,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { if (s[i__] <= 0.) 
{ *info = i__; - return 0; + return; } /* L20: */ } @@ -750,7 +750,7 @@ f"> */ *scond = sqrt(smin) / sqrt(*amax); } - return 0; + return; /* End of DPBEQU */ diff --git a/lapack-netlib/SRC/dpbrfs.c b/lapack-netlib/SRC/dpbrfs.c index d5fa710e13..1878fd5461 100644 --- a/lapack-netlib/SRC/dpbrfs.c +++ b/lapack-netlib/SRC/dpbrfs.c @@ -703,7 +703,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dpbrfs_(char *uplo, integer *n, integer *kd, integer * +/* Subroutine */ void dpbrfs_(char *uplo, integer *n, integer *kd, integer * nrhs, doublereal *ab, integer *ldab, doublereal *afb, integer *ldafb, doublereal *b, integer *ldb, doublereal *x, integer *ldx, doublereal * ferr, doublereal *berr, doublereal *work, integer *iwork, integer * @@ -721,7 +721,7 @@ f"> */ doublereal s; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int dsbmv_(char *, integer *, integer *, + extern /* Subroutine */ void dsbmv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), daxpy_(integer @@ -729,13 +729,14 @@ f"> */ ; integer count; logical upper; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *); doublereal xk; integer nz; doublereal safmin; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dpbtrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dpbtrs_( char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); doublereal lstres, eps; @@ -793,7 +794,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPBRFS", &i__1, (ftnlen)6); - 
return 0; + return; } /* Quick return if possible */ @@ -805,7 +806,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -1012,7 +1013,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of DPBRFS */ diff --git a/lapack-netlib/SRC/dpbstf.c b/lapack-netlib/SRC/dpbstf.c index cdbcc133ae..754af635fb 100644 --- a/lapack-netlib/SRC/dpbstf.c +++ b/lapack-netlib/SRC/dpbstf.c @@ -666,7 +666,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dpbstf_(char *uplo, integer *n, integer *kd, doublereal * +/* Subroutine */ void dpbstf_(char *uplo, integer *n, integer *kd, doublereal * ab, integer *ldab, integer *info) { /* System generated locals */ @@ -674,10 +674,10 @@ f"> */ doublereal d__1; /* Local variables */ - extern /* Subroutine */ int dsyr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsyr_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer j, m; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); logical upper; @@ -718,13 +718,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPBSTF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Computing MAX */ @@ -850,11 +850,11 @@ f"> */ /* L40: */ } } - return 0; + return; L50: *info = j; - return 0; + return; /* End of DPBSTF */ diff --git a/lapack-netlib/SRC/dpbsv.c b/lapack-netlib/SRC/dpbsv.c index 0cac50884f..48a6609334 100644 --- a/lapack-netlib/SRC/dpbsv.c +++ b/lapack-netlib/SRC/dpbsv.c @@ -673,7 +673,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* 
===================================================================== */ -/* Subroutine */ int dpbsv_(char *uplo, integer *n, integer *kd, integer * +/* Subroutine */ void dpbsv_(char *uplo, integer *n, integer *kd, integer * nrhs, doublereal *ab, integer *ldab, doublereal *b, integer *ldb, integer *info) { @@ -682,7 +682,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dpbtrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dpbtrf_( char *, integer *, integer *, doublereal *, integer *, integer *), dpbtrs_(char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); @@ -724,7 +725,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("DPBSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the Cholesky factorization A = U**T*U or A = L*L**T. 
*/ @@ -738,7 +739,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ info); } - return 0; + return; /* End of DPBSV */ diff --git a/lapack-netlib/SRC/dpbsvx.c b/lapack-netlib/SRC/dpbsvx.c index 24d216ee1f..91314848db 100644 --- a/lapack-netlib/SRC/dpbsvx.c +++ b/lapack-netlib/SRC/dpbsvx.c @@ -854,7 +854,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dpbsvx_(char *fact, char *uplo, integer *n, integer *kd, +/* Subroutine */ void dpbsvx_(char *fact, char *uplo, integer *n, integer *kd, integer *nrhs, doublereal *ab, integer *ldab, doublereal *afb, integer *ldafb, char *equed, doublereal *s, doublereal *b, integer * ldb, doublereal *x, integer *ldx, doublereal *rcond, doublereal *ferr, @@ -870,31 +870,32 @@ f"> */ integer i__, j; extern logical lsame_(char *, char *); doublereal scond, anorm; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); logical equil, rcequ, upper; integer j1, j2; extern doublereal dlamch_(char *), dlansb_(char *, char *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dpbcon_(char *, integer *, integer *, + extern /* Subroutine */ void dpbcon_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *), dlaqsb_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, char *); logical nofact; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen), dpbequ_(char *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void 
dpbequ_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *); doublereal bignum; - extern /* Subroutine */ int dpbrfs_(char *, integer *, integer *, integer + extern /* Subroutine */ void dpbrfs_(char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *), dpbtrf_(char *, integer *, integer *, doublereal *, integer *, integer *); integer infequ; - extern /* Subroutine */ int dpbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void dpbtrs_(char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); doublereal smlnum; @@ -994,7 +995,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPBSVX", &i__1, (ftnlen)6); - return 0; + return; } if (equil) { @@ -1061,7 +1062,7 @@ f"> */ if (*info > 0) { *rcond = 0.; - return 0; + return; } } @@ -1113,7 +1114,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of DPBSVX */ diff --git a/lapack-netlib/SRC/dpbtf2.c b/lapack-netlib/SRC/dpbtf2.c index d4f4a53adf..d7cac9cbf4 100644 --- a/lapack-netlib/SRC/dpbtf2.c +++ b/lapack-netlib/SRC/dpbtf2.c @@ -657,7 +657,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dpbtf2_(char *uplo, integer *n, integer *kd, doublereal * +/* Subroutine */ void dpbtf2_(char *uplo, integer *n, integer *kd, doublereal * ab, integer *ldab, integer *info) { /* System generated locals */ @@ -665,10 +665,10 @@ f"> */ doublereal d__1; /* Local variables */ - extern /* Subroutine */ int dsyr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsyr_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ 
void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); logical upper; @@ -709,13 +709,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPBTF2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Computing MAX */ @@ -783,11 +783,11 @@ f"> */ /* L20: */ } } - return 0; + return; L30: *info = j; - return 0; + return; /* End of DPBTF2 */ diff --git a/lapack-netlib/SRC/dpbtrf.c b/lapack-netlib/SRC/dpbtrf.c index 6fca250113..ecb49b8022 100644 --- a/lapack-netlib/SRC/dpbtrf.c +++ b/lapack-netlib/SRC/dpbtrf.c @@ -659,7 +659,7 @@ f"> */ /* > Peter Mayes and Giuseppe Radicati, IBM ECSEC, Rome, March 23, 1989 */ /* ===================================================================== */ -/* Subroutine */ int dpbtrf_(char *uplo, integer *n, integer *kd, doublereal * +/* Subroutine */ void dpbtrf_(char *uplo, integer *n, integer *kd, doublereal * ab, integer *ldab, integer *info) { /* System generated locals */ @@ -668,18 +668,19 @@ f"> */ /* Local variables */ doublereal work[1056] /* was [33][32] */; integer i__, j; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer i2, i3; - extern /* Subroutine */ int dsyrk_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dsyrk_(char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *), dpbtf2_(char *, integer *, integer *, - doublereal *, integer *, integer *), 
dpotf2_(char *, + doublereal *, integer *, integer *); + extern int dpotf2_(char *, integer *, doublereal *, integer *, integer *); integer ib, nb, ii, jj; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -717,13 +718,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPBTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine the block size for this environment */ @@ -1015,10 +1016,10 @@ f"> */ } } } - return 0; + return; L150: - return 0; + return; /* End of DPBTRF */ diff --git a/lapack-netlib/SRC/dpbtrs.c b/lapack-netlib/SRC/dpbtrs.c index a5e47405c6..02f96e03f4 100644 --- a/lapack-netlib/SRC/dpbtrs.c +++ b/lapack-netlib/SRC/dpbtrs.c @@ -634,7 +634,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dpbtrs_(char *uplo, integer *n, integer *kd, integer * +/* Subroutine */ void dpbtrs_(char *uplo, integer *n, integer *kd, integer * nrhs, doublereal *ab, integer *ldab, doublereal *b, integer *ldb, integer *info) { @@ -644,7 +644,7 @@ f"> */ /* Local variables */ integer j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dtbsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtbsv_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *); logical upper; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -688,13 +688,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPBTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -735,7 +735,7 @@ f"> */ } } - return 0; + return; /* End of DPBTRS */ diff --git a/lapack-netlib/SRC/dpftrf.c b/lapack-netlib/SRC/dpftrf.c index fab9678a14..1552a222a4 100644 --- a/lapack-netlib/SRC/dpftrf.c +++ b/lapack-netlib/SRC/dpftrf.c @@ -713,7 +713,7 @@ 
f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dpftrf_(char *transr, char *uplo, integer *n, doublereal +/* Subroutine */ void dpftrf_(char *transr, char *uplo, integer *n, doublereal *a, integer *info) { /* System generated locals */ @@ -724,7 +724,7 @@ f"> */ logical normaltransr; extern logical lsame_(char *, char *); logical lower; - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), dsyrk_( char *, char *, integer *, integer *, doublereal *, doublereal *, @@ -760,13 +760,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPFTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* If N is odd, set NISODD = .TRUE. */ @@ -807,7 +807,7 @@ f"> */ dpotrf_("L", &n1, a, n, info); if (*info > 0) { - return 0; + return; } dtrsm_("R", "L", "T", "N", &n2, &n1, &c_b12, a, n, &a[n1], n); dsyrk_("U", "N", &n2, &n1, &c_b15, &a[n1], n, &c_b12, &a[*n], @@ -825,7 +825,7 @@ f"> */ dpotrf_("L", &n1, &a[n2], n, info); if (*info > 0) { - return 0; + return; } dtrsm_("L", "L", "N", "N", &n1, &n2, &c_b12, &a[n2], n, a, n); dsyrk_("U", "T", &n2, &n1, &c_b15, a, n, &c_b12, &a[n1], n); @@ -848,7 +848,7 @@ f"> */ dpotrf_("U", &n1, a, &n1, info); if (*info > 0) { - return 0; + return; } dtrsm_("L", "U", "T", "N", &n1, &n2, &c_b12, a, &n1, &a[n1 * n1], &n1); @@ -867,7 +867,7 @@ f"> */ dpotrf_("U", &n1, &a[n2 * n2], &n2, info); if (*info > 0) { - return 0; + return; } dtrsm_("R", "U", "N", "N", &n2, &n1, &c_b12, &a[n2 * n2], &n2, a, &n2); @@ -899,7 +899,7 @@ f"> */ i__1 = *n + 1; dpotrf_("L", &k, &a[1], &i__1, info); if (*info > 0) { - return 0; + return; } i__1 = *n + 1; i__2 = *n + 1; @@ -924,7 +924,7 @@ f"> */ i__1 = *n + 1; dpotrf_("L", &k, &a[k + 1], &i__1, 
info); if (*info > 0) { - return 0; + return; } i__1 = *n + 1; i__2 = *n + 1; @@ -954,7 +954,7 @@ f"> */ dpotrf_("U", &k, &a[k], &k, info); if (*info > 0) { - return 0; + return; } dtrsm_("L", "U", "T", "N", &k, &k, &c_b12, &a[k], &n1, &a[k * (k + 1)], &k); @@ -973,7 +973,7 @@ f"> */ dpotrf_("U", &k, &a[k * (k + 1)], &k, info); if (*info > 0) { - return 0; + return; } dtrsm_("R", "U", "N", "N", &k, &k, &c_b12, &a[k * (k + 1)], & k, a, &k); @@ -989,7 +989,7 @@ f"> */ } - return 0; + return; /* End of DPFTRF */ diff --git a/lapack-netlib/SRC/dpftri.c b/lapack-netlib/SRC/dpftri.c index 763a06450e..df7208a889 100644 --- a/lapack-netlib/SRC/dpftri.c +++ b/lapack-netlib/SRC/dpftri.c @@ -705,7 +705,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dpftri_(char *transr, char *uplo, integer *n, doublereal +/* Subroutine */ void dpftri_(char *transr, char *uplo, integer *n, doublereal *a, integer *info) { /* System generated locals */ @@ -715,18 +715,19 @@ f"> */ integer k; logical normaltransr; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrmm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical lower; - extern /* Subroutine */ int dsyrk_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dsyrk_(char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer n1, n2; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical nisodd; extern /* Subroutine */ int dlauum_(char *, integer *, doublereal *, - integer *, integer *), dtftri_(char *, char *, char *, + integer *, integer *); + extern void dtftri_(char *, char *, char *, integer *, doublereal *, integer *); @@ -754,20 +755,20 @@ f"> */ if (*info != 0) { i__1 = -(*info); 
xerbla_("DPFTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Invert the triangular Cholesky factor U or L. */ dtftri_(transr, uplo, "N", n, a, info); if (*info > 0) { - return 0; + return; } /* If N is odd, set NISODD = .TRUE. */ @@ -942,7 +943,7 @@ f"> */ } - return 0; + return; /* End of DPFTRI */ diff --git a/lapack-netlib/SRC/dpftrs.c b/lapack-netlib/SRC/dpftrs.c index 82305f70f4..4764e86a36 100644 --- a/lapack-netlib/SRC/dpftrs.c +++ b/lapack-netlib/SRC/dpftrs.c @@ -712,7 +712,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dpftrs_(char *transr, char *uplo, integer *n, integer * +/* Subroutine */ void dpftrs_(char *transr, char *uplo, integer *n, integer * nrhs, doublereal *a, doublereal *b, integer *ldb, integer *info) { /* System generated locals */ @@ -721,7 +721,7 @@ f"> */ /* Local variables */ logical normaltransr; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dtfsm_(char *, char *, char *, char *, char *, + extern /* Subroutine */ void dtfsm_(char *, char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *); logical lower; @@ -762,13 +762,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPFTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* start execution: there are two triangular solves */ @@ -785,7 +785,7 @@ f"> */ ldb); } - return 0; + return; /* End of DPFTRS */ diff --git a/lapack-netlib/SRC/dpocon.c b/lapack-netlib/SRC/dpocon.c index a05dcdd132..4699fa3a72 100644 --- a/lapack-netlib/SRC/dpocon.c +++ b/lapack-netlib/SRC/dpocon.c @@ -633,7 +633,7 @@ f"> */ /* > \ingroup doublePOcomputational */ /* ===================================================================== */ -/* Subroutine */ int dpocon_(char *uplo, integer *n, 
doublereal *a, integer * +/* Subroutine */ void dpocon_(char *uplo, integer *n, doublereal *a, integer * lda, doublereal *anorm, doublereal *rcond, doublereal *work, integer * iwork, integer *info) { @@ -646,10 +646,10 @@ f"> */ doublereal scale; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int drscl_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void drscl_(integer *, doublereal *, doublereal *, integer *); logical upper; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *); integer ix; @@ -658,7 +658,7 @@ f"> */ doublereal scaleu; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; - extern /* Subroutine */ int dlatrs_(char *, char *, char *, char *, + extern /* Subroutine */ void dlatrs_(char *, char *, char *, char *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *); char normin[1]; @@ -698,7 +698,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPOCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -706,9 +706,9 @@ f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm == 0.) 
{ - return 0; + return; } smlnum = dlamch_("Safe minimum"); @@ -769,7 +769,7 @@ f"> */ } L20: - return 0; + return; /* End of DPOCON */ diff --git a/lapack-netlib/SRC/dpoequ.c b/lapack-netlib/SRC/dpoequ.c index 738587e88b..95635a7c41 100644 --- a/lapack-netlib/SRC/dpoequ.c +++ b/lapack-netlib/SRC/dpoequ.c @@ -621,7 +621,7 @@ f"> */ /* > \ingroup doublePOcomputational */ /* ===================================================================== */ -/* Subroutine */ int dpoequ_(integer *n, doublereal *a, integer *lda, +/* Subroutine */ void dpoequ_(integer *n, doublereal *a, integer *lda, doublereal *s, doublereal *scond, doublereal *amax, integer *info) { /* System generated locals */ @@ -661,7 +661,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPOEQU", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -669,7 +669,7 @@ f"> */ if (*n == 0) { *scond = 1.; *amax = 0.; - return 0; + return; } /* Find the minimum and maximum diagonal elements. */ @@ -697,7 +697,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { if (s[i__] <= 0.) 
{ *info = i__; - return 0; + return; } /* L20: */ } @@ -716,7 +716,7 @@ f"> */ *scond = sqrt(smin) / sqrt(*amax); } - return 0; + return; /* End of DPOEQU */ diff --git a/lapack-netlib/SRC/dpoequb.c b/lapack-netlib/SRC/dpoequb.c index 2352f05f90..87b3fbec48 100644 --- a/lapack-netlib/SRC/dpoequb.c +++ b/lapack-netlib/SRC/dpoequb.c @@ -627,7 +627,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doublePOcomputational */ /* ===================================================================== */ -/* Subroutine */ int dpoequb_(integer *n, doublereal *a, integer *lda, +/* Subroutine */ void dpoequb_(integer *n, doublereal *a, integer *lda, doublereal *s, doublereal *scond, doublereal *amax, integer *info) { /* System generated locals */ @@ -671,7 +671,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("DPOEQUB", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible. */ @@ -679,7 +679,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*n == 0) { *scond = 1.; *amax = 0.; - return 0; + return; } base = dlamch_("B"); tmp = -.5 / log(base); @@ -709,7 +709,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (i__ = 1; i__ <= i__1; ++i__) { if (s[i__] <= 0.) 
{ *info = i__; - return 0; + return; } /* L20: */ } @@ -730,7 +730,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *scond = sqrt(smin) / sqrt(*amax); } - return 0; + return; /* End of DPOEQUB */ diff --git a/lapack-netlib/SRC/dporfs.c b/lapack-netlib/SRC/dporfs.c index 7a516ad095..91ca297f2d 100644 --- a/lapack-netlib/SRC/dporfs.c +++ b/lapack-netlib/SRC/dporfs.c @@ -697,7 +697,7 @@ f"> */ /* > \ingroup doublePOcomputational */ /* ===================================================================== */ -/* Subroutine */ int dporfs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void dporfs_(char *uplo, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal *af, integer *ldaf, doublereal *b, integer *ldb, doublereal *x, integer *ldx, doublereal * ferr, doublereal *berr, doublereal *work, integer *iwork, integer * @@ -715,12 +715,12 @@ f"> */ doublereal s; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer count; logical upper; - extern /* Subroutine */ int dsymv_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsymv_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); @@ -728,7 +728,8 @@ f"> */ doublereal xk; integer nz; doublereal safmin; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dpotrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dpotrs_( char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); doublereal lstres, eps; @@ -784,7 +785,7 @@ f"> */ if (*info != 0) { i__1 
= -(*info); xerbla_("DPORFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -796,7 +797,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -993,7 +994,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of DPORFS */ diff --git a/lapack-netlib/SRC/dporfsx.c b/lapack-netlib/SRC/dporfsx.c index 6393f39693..63408560e3 100644 --- a/lapack-netlib/SRC/dporfsx.c +++ b/lapack-netlib/SRC/dporfsx.c @@ -803,7 +803,7 @@ static integer c__1 = 1; /* > \ingroup doublePOcomputational */ /* ===================================================================== */ -/* Subroutine */ int dporfsx_(char *uplo, char *equed, integer *n, integer * +/* Subroutine */ void dporfsx_(char *uplo, char *equed, integer *n, integer * nrhs, doublereal *a, integer *lda, doublereal *af, integer *ldaf, doublereal *s, doublereal *b, integer *ldb, doublereal *x, integer * ldx, doublereal *rcond, doublereal *berr, integer *n_err_bnds__, @@ -819,7 +819,7 @@ static integer c__1 = 1; /* Local variables */ doublereal illrcond_thresh__, unstable_thresh__; - extern /* Subroutine */ int dla_porfsx_extended_(integer *, char *, + extern /* Subroutine */ void dla_porfsx_extended_(integer *, char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, logical *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, @@ -837,7 +837,8 @@ static integer c__1 = 1; doublereal rcond_tmp__; integer prec_type__; extern doublereal dlamch_(char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dpocon_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dpocon_( char *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *); extern doublereal dlansy_(char *, char *, integer *, doublereal *, @@ -955,7 +956,7 @@ static integer c__1 = 1; if (*info != 0) { 
i__1 = -(*info); xerbla_("DPORFSX", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible. */ @@ -978,7 +979,7 @@ static integer c__1 = 1; err_bnds_comp__[j + err_bnds_comp_dim1 * 3] = 1.; } } - return 0; + return; } /* Default to failure. */ @@ -1117,7 +1118,7 @@ static integer c__1 = 1; } } - return 0; + return; /* End of DPORFSX */ diff --git a/lapack-netlib/SRC/dposv.c b/lapack-netlib/SRC/dposv.c index 876f37ecc7..2e6b5019f2 100644 --- a/lapack-netlib/SRC/dposv.c +++ b/lapack-netlib/SRC/dposv.c @@ -639,7 +639,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doublePOsolve */ /* ===================================================================== */ -/* Subroutine */ int dposv_(char *uplo, integer *n, integer *nrhs, doublereal +/* Subroutine */ void dposv_(char *uplo, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal *b, integer *ldb, integer *info) { /* System generated locals */ @@ -647,9 +647,10 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dpotrf_( - char *, integer *, doublereal *, integer *, integer *), - dpotrs_(char *, integer *, integer *, doublereal *, integer *, + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int dpotrf_( + char *, integer *, doublereal *, integer *, integer *); + extern void dpotrs_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); @@ -688,7 +689,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("DPOSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the Cholesky factorization A = U**T*U or A = L*L**T. 
*/ @@ -701,7 +702,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ dpotrs_(uplo, n, nrhs, &a[a_offset], lda, &b[b_offset], ldb, info); } - return 0; + return; /* End of DPOSV */ diff --git a/lapack-netlib/SRC/dposvx.c b/lapack-netlib/SRC/dposvx.c index 1d9340d1b4..bd56317d83 100644 --- a/lapack-netlib/SRC/dposvx.c +++ b/lapack-netlib/SRC/dposvx.c @@ -814,7 +814,7 @@ f"> */ /* > \ingroup doublePOsolve */ /* ===================================================================== */ -/* Subroutine */ int dposvx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void dposvx_(char *fact, char *uplo, integer *n, integer * nrhs, doublereal *a, integer *lda, doublereal *af, integer *ldaf, char *equed, doublereal *s, doublereal *b, integer *ldb, doublereal * x, integer *ldx, doublereal *rcond, doublereal *ferr, doublereal * @@ -833,25 +833,26 @@ f"> */ logical equil, rcequ; extern doublereal dlamch_(char *); logical nofact; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal bignum; - extern /* Subroutine */ int dpocon_(char *, integer *, doublereal *, + extern /* Subroutine */ void dpocon_(char *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *); integer infequ; extern doublereal dlansy_(char *, char *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dlaqsy_(char *, integer *, doublereal *, + extern /* Subroutine */ void dlaqsy_(char *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, char *), dpoequ_(integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *), dporfs_( char *, integer *, integer *, 
doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, - doublereal *, doublereal *, doublereal *, integer *, integer *), dpotrf_(char *, integer *, doublereal *, integer *, + doublereal *, doublereal *, doublereal *, integer *, integer *); + extern int dpotrf_(char *, integer *, doublereal *, integer *, integer *); doublereal smlnum; - extern /* Subroutine */ int dpotrs_(char *, integer *, integer *, + extern /* Subroutine */ void dpotrs_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); @@ -948,7 +949,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPOSVX", &i__1, (ftnlen)6); - return 0; + return; } if (equil) { @@ -990,7 +991,7 @@ f"> */ if (*info > 0) { *rcond = 0.; - return 0; + return; } } @@ -1041,7 +1042,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of DPOSVX */ diff --git a/lapack-netlib/SRC/dposvxx.c b/lapack-netlib/SRC/dposvxx.c index 7a7a7b8aa4..9e7eeb14df 100644 --- a/lapack-netlib/SRC/dposvxx.c +++ b/lapack-netlib/SRC/dposvxx.c @@ -1000,7 +1000,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doublePOsolve */ /* ===================================================================== */ -/* Subroutine */ int dposvxx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void dposvxx_(char *fact, char *uplo, integer *n, integer * nrhs, doublereal *a, integer *lda, doublereal *af, integer *ldaf, char *equed, doublereal *s, doublereal *b, integer *ldb, doublereal * x, integer *ldx, doublereal *rcond, doublereal *rpvgrw, doublereal * @@ -1024,16 +1024,16 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ logical equil, rcequ; extern doublereal dlamch_(char *); logical nofact; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ 
void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal bignum; integer infequ; - extern /* Subroutine */ int dlaqsy_(char *, integer *, doublereal *, + extern /* Subroutine */ void dlaqsy_(char *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, char *), dpotrf_(char *, integer *, doublereal *, integer *, integer *); doublereal smlnum; - extern /* Subroutine */ int dpotrs_(char *, integer *, integer *, + extern /* Subroutine */ void dpotrs_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *), dlascl2_(integer *, integer *, doublereal *, doublereal * , integer *), dpoequb_(integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *), dporfsx_( @@ -1149,7 +1149,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("DPOSVXX", &i__1, (ftnlen)7); - return 0; + return; } if (equil) { @@ -1189,7 +1189,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rpvgrw = dla_porpvgrw_(uplo, info, &a[a_offset], lda, &af[ af_offset], ldaf, &work[1]); - return 0; + return; } } @@ -1218,7 +1218,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ dlascl2_(n, nrhs, &s[1], &x[x_offset], ldx); } - return 0; + return; /* End of DPOSVXX */ diff --git a/lapack-netlib/SRC/dpotf2.c b/lapack-netlib/SRC/dpotf2.c index a560a72111..b6a7328412 100644 --- a/lapack-netlib/SRC/dpotf2.c +++ b/lapack-netlib/SRC/dpotf2.c @@ -625,7 +625,7 @@ f"> */ /* > \ingroup doublePOcomputational */ /* ===================================================================== */ -/* Subroutine */ int dpotf2_(char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ void dpotf2_(char *uplo, integer *n, doublereal *a, integer * lda, integer *info) { /* System generated locals */ @@ 
-636,10 +636,10 @@ f"> */ extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *, integer *); integer j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); logical upper; @@ -677,13 +677,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPOTF2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -759,7 +759,7 @@ f"> */ *info = j; L40: - return 0; + return; /* End of DPOTF2 */ diff --git a/lapack-netlib/SRC/dpotrf.c b/lapack-netlib/SRC/dpotrf.c index dc4a608a08..2c09fd41ed 100644 --- a/lapack-netlib/SRC/dpotrf.c +++ b/lapack-netlib/SRC/dpotrf.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup doublePOcomputational */ /* ===================================================================== */ -/* Subroutine */ int dpotrf_(char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ void dpotrf_(char *uplo, integer *n, doublereal *a, integer * lda, integer *info) { /* System generated locals */ @@ -631,22 +631,22 @@ f"> */ /* Local variables */ integer j; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical 
upper; - extern /* Subroutine */ int dsyrk_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dsyrk_(char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer jb, nb; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int dpotrf2_(char *, integer *, doublereal *, + extern /* Subroutine */ void dpotrf2_(char *, integer *, doublereal *, integer *, integer *); @@ -679,13 +679,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPOTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine the block size for this environment. */ @@ -785,7 +785,7 @@ f"> */ *info = *info + j - 1; L40: - return 0; + return; /* End of DPOTRF */ diff --git a/lapack-netlib/SRC/dpotrf2.c b/lapack-netlib/SRC/dpotrf2.c index 08922f1390..5073fe1ae8 100644 --- a/lapack-netlib/SRC/dpotrf2.c +++ b/lapack-netlib/SRC/dpotrf2.c @@ -617,7 +617,7 @@ static doublereal c_b11 = -1.; /* > \ingroup doublePOcomputational */ /* ===================================================================== */ -/* Subroutine */ int dpotrf2_(char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ void dpotrf2_(char *uplo, integer *n, doublereal *a, integer * lda, integer *info) { /* System generated locals */ @@ -626,11 +626,11 @@ static doublereal c_b11 = -1.; /* Local variables */ extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical upper; - extern /* Subroutine */ int dsyrk_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dsyrk_(char *, char *, 
integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer n1, n2; @@ -667,13 +667,13 @@ static doublereal c_b11 = -1.; if (*info != 0) { i__1 = -(*info); xerbla_("DPOTRF2", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* N=1 case */ @@ -684,7 +684,7 @@ static doublereal c_b11 = -1.; if (a[a_dim1 + 1] <= 0. || disnan_(&a[a_dim1 + 1])) { *info = 1; - return 0; + return; } /* Factor */ @@ -702,7 +702,7 @@ static doublereal c_b11 = -1.; dpotrf2_(uplo, &n1, &a[a_dim1 + 1], lda, &iinfo); if (iinfo != 0) { *info = iinfo; - return 0; + return; } /* Compute the Cholesky factorization A = U**T*U */ @@ -721,7 +721,7 @@ static doublereal c_b11 = -1.; dpotrf2_(uplo, &n2, &a[n1 + 1 + (n1 + 1) * a_dim1], lda, &iinfo); if (iinfo != 0) { *info = iinfo + n1; - return 0; + return; } /* Compute the Cholesky factorization A = L*L**T */ @@ -740,11 +740,11 @@ static doublereal c_b11 = -1.; dpotrf2_(uplo, &n2, &a[n1 + 1 + (n1 + 1) * a_dim1], lda, &iinfo); if (iinfo != 0) { *info = iinfo + n1; - return 0; + return; } } } - return 0; + return; /* End of DPOTRF2 */ diff --git a/lapack-netlib/SRC/dpotri.c b/lapack-netlib/SRC/dpotri.c index 285575f3fd..c11d90d8da 100644 --- a/lapack-netlib/SRC/dpotri.c +++ b/lapack-netlib/SRC/dpotri.c @@ -604,7 +604,7 @@ f"> */ /* > \ingroup doublePOcomputational */ /* ===================================================================== */ -/* Subroutine */ int dpotri_(char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ void dpotri_(char *uplo, integer *n, doublereal *a, integer * lda, integer *info) { /* System generated locals */ @@ -612,9 +612,10 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dlauum_( - char *, integer *, doublereal *, integer *, integer *), - dtrtri_(char *, char *, integer *, doublereal *, integer *, + extern /* 
Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int dlauum_( + char *, integer *, doublereal *, integer *, integer *); + extern int dtrtri_(char *, char *, integer *, doublereal *, integer *, integer *); @@ -646,27 +647,27 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPOTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Invert the triangular Cholesky factor U or L. */ dtrtri_(uplo, "Non-unit", n, &a[a_offset], lda, info); if (*info > 0) { - return 0; + return; } /* Form inv(U) * inv(U)**T or inv(L)**T * inv(L). */ dlauum_(uplo, n, &a[a_offset], lda, info); - return 0; + return; /* End of DPOTRI */ diff --git a/lapack-netlib/SRC/dpotrs.c b/lapack-netlib/SRC/dpotrs.c index fcfc993d28..97ed17d616 100644 --- a/lapack-netlib/SRC/dpotrs.c +++ b/lapack-netlib/SRC/dpotrs.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup doublePOcomputational */ /* ===================================================================== */ -/* Subroutine */ int dpotrs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void dpotrs_(char *uplo, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal *b, integer *ldb, integer * info) { @@ -632,7 +632,7 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical upper; @@ -675,13 +675,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPOTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -712,7 +712,7 @@ f"> */ a_offset], lda, &b[b_offset], ldb); } - return 0; + return; /* End of DPOTRS */ diff --git a/lapack-netlib/SRC/dppcon.c b/lapack-netlib/SRC/dppcon.c index 1a09f5445f..c81be8bee4 100644 --- 
a/lapack-netlib/SRC/dppcon.c +++ b/lapack-netlib/SRC/dppcon.c @@ -631,7 +631,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dppcon_(char *uplo, integer *n, doublereal *ap, +/* Subroutine */ void dppcon_(char *uplo, integer *n, doublereal *ap, doublereal *anorm, doublereal *rcond, doublereal *work, integer * iwork, integer *info) { @@ -644,17 +644,18 @@ f"> */ doublereal scale; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int drscl_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void drscl_(integer *, doublereal *, doublereal *, integer *); logical upper; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *); integer ix; doublereal scalel; extern integer idamax_(integer *, doublereal *, integer *); doublereal scaleu; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dlatps_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dlatps_( char *, char *, char *, char *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *); doublereal ainvnm; @@ -691,7 +692,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPPCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -699,9 +700,9 @@ f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm == 0.) 
{ - return 0; + return; } smlnum = dlamch_("Safe minimum"); @@ -760,7 +761,7 @@ f"> */ } L20: - return 0; + return; /* End of DPPCON */ diff --git a/lapack-netlib/SRC/dppequ.c b/lapack-netlib/SRC/dppequ.c index 380949cf32..9700a80bd3 100644 --- a/lapack-netlib/SRC/dppequ.c +++ b/lapack-netlib/SRC/dppequ.c @@ -625,7 +625,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dppequ_(char *uplo, integer *n, doublereal *ap, +/* Subroutine */ void dppequ_(char *uplo, integer *n, doublereal *ap, doublereal *s, doublereal *scond, doublereal *amax, integer *info) { /* System generated locals */ @@ -667,7 +667,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPPEQU", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -675,7 +675,7 @@ f"> */ if (*n == 0) { *scond = 1.; *amax = 0.; - return 0; + return; } /* Initialize SMIN and AMAX. */ @@ -731,7 +731,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { if (s[i__] <= 0.) 
{ *info = i__; - return 0; + return; } /* L30: */ } @@ -750,7 +750,7 @@ f"> */ *scond = sqrt(smin) / sqrt(*amax); } - return 0; + return; /* End of DPPEQU */ diff --git a/lapack-netlib/SRC/dpprfs.c b/lapack-netlib/SRC/dpprfs.c index 9eee3ee5bc..40e1677cd6 100644 --- a/lapack-netlib/SRC/dpprfs.c +++ b/lapack-netlib/SRC/dpprfs.c @@ -685,7 +685,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dpprfs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void dpprfs_(char *uplo, integer *n, integer *nrhs, doublereal *ap, doublereal *afp, doublereal *b, integer *ldb, doublereal *x, integer *ldx, doublereal *ferr, doublereal *berr, doublereal *work, integer *iwork, integer *info) @@ -701,15 +701,15 @@ f"> */ doublereal s; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer count; - extern /* Subroutine */ int dspmv_(char *, integer *, doublereal *, + extern /* Subroutine */ void dspmv_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *); logical upper; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer ik, kk; extern doublereal dlamch_(char *); @@ -718,7 +718,7 @@ f"> */ doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal lstres; - extern /* Subroutine */ int dpptrs_(char *, integer *, integer *, + extern /* Subroutine */ void dpptrs_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *); doublereal eps; @@ -765,7 +765,7 
@@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPPRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -777,7 +777,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -978,7 +978,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of DPPRFS */ diff --git a/lapack-netlib/SRC/dppsv.c b/lapack-netlib/SRC/dppsv.c index 956e7ac8f1..e730dc071b 100644 --- a/lapack-netlib/SRC/dppsv.c +++ b/lapack-netlib/SRC/dppsv.c @@ -653,7 +653,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dppsv_(char *uplo, integer *n, integer *nrhs, doublereal +/* Subroutine */ void dppsv_(char *uplo, integer *n, integer *nrhs, doublereal *ap, doublereal *b, integer *ldb, integer *info) { /* System generated locals */ @@ -661,7 +661,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dpptrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dpptrf_( char *, integer *, doublereal *, integer *), dpptrs_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *); @@ -698,7 +699,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("DPPSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the Cholesky factorization A = U**T*U or A = L*L**T. 
*/ @@ -711,7 +712,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ dpptrs_(uplo, n, nrhs, &ap[1], &b[b_offset], ldb, info); } - return 0; + return; /* End of DPPSV */ diff --git a/lapack-netlib/SRC/dppsvx.c b/lapack-netlib/SRC/dppsvx.c index ccae54feb9..3a339ab87b 100644 --- a/lapack-netlib/SRC/dppsvx.c +++ b/lapack-netlib/SRC/dppsvx.c @@ -823,7 +823,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dppsvx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void dppsvx_(char *fact, char *uplo, integer *n, integer * nrhs, doublereal *ap, doublereal *afp, char *equed, doublereal *s, doublereal *b, integer *ldb, doublereal *x, integer *ldx, doublereal * rcond, doublereal *ferr, doublereal *berr, doublereal *work, integer * @@ -838,29 +838,29 @@ f"> */ integer i__, j; extern logical lsame_(char *, char *); doublereal scond, anorm; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); logical equil, rcequ; extern doublereal dlamch_(char *); logical nofact; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal bignum; extern doublereal dlansp_(char *, char *, integer *, doublereal *, doublereal *); - extern /* Subroutine */ int dppcon_(char *, integer *, doublereal *, + extern /* Subroutine */ void dppcon_(char *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *), dlaqsp_(char *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, char *); integer infequ; - extern /* Subroutine */ 
int dppequ_(char *, integer *, doublereal *, + extern /* Subroutine */ void dppequ_(char *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *), dpprfs_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *), dpptrf_(char *, integer *, doublereal *, integer *); doublereal smlnum; - extern /* Subroutine */ int dpptrs_(char *, integer *, integer *, + extern /* Subroutine */ void dpptrs_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *); @@ -949,7 +949,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPPSVX", &i__1, (ftnlen)6); - return 0; + return; } if (equil) { @@ -992,7 +992,7 @@ f"> */ if (*info > 0) { *rcond = 0.; - return 0; + return; } } @@ -1041,7 +1041,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of DPPSVX */ diff --git a/lapack-netlib/SRC/dpptrf.c b/lapack-netlib/SRC/dpptrf.c index bd13ca2537..7cfd1b9848 100644 --- a/lapack-netlib/SRC/dpptrf.c +++ b/lapack-netlib/SRC/dpptrf.c @@ -633,7 +633,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dpptrf_(char *uplo, integer *n, doublereal *ap, integer * +/* Subroutine */ void dpptrf_(char *uplo, integer *n, doublereal *ap, integer * info) { /* System generated locals */ @@ -643,14 +643,14 @@ f"> */ /* Local variables */ extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *, integer *); - extern /* Subroutine */ int dspr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dspr_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *); integer j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int 
dtpsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtpsv_(char *, char *, char *, integer *, doublereal *, doublereal *, integer *); integer jc, jj; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -682,13 +682,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPPTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -759,7 +759,7 @@ f"> */ *info = j; L40: - return 0; + return; /* End of DPPTRF */ diff --git a/lapack-netlib/SRC/dpptri.c b/lapack-netlib/SRC/dpptri.c index 4ec7a01848..be3e43847c 100644 --- a/lapack-netlib/SRC/dpptri.c +++ b/lapack-netlib/SRC/dpptri.c @@ -607,7 +607,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dpptri_(char *uplo, integer *n, doublereal *ap, integer * +/* Subroutine */ void dpptri_(char *uplo, integer *n, doublereal *ap, integer * info) { /* System generated locals */ @@ -616,17 +616,18 @@ f"> */ /* Local variables */ extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *, integer *); - extern /* Subroutine */ int dspr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dspr_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *); integer j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dtpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtpmv_(char *, char *, char *, integer *, doublereal *, doublereal *, integer *); logical upper; integer jc, jj; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dtptri_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dtptri_( char *, char *, integer *, doublereal *, 
integer *); doublereal ajj; integer jjn; @@ -657,20 +658,20 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPPTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Invert the triangular Cholesky factor U or L. */ dtptri_(uplo, "Non-unit", n, &ap[1], info); if (*info > 0) { - return 0; + return; } if (upper) { @@ -711,7 +712,7 @@ f"> */ } } - return 0; + return; /* End of DPPTRI */ diff --git a/lapack-netlib/SRC/dpptrs.c b/lapack-netlib/SRC/dpptrs.c index 8ae33bf90b..ae96e067ff 100644 --- a/lapack-netlib/SRC/dpptrs.c +++ b/lapack-netlib/SRC/dpptrs.c @@ -621,7 +621,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dpptrs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void dpptrs_(char *uplo, integer *n, integer *nrhs, doublereal *ap, doublereal *b, integer *ldb, integer *info) { /* System generated locals */ @@ -631,9 +631,9 @@ f"> */ integer i__; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int dtpsv_(char *, char *, char *, integer *, - doublereal *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dtpsv_(char *, char *, char *, integer *, + doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -668,13 +668,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPPTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -715,7 +715,7 @@ f"> */ } } - return 0; + return; /* End of DPPTRS */ diff --git a/lapack-netlib/SRC/dpstf2.c b/lapack-netlib/SRC/dpstf2.c index c08e9bc0a4..ff677abefb 100644 --- a/lapack-netlib/SRC/dpstf2.c +++ b/lapack-netlib/SRC/dpstf2.c @@ -658,7 +658,7 @@ f"> */ /* > \ingroup 
doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dpstf2_(char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ void dpstf2_(char *uplo, integer *n, doublereal *a, integer * lda, integer *piv, integer *rank, doublereal *tol, doublereal *work, integer *info) { @@ -669,15 +669,15 @@ f"> */ /* Local variables */ integer i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); doublereal dtemp; integer itemp; - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); doublereal dstop; logical upper; @@ -719,13 +719,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPSTF2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Initialize PIV */ @@ -948,7 +948,7 @@ f"> */ *info = 1; L170: - return 0; + return; /* End of DPSTF2 */ diff --git a/lapack-netlib/SRC/dpstrf.c b/lapack-netlib/SRC/dpstrf.c index f9449e3aa0..fe83d42325 100644 --- a/lapack-netlib/SRC/dpstrf.c +++ b/lapack-netlib/SRC/dpstrf.c @@ -660,7 +660,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dpstrf_(char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ void dpstrf_(char *uplo, integer *n, doublereal *a, integer * lda, integer *piv, integer *rank, doublereal *tol, doublereal *work, integer *info) { @@ -671,19 +671,19 @@ f"> */ /* Local 
variables */ integer i__, j, k; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); doublereal dtemp; integer itemp; - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); doublereal dstop; logical upper; - extern /* Subroutine */ int dsyrk_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dsyrk_(char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *), dpstf2_(char *, integer *, doublereal *, integer *, integer *, integer *, doublereal *, @@ -729,13 +729,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPSTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get block size */ @@ -1027,7 +1027,7 @@ f"> */ *info = 1; L200: - return 0; + return; /* End of DPSTRF */ diff --git a/lapack-netlib/SRC/dptcon.c b/lapack-netlib/SRC/dptcon.c index 256d8cb935..f097bdf207 100644 --- a/lapack-netlib/SRC/dptcon.c +++ b/lapack-netlib/SRC/dptcon.c @@ -631,7 +631,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dptcon_(integer *n, doublereal *d__, doublereal *e, +/* Subroutine */ void dptcon_(integer *n, doublereal *d__, doublereal *e, doublereal *anorm, doublereal *rcond, doublereal *work, integer *info) { /* System generated locals */ @@ -671,7 +671,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPTCON", &i__1, (ftnlen)6); - return 
0; + return; } /* Quick return if possible */ @@ -679,9 +679,9 @@ f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm == 0.) { - return 0; + return; } /* Check that D(1:N) is positive. */ @@ -689,7 +689,7 @@ f"> */ i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { if (d__[i__] <= 0.) { - return 0; + return; } /* L10: */ } @@ -730,7 +730,7 @@ f"> */ *rcond = 1. / ainvnm / *anorm; } - return 0; + return; /* End of DPTCON */ diff --git a/lapack-netlib/SRC/dpteqr.c b/lapack-netlib/SRC/dpteqr.c index eab9d1709a..a1426dcbb9 100644 --- a/lapack-netlib/SRC/dpteqr.c +++ b/lapack-netlib/SRC/dpteqr.c @@ -661,7 +661,7 @@ f"> */ /* > \ingroup doublePTcomputational */ /* ===================================================================== */ -/* Subroutine */ int dpteqr_(char *compz, integer *n, doublereal *d__, +/* Subroutine */ void dpteqr_(char *compz, integer *n, doublereal *d__, doublereal *e, doublereal *z__, integer *ldz, doublereal *work, integer *info) { @@ -673,14 +673,15 @@ f"> */ integer i__; extern logical lsame_(char *, char *); doublereal vt[1] /* was [1][1] */; - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, - doublereal *, doublereal *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen), dbdsqr_(char *, integer *, + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, + doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dbdsqr_(char *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer icompz; - extern /* Subroutine */ int dpttrf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dpttrf_(integer *, doublereal *, doublereal *, integer *); integer nru; @@ -726,20 +727,20 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPTEQR", &i__1, (ftnlen)6); - return 0; + return; 
} /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { if (icompz > 0) { z__[z_dim1 + 1] = 1.; } - return 0; + return; } if (icompz == 2) { dlaset_("Full", n, n, &c_b7, &c_b8, &z__[z_offset], ldz); @@ -749,7 +750,7 @@ f"> */ dpttrf_(n, &d__[1], &e[1], info); if (*info != 0) { - return 0; + return; } i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { @@ -785,7 +786,7 @@ f"> */ *info = *n + *info; } - return 0; + return; /* End of DPTEQR */ diff --git a/lapack-netlib/SRC/dptrfs.c b/lapack-netlib/SRC/dptrfs.c index 7dc933efc0..b1077e93c7 100644 --- a/lapack-netlib/SRC/dptrfs.c +++ b/lapack-netlib/SRC/dptrfs.c @@ -676,7 +676,7 @@ f"> */ /* > \ingroup doublePTcomputational */ /* ===================================================================== */ -/* Subroutine */ int dptrfs_(integer *n, integer *nrhs, doublereal *d__, +/* Subroutine */ void dptrfs_(integer *n, integer *nrhs, doublereal *d__, doublereal *e, doublereal *df, doublereal *ef, doublereal *b, integer *ldb, doublereal *x, integer *ldx, doublereal *ferr, doublereal *berr, doublereal *work, integer *info) @@ -689,7 +689,7 @@ f"> */ doublereal safe1, safe2; integer i__, j; doublereal s; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer count; doublereal bi; @@ -701,7 +701,7 @@ f"> */ doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal lstres; - extern /* Subroutine */ int dpttrs_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dpttrs_(integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *); doublereal eps; @@ -746,7 +746,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPTRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -758,7 +758,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } /* NZ = 
maximum number of nonzero elements in each row of A, plus 1 */ @@ -937,7 +937,7 @@ f"> */ /* L90: */ } - return 0; + return; /* End of DPTRFS */ diff --git a/lapack-netlib/SRC/dptsv.c b/lapack-netlib/SRC/dptsv.c index eecf96359c..2afc6ea371 100644 --- a/lapack-netlib/SRC/dptsv.c +++ b/lapack-netlib/SRC/dptsv.c @@ -623,14 +623,15 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doublePTsolve */ /* ===================================================================== */ -/* Subroutine */ int dptsv_(integer *n, integer *nrhs, doublereal *d__, +/* Subroutine */ void dptsv_(integer *n, integer *nrhs, doublereal *d__, doublereal *e, doublereal *b, integer *ldb, integer *info) { /* System generated locals */ integer b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dpttrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dpttrf_( integer *, doublereal *, doublereal *, integer *), dpttrs_( integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *); @@ -666,7 +667,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("DPTSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the L*D*L**T (or U**T*D*U) factorization of A. 
*/ @@ -678,7 +679,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ dpttrs_(n, nrhs, &d__[1], &e[1], &b[b_offset], ldb, info); } - return 0; + return; /* End of DPTSV */ diff --git a/lapack-netlib/SRC/dptsvx.c b/lapack-netlib/SRC/dptsvx.c index 48f980c3f1..9f28238fe1 100644 --- a/lapack-netlib/SRC/dptsvx.c +++ b/lapack-netlib/SRC/dptsvx.c @@ -740,7 +740,7 @@ f"> */ /* > \ingroup doublePTsolve */ /* ===================================================================== */ -/* Subroutine */ int dptsvx_(char *fact, integer *n, integer *nrhs, +/* Subroutine */ void dptsvx_(char *fact, integer *n, integer *nrhs, doublereal *d__, doublereal *e, doublereal *df, doublereal *ef, doublereal *b, integer *ldb, doublereal *x, integer *ldx, doublereal * rcond, doublereal *ferr, doublereal *berr, doublereal *work, integer * @@ -752,15 +752,15 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); doublereal anorm; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); extern doublereal dlamch_(char *); logical nofact; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *); - extern /* Subroutine */ int dptcon_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dptcon_(integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *), dptrfs_( integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, @@ -813,7 +813,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); 
xerbla_("DPTSVX", &i__1, (ftnlen)6); - return 0; + return; } if (nofact) { @@ -831,7 +831,7 @@ f"> */ if (*info > 0) { *rcond = 0.; - return 0; + return; } } @@ -860,7 +860,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of DPTSVX */ diff --git a/lapack-netlib/SRC/dpttrf.c b/lapack-netlib/SRC/dpttrf.c index 609d614a51..f3aa48b269 100644 --- a/lapack-netlib/SRC/dpttrf.c +++ b/lapack-netlib/SRC/dpttrf.c @@ -600,7 +600,7 @@ f"> */ /* > \ingroup doublePTcomputational */ /* ===================================================================== */ -/* Subroutine */ int dpttrf_(integer *n, doublereal *d__, doublereal *e, +/* Subroutine */ void dpttrf_(integer *n, doublereal *d__, doublereal *e, integer *info) { /* System generated locals */ @@ -633,13 +633,13 @@ f"> */ *info = -1; i__1 = -(*info); xerbla_("DPTTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Compute the L*D*L**T (or U**T*D*U) factorization of A. */ @@ -716,7 +716,7 @@ f"> */ } L30: - return 0; + return; /* End of DPTTRF */ diff --git a/lapack-netlib/SRC/dpttrs.c b/lapack-netlib/SRC/dpttrs.c index 42ac11416a..9c49ef9596 100644 --- a/lapack-netlib/SRC/dpttrs.c +++ b/lapack-netlib/SRC/dpttrs.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup doublePTcomputational */ /* ===================================================================== */ -/* Subroutine */ int dpttrs_(integer *n, integer *nrhs, doublereal *d__, +/* Subroutine */ void dpttrs_(integer *n, integer *nrhs, doublereal *d__, doublereal *e, doublereal *b, integer *ldb, integer *info) { /* System generated locals */ @@ -631,8 +631,9 @@ f"> */ /* Local variables */ integer j, jb, nb; - extern /* Subroutine */ int dptts2_(integer *, integer *, doublereal *, - doublereal *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dptts2_(integer *, integer *, doublereal *, + doublereal *, doublereal *, integer *); + extern int xerbla_(char *, 
integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); @@ -667,13 +668,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DPTTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* Determine the number of right-hand sides to solve at a time. */ @@ -701,7 +702,7 @@ f"> */ } } - return 0; + return; /* End of DPTTRS */ diff --git a/lapack-netlib/SRC/dptts2.c b/lapack-netlib/SRC/dptts2.c index d8bddfa2b3..e6991be26c 100644 --- a/lapack-netlib/SRC/dptts2.c +++ b/lapack-netlib/SRC/dptts2.c @@ -612,7 +612,7 @@ f"> */ /* > \ingroup doublePTcomputational */ /* ===================================================================== */ -/* Subroutine */ int dptts2_(integer *n, integer *nrhs, doublereal *d__, +/* Subroutine */ void dptts2_(integer *n, integer *nrhs, doublereal *d__, doublereal *e, doublereal *b, integer *ldb) { /* System generated locals */ @@ -621,7 +621,7 @@ f"> */ /* Local variables */ integer i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); @@ -649,7 +649,7 @@ f"> */ d__1 = 1. 
/ d__[1]; dscal_(nrhs, &d__1, &b[b_offset], ldb); } - return 0; + return; } /* Solve A * X = B using the factorization A = L*D*L**T, */ @@ -677,7 +677,7 @@ f"> */ /* L30: */ } - return 0; + return; /* End of DPTTS2 */ diff --git a/lapack-netlib/SRC/drscl.c b/lapack-netlib/SRC/drscl.c index 1dadc5ba32..653d908748 100644 --- a/lapack-netlib/SRC/drscl.c +++ b/lapack-netlib/SRC/drscl.c @@ -593,13 +593,13 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doubleOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int drscl_(integer *n, doublereal *sa, doublereal *sx, +/* Subroutine */ void drscl_(integer *n, doublereal *sa, doublereal *sx, integer *incx) { doublereal cden; logical done; doublereal cnum, cden1, cnum1; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dlabad_(doublereal *, doublereal *); extern doublereal dlamch_(char *); doublereal bignum, smlnum, mul; @@ -621,7 +621,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Function Body */ if (*n <= 0) { - return 0; + return; } /* Get machine parameters */ @@ -668,7 +668,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ goto L10; } - return 0; + return; /* End of DRSCL */ diff --git a/lapack-netlib/SRC/dsb2st_kernels.c b/lapack-netlib/SRC/dsb2st_kernels.c index 061b6c43ed..2d619ce18f 100644 --- a/lapack-netlib/SRC/dsb2st_kernels.c +++ b/lapack-netlib/SRC/dsb2st_kernels.c @@ -680,7 +680,7 @@ kernels.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dsb2st_kernels_(char *uplo, logical *wantz, integer * +/* Subroutine */ void dsb2st_kernels_(char *uplo, logical *wantz, integer * ttype, integer *st, integer *ed, integer *sweep, integer *n, 
integer * nb, integer *ib, doublereal *a, integer *lda, doublereal *v, doublereal *tau, integer *ldvt, doublereal *work) @@ -695,10 +695,10 @@ kernels.f"> */ extern logical lsame_(char *, char *); logical upper; integer j1, j2, lm, ln; - extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlarfg_(integer *, doublereal *, doublereal *, integer *, doublereal *); integer ajeter; - extern /* Subroutine */ int dlarfx_(char *, integer *, integer *, + extern /* Subroutine */ void dlarfx_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *), dlarfy_(char *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *); integer ofdpos, taupos; @@ -896,7 +896,7 @@ kernels.f"> */ } } - return 0; + return; /* END OF DSB2ST_KERNELS */ diff --git a/lapack-netlib/SRC/dsbev.c b/lapack-netlib/SRC/dsbev.c index 05399e6790..50e54cea66 100644 --- a/lapack-netlib/SRC/dsbev.c +++ b/lapack-netlib/SRC/dsbev.c @@ -660,7 +660,7 @@ atrices */ /* > \ingroup doubleOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int dsbev_(char *jobz, char *uplo, integer *n, integer *kd, +/* Subroutine */ void dsbev_(char *jobz, char *uplo, integer *n, integer *kd, doublereal *ab, integer *ldab, doublereal *w, doublereal *z__, integer *ldz, doublereal *work, integer *info) { @@ -673,7 +673,7 @@ atrices */ doublereal anrm; integer imax; doublereal rmin, rmax; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); @@ -681,7 +681,7 @@ atrices */ logical lower, wantz; extern doublereal dlamch_(char *); integer iscale; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, 
doublereal *, integer *, integer *, doublereal *, integer *, integer *); extern doublereal dlansb_(char *, char *, integer *, integer *, @@ -689,12 +689,12 @@ atrices */ doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; - extern /* Subroutine */ int dsbtrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dsbtrd_(char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *), dsterf_( integer *, doublereal *, doublereal *, integer *); integer indwrk; - extern /* Subroutine */ int dsteqr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsteqr_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *); doublereal smlnum, eps; @@ -742,13 +742,13 @@ atrices */ if (*info != 0) { i__1 = -(*info); xerbla_("DSBEV ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -760,7 +760,7 @@ atrices */ if (wantz) { z__[z_dim1 + 1] = 1.; } - return 0; + return; } /* Get machine constants. 
*/ @@ -821,7 +821,7 @@ atrices */ dscal_(&imax, &d__1, &w[1], &c__1); } - return 0; + return; /* End of DSBEV */ diff --git a/lapack-netlib/SRC/dsbev_2stage.c b/lapack-netlib/SRC/dsbev_2stage.c index 494db1a2e0..563547b6eb 100644 --- a/lapack-netlib/SRC/dsbev_2stage.c +++ b/lapack-netlib/SRC/dsbev_2stage.c @@ -722,7 +722,7 @@ stage.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsbev_2stage_(char *jobz, char *uplo, integer *n, +/* Subroutine */ void dsbev_2stage_(char *jobz, char *uplo, integer *n, integer *kd, doublereal *ab, integer *ldab, doublereal *w, doublereal *z__, integer *ldz, doublereal *work, integer *lwork, integer *info) { @@ -737,7 +737,7 @@ stage.f"> */ doublereal anrm; integer imax; doublereal rmin, rmax; - extern /* Subroutine */ int dsytrd_sb2st_(char *, char *, char *, + extern /* Subroutine */ void dsytrd_sb2st_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *), dscal_(integer *, doublereal * @@ -751,7 +751,7 @@ stage.f"> */ integer ib; extern doublereal dlamch_(char *); integer iscale; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); extern doublereal dlansb_(char *, char *, integer *, integer *, @@ -759,10 +759,10 @@ stage.f"> */ doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *); integer indwrk; - extern /* Subroutine */ int dsteqr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsteqr_(char *, integer *, doublereal *, doublereal *, doublereal *, integer 
*, doublereal *, integer *); integer llwork; doublereal smlnum; @@ -836,15 +836,15 @@ stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSBEV_2STAGE ", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -856,7 +856,7 @@ stage.f"> */ if (wantz) { z__[z_dim1 + 1] = 1.; } - return 0; + return; } /* Get machine constants. */ @@ -924,7 +924,7 @@ stage.f"> */ work[1] = (doublereal) lwmin; - return 0; + return; /* End of DSBEV_2STAGE */ diff --git a/lapack-netlib/SRC/dsbevd.c b/lapack-netlib/SRC/dsbevd.c index f36eedeced..900bb97185 100644 --- a/lapack-netlib/SRC/dsbevd.c +++ b/lapack-netlib/SRC/dsbevd.c @@ -708,7 +708,7 @@ f"> */ /* > \ingroup doubleOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int dsbevd_(char *jobz, char *uplo, integer *n, integer *kd, +/* Subroutine */ void dsbevd_(char *jobz, char *uplo, integer *n, integer *kd, doublereal *ab, integer *ldab, doublereal *w, doublereal *z__, integer *ldz, doublereal *work, integer *lwork, integer *iwork, integer *liwork, integer *info) @@ -720,7 +720,7 @@ f"> */ /* Local variables */ integer inde; doublereal anrm, rmin, rmax; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dgemm_(char *, char *, integer *, integer *, integer * , doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); @@ -731,19 +731,19 @@ f"> */ integer indwk2, llwrk2; extern doublereal dlamch_(char *); integer iscale; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); extern doublereal dlansb_(char *, char *, integer *, integer 
*, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dstedc_(char *, integer *, doublereal *, + extern /* Subroutine */ void dstedc_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *); doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; - extern /* Subroutine */ int dsbtrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dsbtrd_(char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *), dsterf_( integer *, doublereal *, doublereal *, integer *); @@ -823,15 +823,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSBEVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -839,7 +839,7 @@ f"> */ if (wantz) { z__[z_dim1 + 1] = 1.; } - return 0; + return; } /* Get machine constants. 
*/ @@ -902,7 +902,7 @@ f"> */ work[1] = (doublereal) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of DSBEVD */ diff --git a/lapack-netlib/SRC/dsbevd_2stage.c b/lapack-netlib/SRC/dsbevd_2stage.c index c0df116060..7d9617985a 100644 --- a/lapack-netlib/SRC/dsbevd_2stage.c +++ b/lapack-netlib/SRC/dsbevd_2stage.c @@ -753,7 +753,7 @@ static integer c__1 = 1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsbevd_2stage_(char *jobz, char *uplo, integer *n, +/* Subroutine */ void dsbevd_2stage_(char *jobz, char *uplo, integer *n, integer *kd, doublereal *ab, integer *ldab, doublereal *w, doublereal *z__, integer *ldz, doublereal *work, integer *lwork, integer *iwork, integer *liwork, integer *info) @@ -767,7 +767,7 @@ static integer c__1 = 1; extern integer ilaenv2stage_(integer *, char *, char *, integer *, integer *, integer *, integer *); doublereal anrm, rmin, rmax; - extern /* Subroutine */ int dsytrd_sb2st_(char *, char *, char *, + extern /* Subroutine */ void dsytrd_sb2st_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *), dscal_(integer *, doublereal * @@ -783,19 +783,19 @@ static integer c__1 = 1; integer indwk2, ib, llwrk2; extern doublereal dlamch_(char *); integer iscale; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); extern doublereal dlansb_(char *, char *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dstedc_(char *, integer *, doublereal *, + extern /* Subroutine */ void dstedc_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, integer *), dlacpy_(char *, integer *, integer *, 
doublereal *, integer *, doublereal *, integer *); doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *); integer indwrk, liwmin, llwork; doublereal smlnum; @@ -880,15 +880,15 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("DSBEVD_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -896,7 +896,7 @@ static integer c__1 = 1; if (wantz) { z__[z_dim1 + 1] = 1.; } - return 0; + return; } /* Get machine constants. */ @@ -962,7 +962,7 @@ static integer c__1 = 1; work[1] = (doublereal) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of DSBEVD_2STAGE */ diff --git a/lapack-netlib/SRC/dsbevx.c b/lapack-netlib/SRC/dsbevx.c index 8b77ce15ae..61e687d3eb 100644 --- a/lapack-netlib/SRC/dsbevx.c +++ b/lapack-netlib/SRC/dsbevx.c @@ -779,7 +779,7 @@ f"> */ /* > \ingroup doubleOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int dsbevx_(char *jobz, char *range, char *uplo, integer *n, +/* Subroutine */ void dsbevx_(char *jobz, char *range, char *uplo, integer *n, integer *kd, doublereal *ab, integer *ldab, doublereal *q, integer * ldq, doublereal *vl, doublereal *vu, integer *il, integer *iu, doublereal *abstol, integer *m, doublereal *w, doublereal *z__, @@ -798,16 +798,16 @@ f"> */ doublereal rmin, rmax; logical test; integer itmp1, i__, j, indee; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine 
*/ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer iinfo; char order[1]; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); logical lower, wantz; @@ -817,30 +817,30 @@ f"> */ integer iscale, indibl; extern doublereal dlansb_(char *, char *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); logical valeig; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *); doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal abstll, bignum; - extern /* Subroutine */ int dsbtrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dsbtrd_(char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer indisp; - extern /* Subroutine */ int dstein_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dstein_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *), dsterf_(integer *, doublereal *, doublereal *, integer *); integer indiwo; - extern /* Subroutine */ int dstebz_(char *, char *, integer *, doublereal + extern /* Subroutine */ void dstebz_(char *, char *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, 
integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); integer indwrk; - extern /* Subroutine */ int dsteqr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsteqr_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer nsplit; doublereal smlnum, eps, vll, vuu, tmp1; @@ -916,14 +916,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSBEVX", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -944,7 +944,7 @@ f"> */ z__[z_dim1 + 1] = 1.; } } - return 0; + return; } /* Get machine constants. */ @@ -1119,7 +1119,7 @@ f"> */ } } - return 0; + return; /* End of DSBEVX */ diff --git a/lapack-netlib/SRC/dsbevx_2stage.c b/lapack-netlib/SRC/dsbevx_2stage.c index f2b17b9082..cea4ec24cc 100644 --- a/lapack-netlib/SRC/dsbevx_2stage.c +++ b/lapack-netlib/SRC/dsbevx_2stage.c @@ -840,7 +840,7 @@ static doublereal c_b45 = 0.; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsbevx_2stage_(char *jobz, char *range, char *uplo, +/* Subroutine */ void dsbevx_2stage_(char *jobz, char *range, char *uplo, integer *n, integer *kd, doublereal *ab, integer *ldab, doublereal *q, integer *ldq, doublereal *vl, doublereal *vu, integer *il, integer * iu, doublereal *abstol, integer *m, doublereal *w, doublereal *z__, @@ -860,22 +860,22 @@ static doublereal c_b45 = 0.; integer imax; doublereal rmin, rmax; logical test; - extern /* Subroutine */ int dsytrd_sb2st_(char *, char *, char *, + extern /* Subroutine */ void dsytrd_sb2st_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer itmp1, i__, j, indee; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void 
dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer iinfo; char order[1]; integer lhtrd; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); integer lwmin; @@ -888,27 +888,27 @@ static doublereal c_b45 = 0.; integer iscale, indibl; extern doublereal dlansb_(char *, char *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); logical valeig; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *); doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal abstll, bignum; integer indisp; - extern /* Subroutine */ int dstein_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dstein_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *), dsterf_(integer *, doublereal *, doublereal *, integer *); integer indiwo; - extern /* Subroutine */ int dstebz_(char *, char *, integer *, doublereal + extern /* Subroutine */ void dstebz_(char *, char *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *, 
doublereal *, integer *, integer *, doublereal *, integer *, integer *); integer indwrk; - extern /* Subroutine */ int dsteqr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsteqr_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer nsplit, llwork; doublereal smlnum; @@ -1010,16 +1010,16 @@ static doublereal c_b45 = 0.; if (*info != 0) { i__1 = -(*info); xerbla_("DSBEVX_2STAGE ", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -1040,7 +1040,7 @@ static doublereal c_b45 = 0.; z__[z_dim1 + 1] = 1.; } } - return 0; + return; } /* Get machine constants. */ @@ -1223,7 +1223,7 @@ static doublereal c_b45 = 0.; work[1] = (doublereal) lwmin; - return 0; + return; /* End of DSBEVX_2STAGE */ diff --git a/lapack-netlib/SRC/dsbgst.c b/lapack-netlib/SRC/dsbgst.c index 3ab4e97056..ff5565603d 100644 --- a/lapack-netlib/SRC/dsbgst.c +++ b/lapack-netlib/SRC/dsbgst.c @@ -674,7 +674,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsbgst_(char *vect, char *uplo, integer *n, integer *ka, +/* Subroutine */ void dsbgst_(char *vect, char *uplo, integer *n, integer *ka, integer *kb, doublereal *ab, integer *ldab, doublereal *bb, integer * ldbb, doublereal *x, integer *ldx, doublereal *work, integer *info) { @@ -685,31 +685,33 @@ f"> */ /* Local variables */ integer inca; - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *), drot_(integer *, doublereal *, integer *, doublereal * , integer *, doublereal *, doublereal *); integer i__, j, k, l, m; doublereal t; - extern /* Subroutine */ int dscal_(integer *, doublereal *, 
doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); integer i0, i1; logical upper; integer i2, j1, j2; logical wantx; - extern /* Subroutine */ int dlar2v_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlar2v_(integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *); doublereal ra; integer nr, nx; - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *), dlartg_(doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *), xerbla_(char *, integer *, ftnlen), dlargv_( + doublereal *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dlargv_( integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); logical update; - extern /* Subroutine */ int dlartv_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlartv_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer ka1, kb1; doublereal ra1; @@ -767,13 +769,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSBGST", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } inca = *ldab * ka1; @@ -1611,14 +1613,14 @@ f"> */ --i__; i0 = m + 1; if (*ka == 0) { - return 0; + return; } goto L490; } } else { i__ -= *ka; if (i__ < 2) { - return 0; + return; } } diff --git a/lapack-netlib/SRC/dsbgv.c b/lapack-netlib/SRC/dsbgv.c index 9a6abb11b9..149689762e 100644 --- a/lapack-netlib/SRC/dsbgv.c +++ b/lapack-netlib/SRC/dsbgv.c @@ -685,7 +685,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doubleOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int 
dsbgv_(char *jobz, char *uplo, integer *n, integer *ka, +/* Subroutine */ void dsbgv_(char *jobz, char *uplo, integer *n, integer *ka, integer *kb, doublereal *ab, integer *ldab, doublereal *bb, integer * ldbb, doublereal *w, doublereal *z__, integer *ldz, doublereal *work, integer *info) @@ -699,7 +699,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern logical lsame_(char *, char *); integer iinfo; logical upper, wantz; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dpbstf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dpbstf_( char *, integer *, integer *, doublereal *, integer *, integer *), dsbtrd_(char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *), dsbgst_(char *, char *, @@ -708,7 +709,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ integer *), dsterf_(integer *, doublereal *, doublereal *, integer *); integer indwrk; - extern /* Subroutine */ int dsteqr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsteqr_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *); @@ -761,13 +762,13 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("DSBGV ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a split Cholesky factorization of B. */ @@ -775,7 +776,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ dpbstf_(uplo, n, kb, &bb[bb_offset], ldbb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem. 
*/ @@ -804,7 +805,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ dsteqr_(jobz, n, &w[1], &work[inde], &z__[z_offset], ldz, &work[ indwrk], info); } - return 0; + return; /* End of DSBGV */ diff --git a/lapack-netlib/SRC/dsbgvd.c b/lapack-netlib/SRC/dsbgvd.c index ebfe897129..c8274131f1 100644 --- a/lapack-netlib/SRC/dsbgvd.c +++ b/lapack-netlib/SRC/dsbgvd.c @@ -740,7 +740,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int dsbgvd_(char *jobz, char *uplo, integer *n, integer *ka, +/* Subroutine */ void dsbgvd_(char *jobz, char *uplo, integer *n, integer *ka, integer *kb, doublereal *ab, integer *ldab, doublereal *bb, integer * ldbb, doublereal *w, doublereal *z__, integer *ldz, doublereal *work, integer *lwork, integer *iwork, integer *liwork, integer *info) @@ -751,17 +751,19 @@ f"> */ /* Local variables */ integer inde; char vect[1]; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); integer iinfo, lwmin; logical upper, wantz; integer indwk2, llwrk2; - extern /* Subroutine */ int dstedc_(char *, integer *, doublereal *, + extern /* Subroutine */ void dstedc_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, integer *), dlacpy_(char *, integer - *, integer *, doublereal *, integer *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen), dpbstf_(char *, + *, integer *, doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dpbstf_(char *, integer *, integer *, doublereal *, integer *, integer *), dsbtrd_(char *, char *, 
integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, @@ -850,15 +852,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSBGVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a split Cholesky factorization of B. */ @@ -866,7 +868,7 @@ f"> */ dpbstf_(uplo, n, kb, &bb[bb_offset], ldbb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem. */ @@ -903,7 +905,7 @@ f"> */ work[1] = (doublereal) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of DSBGVD */ diff --git a/lapack-netlib/SRC/dsbgvx.c b/lapack-netlib/SRC/dsbgvx.c index 7b55235dcc..d5422073d4 100644 --- a/lapack-netlib/SRC/dsbgvx.c +++ b/lapack-netlib/SRC/dsbgvx.c @@ -807,7 +807,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int dsbgvx_(char *jobz, char *range, char *uplo, integer *n, +/* Subroutine */ void dsbgvx_(char *jobz, char *range, char *uplo, integer *n, integer *ka, integer *kb, doublereal *ab, integer *ldab, doublereal * bb, integer *ldbb, doublereal *q, integer *ldq, doublereal *vl, doublereal *vu, integer *il, integer *iu, doublereal *abstol, integer @@ -824,12 +824,12 @@ f"> */ logical test; integer itmp1, i__, j, indee; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer iinfo; char order[1]; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer *, doublereal *, integer 
*, doublereal *, integer *); logical upper, wantz; @@ -837,28 +837,29 @@ f"> */ logical alleig, indeig; integer indibl; logical valeig; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen), dpbstf_(char *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dpbstf_(char *, integer *, integer *, doublereal *, integer *, integer *), dsbtrd_( char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer indisp; - extern /* Subroutine */ int dsbgst_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dsbgst_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *), dstein_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *); integer indiwo; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *), dstebz_(char *, char *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); integer indwrk; - extern /* Subroutine */ int dsteqr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsteqr_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer nsplit; doublereal tmp1; @@ -941,14 +942,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSBGVX", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ 
*m = 0; if (*n == 0) { - return 0; + return; } /* Form a split Cholesky factorization of B. */ @@ -956,7 +957,7 @@ f"> */ dpbstf_(uplo, n, kb, &bb[bb_offset], ldbb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem. */ @@ -1082,7 +1083,7 @@ f"> */ } } - return 0; + return; /* End of DSBGVX */ diff --git a/lapack-netlib/SRC/dsbgvx.f b/lapack-netlib/SRC/dsbgvx.f index 55dbce2eef..20de179314 100644 --- a/lapack-netlib/SRC/dsbgvx.f +++ b/lapack-netlib/SRC/dsbgvx.f @@ -317,7 +317,7 @@ SUBROUTINE DSBGVX( JOBZ, RANGE, UPLO, N, KA, KB, AB, LDAB, BB, * .. Local Scalars .. LOGICAL ALLEIG, INDEIG, TEST, UPPER, VALEIG, WANTZ CHARACTER ORDER, VECT - INTEGER I, IINFO, INDD, INDE, INDEE, INDIBL, INDISP, + INTEGER I, IINFO, INDD, INDE, INDEE, INDISP, $ INDIWO, INDWRK, ITMP1, J, JJ, NSPLIT DOUBLE PRECISION TMP1 * .. @@ -457,17 +457,16 @@ SUBROUTINE DSBGVX( JOBZ, RANGE, UPLO, N, KA, KB, AB, LDAB, BB, ELSE ORDER = 'E' END IF - INDIBL = 1 - INDISP = INDIBL + N + INDISP = 1 + N INDIWO = INDISP + N CALL DSTEBZ( RANGE, ORDER, N, VL, VU, IL, IU, ABSTOL, $ WORK( INDD ), WORK( INDE ), M, NSPLIT, W, - $ IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWRK ), + $ IWORK( 1 ), IWORK( INDISP ), WORK( INDWRK ), $ IWORK( INDIWO ), INFO ) * IF( WANTZ ) THEN CALL DSTEIN( N, WORK( INDD ), WORK( INDE ), M, W, - $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ, + $ IWORK( 1 ), IWORK( INDISP ), Z, LDZ, $ WORK( INDWRK ), IWORK( INDIWO ), IFAIL, INFO ) * * Apply transformation matrix used in reduction to tridiagonal @@ -497,11 +496,11 @@ SUBROUTINE DSBGVX( JOBZ, RANGE, UPLO, N, KA, KB, AB, LDAB, BB, 40 CONTINUE * IF( I.NE.0 ) THEN - ITMP1 = IWORK( INDIBL+I-1 ) + ITMP1 = IWORK( 1 + I-1 ) W( I ) = W( J ) - IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 ) + IWORK( 1 + I-1 ) = IWORK( 1 + J-1 ) W( J ) = TMP1 - IWORK( INDIBL+J-1 ) = ITMP1 + IWORK( 1 + J-1 ) = ITMP1 CALL DSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) IF( INFO.NE.0 ) THEN ITMP1 = IFAIL( I ) diff --git 
a/lapack-netlib/SRC/dsbtrd.c b/lapack-netlib/SRC/dsbtrd.c index 89101e3eba..91da67f14a 100644 --- a/lapack-netlib/SRC/dsbtrd.c +++ b/lapack-netlib/SRC/dsbtrd.c @@ -677,7 +677,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dsbtrd_(char *vect, char *uplo, integer *n, integer *kd, +/* Subroutine */ void dsbtrd_(char *vect, char *uplo, integer *n, integer *kd, doublereal *ab, integer *ldab, doublereal *d__, doublereal *e, doublereal *q, integer *ldq, doublereal *work, integer *info) { @@ -688,19 +688,21 @@ f"> */ /* Local variables */ integer inca, jend, lend, jinc, incx, last; doublereal temp; - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer j1end, j1inc, i__, j, k, l, iqend; extern logical lsame_(char *, char *); logical initq, wantq, upper; integer i2, j1, j2; - extern /* Subroutine */ int dlar2v_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlar2v_(integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer nq, nr, iqaend; - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *), dlartg_(doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *), xerbla_(char *, integer *, ftnlen), dlargv_( + doublereal *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dlargv_( integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *), dlartv_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, @@ -756,13 +758,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSBTRD", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if 
possible */ if (*n == 0) { - return 0; + return; } /* Initialize Q to the unit matrix, if needed */ @@ -1277,7 +1279,7 @@ f"> */ } } - return 0; + return; /* End of DSBTRD */ diff --git a/lapack-netlib/SRC/dsfrk.c b/lapack-netlib/SRC/dsfrk.c index d9c21b64d7..d31002e1e4 100644 --- a/lapack-netlib/SRC/dsfrk.c +++ b/lapack-netlib/SRC/dsfrk.c @@ -674,7 +674,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsfrk_(char *transr, char *uplo, char *trans, integer *n, +/* Subroutine */ void dsfrk_(char *transr, char *uplo, char *trans, integer *n, integer *k, doublereal *alpha, doublereal *a, integer *lda, doublereal *beta, doublereal *c__) { @@ -684,13 +684,13 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ integer info, j; logical normaltransr; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); integer nrowa; logical lower; - extern /* Subroutine */ int dsyrk_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dsyrk_(char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer n1, n2, nk; @@ -743,7 +743,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (info != 0) { i__1 = -info; xerbla_("DSFRK ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ @@ -752,7 +752,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* done (it is in DSYRK for example) and left in the general case. */ if (*n == 0 || (*alpha == 0. 
|| *k == 0) && *beta == 1.) { - return 0; + return; } if (*alpha == 0. && *beta == 0.) { @@ -760,7 +760,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (j = 1; j <= i__1; ++j) { c__[j] = 0.; } - return 0; + return; } /* C is N-by-N. */ @@ -1066,7 +1066,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of DSFRK */ diff --git a/lapack-netlib/SRC/dsgesv.c b/lapack-netlib/SRC/dsgesv.c index c170f3d072..049fd25d84 100644 --- a/lapack-netlib/SRC/dsgesv.c +++ b/lapack-netlib/SRC/dsgesv.c @@ -710,7 +710,7 @@ f"> */ /* > \ingroup doubleGEsolve */ /* ===================================================================== */ -/* Subroutine */ int dsgesv_(integer *n, integer *nrhs, doublereal *a, +/* Subroutine */ void dsgesv_(integer *n, integer *nrhs, doublereal *a, integer *lda, integer *ipiv, doublereal *b, integer *ldb, doublereal * x, integer *ldx, doublereal *work, real *swork, integer *iter, integer *info) @@ -725,11 +725,11 @@ f"> */ integer ptsa; doublereal rnrm, xnrm; integer ptsx, i__; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer iiter; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), dlag2s_(integer *, integer *, doublereal *, integer *, real *, integer *, integer *), slag2d_( integer *, integer *, real *, integer *, doublereal *, integer *, @@ -737,13 +737,17 @@ f"> */ extern doublereal dlamch_(char *), dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int dlacpy_(char *, integer *, 
integer *, - doublereal *, integer *, doublereal *, integer *), - dgetrf_(integer *, integer *, doublereal *, integer *, integer *, - integer *), xerbla_(char *, integer *, ftnlen), dgetrs_(char *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int dgetrf_(integer *, integer *, doublereal *, integer *, integer *, + integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern int dgetrs_(char *, integer *, integer *, doublereal *, integer *, integer *, - doublereal *, integer *, integer *), sgetrf_(integer *, - integer *, real *, integer *, integer *, integer *), sgetrs_(char + doublereal *, integer *, integer *); + extern int sgetrf_(integer *, + integer *, real *, integer *, integer *, integer *); + extern int sgetrs_(char *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); doublereal cte, eps; @@ -799,13 +803,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSGESV", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if (N.EQ.0). */ if (*n == 0) { - return 0; + return; } /* Skip single precision iterative refinement if a priori slower */ @@ -890,7 +894,7 @@ f"> */ /* stopping criterion. We are good to exit. */ *iter = 0; - return 0; + return; L10: @@ -949,7 +953,7 @@ f"> */ *iter = iiter; - return 0; + return; L20: @@ -972,14 +976,14 @@ f"> */ dgetrf_(n, n, &a[a_offset], lda, &ipiv[1], info); if (*info != 0) { - return 0; + return; } dlacpy_("All", n, nrhs, &b[b_offset], ldb, &x[x_offset], ldx); dgetrs_("No transpose", n, nrhs, &a[a_offset], lda, &ipiv[1], &x[x_offset] , ldx, info); - return 0; + return; /* End of DSGESV. 
*/ diff --git a/lapack-netlib/SRC/dspcon.c b/lapack-netlib/SRC/dspcon.c index 9a0588c800..1ae0287878 100644 --- a/lapack-netlib/SRC/dspcon.c +++ b/lapack-netlib/SRC/dspcon.c @@ -637,7 +637,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dspcon_(char *uplo, integer *n, doublereal *ap, integer * +/* Subroutine */ void dspcon_(char *uplo, integer *n, doublereal *ap, integer * ipiv, doublereal *anorm, doublereal *rcond, doublereal *work, integer *iwork, integer *info) { @@ -649,12 +649,12 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer ip; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; - extern /* Subroutine */ int dsptrs_(char *, integer *, integer *, + extern /* Subroutine */ void dsptrs_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); @@ -688,7 +688,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSPCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -696,9 +696,9 @@ f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm <= 0.) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -710,7 +710,7 @@ f"> */ ip = *n * (*n + 1) / 2; for (i__ = *n; i__ >= 1; --i__) { if (ipiv[i__] > 0 && ap[ip] == 0.) { - return 0; + return; } ip -= i__; /* L10: */ @@ -723,7 +723,7 @@ f"> */ i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { if (ipiv[i__] > 0 && ap[ip] == 0.) { - return 0; + return; } ip = ip + *n - i__ + 1; /* L20: */ @@ -749,7 +749,7 @@ f"> */ *rcond = 1. 
/ ainvnm / *anorm; } - return 0; + return; /* End of DSPCON */ diff --git a/lapack-netlib/SRC/dspev.c b/lapack-netlib/SRC/dspev.c index 7d65a7af42..aa0b6bf8c1 100644 --- a/lapack-netlib/SRC/dspev.c +++ b/lapack-netlib/SRC/dspev.c @@ -644,7 +644,7 @@ atrices */ /* > \ingroup doubleOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int dspev_(char *jobz, char *uplo, integer *n, doublereal * +/* Subroutine */ void dspev_(char *jobz, char *uplo, integer *n, doublereal * ap, doublereal *w, doublereal *z__, integer *ldz, doublereal *work, integer *info) { @@ -657,7 +657,7 @@ atrices */ doublereal anrm; integer imax; doublereal rmin, rmax; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); @@ -671,10 +671,10 @@ atrices */ extern doublereal dlansp_(char *, char *, integer *, doublereal *, doublereal *); integer indtau; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *); integer indwrk; - extern /* Subroutine */ int dopgtr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dopgtr_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *), dsptrd_(char *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *), dsteqr_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, @@ -719,13 +719,13 @@ atrices */ if (*info != 0) { i__1 = -(*info); xerbla_("DSPEV ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -733,7 +733,7 @@ atrices */ if (wantz) { z__[z_dim1 + 1] = 1.; } - return 0; + return; } /* Get machine constants. 
*/ @@ -792,7 +792,7 @@ atrices */ dscal_(&imax, &d__1, &w[1], &c__1); } - return 0; + return; /* End of DSPEV */ diff --git a/lapack-netlib/SRC/dspevd.c b/lapack-netlib/SRC/dspevd.c index ebc45ce5f6..e674f18b0d 100644 --- a/lapack-netlib/SRC/dspevd.c +++ b/lapack-netlib/SRC/dspevd.c @@ -691,7 +691,7 @@ f"> */ /* > \ingroup doubleOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int dspevd_(char *jobz, char *uplo, integer *n, doublereal * +/* Subroutine */ void dspevd_(char *jobz, char *uplo, integer *n, doublereal * ap, doublereal *w, doublereal *z__, integer *ldz, doublereal *work, integer *lwork, integer *iwork, integer *liwork, integer *info) { @@ -702,7 +702,7 @@ f"> */ /* Local variables */ integer inde; doublereal anrm, rmin, rmax; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); @@ -710,7 +710,7 @@ f"> */ logical wantz; extern doublereal dlamch_(char *); integer iscale; - extern /* Subroutine */ int dstedc_(char *, integer *, doublereal *, + extern /* Subroutine */ void dstedc_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, integer *); doublereal safmin; @@ -719,10 +719,10 @@ f"> */ extern doublereal dlansp_(char *, char *, integer *, doublereal *, doublereal *); integer indtau; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *); integer indwrk, liwmin; - extern /* Subroutine */ int dsptrd_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsptrd_(char *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *), dopmtr_(char *, char *, char *, integer *, integer *, doublereal * , doublereal *, doublereal *, 
integer *, doublereal *, integer *); @@ -796,15 +796,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSPEVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -812,7 +812,7 @@ f"> */ if (wantz) { z__[z_dim1 + 1] = 1.; } - return 0; + return; } /* Get machine constants. */ @@ -871,7 +871,7 @@ f"> */ work[1] = (doublereal) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of DSPEVD */ diff --git a/lapack-netlib/SRC/dspevx.c b/lapack-netlib/SRC/dspevx.c index 536e5ac92d..92ea4ac6d3 100644 --- a/lapack-netlib/SRC/dspevx.c +++ b/lapack-netlib/SRC/dspevx.c @@ -746,7 +746,7 @@ f"> */ /* > \ingroup doubleOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int dspevx_(char *jobz, char *range, char *uplo, integer *n, +/* Subroutine */ void dspevx_(char *jobz, char *range, char *uplo, integer *n, doublereal *ap, doublereal *vl, doublereal *vu, integer *il, integer * iu, doublereal *abstol, integer *m, doublereal *w, doublereal *z__, integer *ldz, doublereal *work, integer *iwork, integer *ifail, @@ -763,13 +763,13 @@ f"> */ doublereal rmin, rmax; logical test; integer itmp1, i__, j, indee; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); integer iinfo; char order[1]; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); logical wantz; @@ -784,17 +784,17 @@ f"> */ extern doublereal dlansp_(char *, char *, integer *, doublereal *, doublereal *); integer indtau, indisp; - extern /* Subroutine */ int dstein_(integer *, doublereal *, 
doublereal *, + extern /* Subroutine */ void dstein_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *), dsterf_(integer *, doublereal *, doublereal *, integer *); integer indiwo; - extern /* Subroutine */ int dstebz_(char *, char *, integer *, doublereal + extern /* Subroutine */ void dstebz_(char *, char *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); integer indwrk; - extern /* Subroutine */ int dopgtr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dopgtr_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *), dsptrd_(char *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *), dsteqr_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, @@ -864,14 +864,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSPEVX", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -887,7 +887,7 @@ f"> */ if (wantz) { z__[z_dim1 + 1] = 1.; } - return 0; + return; } /* Get machine constants. */ @@ -1053,7 +1053,7 @@ f"> */ } } - return 0; + return; /* End of DSPEVX */ diff --git a/lapack-netlib/SRC/dspevx.f b/lapack-netlib/SRC/dspevx.f index f56ce298f7..658cb1f527 100644 --- a/lapack-netlib/SRC/dspevx.f +++ b/lapack-netlib/SRC/dspevx.f @@ -255,7 +255,7 @@ SUBROUTINE DSPEVX( JOBZ, RANGE, UPLO, N, AP, VL, VU, IL, IU, * .. Local Scalars .. 
LOGICAL ALLEIG, INDEIG, TEST, VALEIG, WANTZ CHARACTER ORDER - INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL, + INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, $ INDISP, INDIWO, INDTAU, INDWRK, ISCALE, ITMP1, $ J, JJ, NSPLIT DOUBLE PRECISION ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, @@ -424,17 +424,16 @@ SUBROUTINE DSPEVX( JOBZ, RANGE, UPLO, N, AP, VL, VU, IL, IU, ELSE ORDER = 'E' END IF - INDIBL = 1 - INDISP = INDIBL + N + INDISP = 1 + N INDIWO = INDISP + N CALL DSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL, $ WORK( INDD ), WORK( INDE ), M, NSPLIT, W, - $ IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWRK ), + $ IWORK( 1 ), IWORK( INDISP ), WORK( INDWRK ), $ IWORK( INDIWO ), INFO ) * IF( WANTZ ) THEN CALL DSTEIN( N, WORK( INDD ), WORK( INDE ), M, W, - $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ, + $ IWORK( 1 ), IWORK( INDISP ), Z, LDZ, $ WORK( INDWRK ), IWORK( INDIWO ), IFAIL, INFO ) * * Apply orthogonal matrix used in reduction to tridiagonal @@ -471,11 +470,11 @@ SUBROUTINE DSPEVX( JOBZ, RANGE, UPLO, N, AP, VL, VU, IL, IU, 30 CONTINUE * IF( I.NE.0 ) THEN - ITMP1 = IWORK( INDIBL+I-1 ) + ITMP1 = IWORK( 1 + I-1 ) W( I ) = W( J ) - IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 ) + IWORK( 1 + I-1 ) = IWORK( 1 + J-1 ) W( J ) = TMP1 - IWORK( INDIBL+J-1 ) = ITMP1 + IWORK( 1 + J-1 ) = ITMP1 CALL DSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) IF( INFO.NE.0 ) THEN ITMP1 = IFAIL( I ) diff --git a/lapack-netlib/SRC/dspgst.c b/lapack-netlib/SRC/dspgst.c index 74f3e57748..3fd0c23c17 100644 --- a/lapack-netlib/SRC/dspgst.c +++ b/lapack-netlib/SRC/dspgst.c @@ -628,7 +628,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dspgst_(integer *itype, char *uplo, integer *n, +/* Subroutine */ void dspgst_(integer *itype, char *uplo, integer *n, doublereal *ap, doublereal *bp, integer *info) { /* System generated locals */ @@ -638,19 +638,19 @@ f"> */ /* Local variables */ extern doublereal 
ddot_(integer *, doublereal *, integer *, doublereal *, integer *); - extern /* Subroutine */ int dspr2_(char *, integer *, doublereal *, + extern /* Subroutine */ void dspr2_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *); integer j, k; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), dspmv_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *); logical upper; integer j1, k1; - extern /* Subroutine */ int dtpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtpmv_(char *, char *, char *, integer *, doublereal *, doublereal *, integer *), dtpsv_(char *, char *, char *, integer *, doublereal *, doublereal *, integer *); @@ -692,7 +692,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSPGST", &i__1, (ftnlen)6); - return 0; + return; } if (*itype == 1) { @@ -831,7 +831,7 @@ f"> */ } } } - return 0; + return; /* End of DSPGST */ diff --git a/lapack-netlib/SRC/dspgv.c b/lapack-netlib/SRC/dspgv.c index 1f9e29d3fc..be8b6d3229 100644 --- a/lapack-netlib/SRC/dspgv.c +++ b/lapack-netlib/SRC/dspgv.c @@ -672,7 +672,7 @@ static integer c__1 = 1; /* > \ingroup doubleOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int dspgv_(integer *itype, char *jobz, char *uplo, integer * +/* Subroutine */ void dspgv_(integer *itype, char *jobz, char *uplo, integer * n, doublereal *ap, doublereal *bp, doublereal *w, doublereal *z__, integer *ldz, doublereal *work, integer *info) { @@ -682,16 +682,17 @@ static integer c__1 = 1; /* Local variables */ integer neig, j; 
extern logical lsame_(char *, char *); - extern /* Subroutine */ int dspev_(char *, char *, integer *, doublereal * + extern /* Subroutine */ void dspev_(char *, char *, integer *, doublereal * , doublereal *, doublereal *, integer *, doublereal *, integer *); char trans[1]; logical upper; - extern /* Subroutine */ int dtpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtpmv_(char *, char *, char *, integer *, doublereal *, doublereal *, integer *), dtpsv_(char *, char *, char *, integer *, doublereal *, doublereal *, integer *); logical wantz; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dpptrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dpptrf_( char *, integer *, doublereal *, integer *), dspgst_( integer *, char *, integer *, doublereal *, doublereal *, integer *); @@ -736,13 +737,13 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("DSPGV ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. */ @@ -750,7 +751,7 @@ static integer c__1 = 1; dpptrf_(uplo, n, &bp[1], info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. */ @@ -803,7 +804,7 @@ static integer c__1 = 1; } } } - return 0; + return; /* End of DSPGV */ diff --git a/lapack-netlib/SRC/dspgvd.c b/lapack-netlib/SRC/dspgvd.c index b80830a7b4..3927df7245 100644 --- a/lapack-netlib/SRC/dspgvd.c +++ b/lapack-netlib/SRC/dspgvd.c @@ -722,7 +722,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. 
of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int dspgvd_(integer *itype, char *jobz, char *uplo, integer * +/* Subroutine */ void dspgvd_(integer *itype, char *jobz, char *uplo, integer * n, doublereal *ap, doublereal *bp, doublereal *w, doublereal *z__, integer *ldz, doublereal *work, integer *lwork, integer *iwork, integer *liwork, integer *info) @@ -737,17 +737,18 @@ f"> */ integer lwmin; char trans[1]; logical upper; - extern /* Subroutine */ int dtpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtpmv_(char *, char *, char *, integer *, doublereal *, doublereal *, integer *), dtpsv_(char *, char *, char *, integer *, doublereal *, doublereal *, integer *); logical wantz; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dspevd_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dspevd_( char *, char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, integer *); integer liwmin; - extern /* Subroutine */ int dpptrf_(char *, integer *, doublereal *, + extern /* Subroutine */ void dpptrf_(char *, integer *, doublereal *, integer *), dspgst_(integer *, char *, integer *, doublereal *, doublereal *, integer *); logical lquery; @@ -819,15 +820,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSPGVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of BP. */ @@ -835,7 +836,7 @@ f"> */ dpptrf_(uplo, n, &bp[1], info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. 
*/ @@ -899,7 +900,7 @@ f"> */ work[1] = (doublereal) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of DSPGVD */ diff --git a/lapack-netlib/SRC/dspgvd.f b/lapack-netlib/SRC/dspgvd.f index 5563263882..df215ae1a7 100644 --- a/lapack-netlib/SRC/dspgvd.f +++ b/lapack-netlib/SRC/dspgvd.f @@ -307,8 +307,8 @@ SUBROUTINE DSPGVD( ITYPE, JOBZ, UPLO, N, AP, BP, W, Z, LDZ, WORK, CALL DSPGST( ITYPE, UPLO, N, AP, BP, INFO ) CALL DSPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK, IWORK, $ LIWORK, INFO ) - LWMIN = MAX( DBLE( LWMIN ), DBLE( WORK( 1 ) ) ) - LIWMIN = MAX( DBLE( LIWMIN ), DBLE( IWORK( 1 ) ) ) + LWMIN = INT( MAX( DBLE( LWMIN ), DBLE( WORK( 1 ) ) ) ) + LIWMIN = INT( MAX( DBLE( LIWMIN ), DBLE( IWORK( 1 ) ) ) ) * IF( WANTZ ) THEN * diff --git a/lapack-netlib/SRC/dspgvx.c b/lapack-netlib/SRC/dspgvx.c index 4d978aca40..b02296c2c8 100644 --- a/lapack-netlib/SRC/dspgvx.c +++ b/lapack-netlib/SRC/dspgvx.c @@ -783,7 +783,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int dspgvx_(integer *itype, char *jobz, char *range, char * +/* Subroutine */ void dspgvx_(integer *itype, char *jobz, char *range, char * uplo, integer *n, doublereal *ap, doublereal *bp, doublereal *vl, doublereal *vu, integer *il, integer *iu, doublereal *abstol, integer *m, doublereal *w, doublereal *z__, integer *ldz, doublereal *work, @@ -797,12 +797,13 @@ f"> */ extern logical lsame_(char *, char *); char trans[1]; logical upper; - extern /* Subroutine */ int dtpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtpmv_(char *, char *, char *, integer *, doublereal *, doublereal *, integer *), dtpsv_(char *, char *, char *, integer *, doublereal *, doublereal *, integer *); logical wantz, alleig, indeig, valeig; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dpptrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + 
extern void dpptrf_( char *, integer *, doublereal *, integer *), dspgst_( integer *, char *, integer *, doublereal *, doublereal *, integer *), dspevx_(char *, char *, char *, integer *, doublereal @@ -873,14 +874,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSPGVX", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. */ @@ -888,7 +889,7 @@ f"> */ dpptrf_(uplo, n, &bp[1], info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. */ @@ -942,7 +943,7 @@ f"> */ } } - return 0; + return; /* End of DSPGVX */ diff --git a/lapack-netlib/SRC/dsposv.c b/lapack-netlib/SRC/dsposv.c index 5ce5794dea..3e0f265336 100644 --- a/lapack-netlib/SRC/dsposv.c +++ b/lapack-netlib/SRC/dsposv.c @@ -713,7 +713,7 @@ f"> */ /* > \ingroup doublePOsolve */ /* ===================================================================== */ -/* Subroutine */ int dsposv_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void dsposv_(char *uplo, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal * x, integer *ldx, doublereal *work, real *swork, integer *iter, integer *info) @@ -730,7 +730,7 @@ f"> */ integer ptsx, i__; extern logical lsame_(char *, char *); integer iiter; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), dsymm_(char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dlag2s_(integer *, integer *, doublereal *, @@ -740,14 +740,17 @@ f"> */ integer *, integer *); extern doublereal dlamch_(char *); extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - 
doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern doublereal dlansy_(char *, char *, integer *, doublereal *, integer *, doublereal *); extern /* Subroutine */ int dpotrf_(char *, integer *, doublereal *, - integer *, integer *), dpotrs_(char *, integer *, integer - *, doublereal *, integer *, doublereal *, integer *, integer *), spotrf_(char *, integer *, real *, integer *, integer *), spotrs_(char *, integer *, integer *, real *, integer *, + integer *, integer *); + extern void dpotrs_(char *, integer *, integer + *, doublereal *, integer *, doublereal *, integer *, integer *); + extern int spotrf_(char *, integer *, real *, integer *, integer *); + extern void spotrs_(char *, integer *, integer *, real *, integer *, real *, integer *, integer *); doublereal cte, eps; @@ -803,13 +806,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSPOSV", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if (N.EQ.0). */ if (*n == 0) { - return 0; + return; } /* Skip single precision iterative refinement if a priori slower */ @@ -893,7 +896,7 @@ f"> */ /* stopping criterion. We are good to exit. */ *iter = 0; - return 0; + return; L10: @@ -950,7 +953,7 @@ f"> */ *iter = iiter; - return 0; + return; L20: @@ -973,13 +976,13 @@ f"> */ dpotrf_(uplo, n, &a[a_offset], lda, info); if (*info != 0) { - return 0; + return; } dlacpy_("All", n, nrhs, &b[b_offset], ldb, &x[x_offset], ldx); dpotrs_(uplo, n, nrhs, &a[a_offset], lda, &x[x_offset], ldx, info); - return 0; + return; /* End of DSPOSV. 
*/ diff --git a/lapack-netlib/SRC/dsprfs.c b/lapack-netlib/SRC/dsprfs.c index bd16e71f38..520916c6ab 100644 --- a/lapack-netlib/SRC/dsprfs.c +++ b/lapack-netlib/SRC/dsprfs.c @@ -693,7 +693,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsprfs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void dsprfs_(char *uplo, integer *n, integer *nrhs, doublereal *ap, doublereal *afp, integer *ipiv, doublereal *b, integer *ldb, doublereal *x, integer *ldx, doublereal *ferr, doublereal *berr, doublereal *work, integer *iwork, integer *info) @@ -709,15 +709,15 @@ f"> */ doublereal s; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer count; - extern /* Subroutine */ int dspmv_(char *, integer *, doublereal *, + extern /* Subroutine */ void dspmv_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *); logical upper; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer ik, kk; extern doublereal dlamch_(char *); @@ -726,7 +726,7 @@ f"> */ doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal lstres; - extern /* Subroutine */ int dsptrs_(char *, integer *, integer *, + extern /* Subroutine */ void dsptrs_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); doublereal eps; @@ -774,7 +774,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSPRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if 
possible */ @@ -786,7 +786,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -989,7 +989,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of DSPRFS */ diff --git a/lapack-netlib/SRC/dspsv.c b/lapack-netlib/SRC/dspsv.c index f15ef292f3..c98c5229d8 100644 --- a/lapack-netlib/SRC/dspsv.c +++ b/lapack-netlib/SRC/dspsv.c @@ -671,7 +671,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dspsv_(char *uplo, integer *n, integer *nrhs, doublereal +/* Subroutine */ void dspsv_(char *uplo, integer *n, integer *nrhs, doublereal *ap, integer *ipiv, doublereal *b, integer *ldb, integer *info) { /* System generated locals */ @@ -679,7 +679,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dsptrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dsptrf_( char *, integer *, doublereal *, integer *, integer *), dsptrs_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); @@ -717,7 +718,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("DSPSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the factorization A = U*D*U**T or A = L*D*L**T. 
*/ @@ -730,7 +731,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ dsptrs_(uplo, n, nrhs, &ap[1], &ipiv[1], &b[b_offset], ldb, info); } - return 0; + return; /* End of DSPSV */ diff --git a/lapack-netlib/SRC/dspsvx.c b/lapack-netlib/SRC/dspsvx.c index 7ef98db65f..41bfe98f8d 100644 --- a/lapack-netlib/SRC/dspsvx.c +++ b/lapack-netlib/SRC/dspsvx.c @@ -788,7 +788,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dspsvx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void dspsvx_(char *fact, char *uplo, integer *n, integer * nrhs, doublereal *ap, doublereal *afp, integer *ipiv, doublereal *b, integer *ldb, doublereal *x, integer *ldx, doublereal *rcond, doublereal *ferr, doublereal *berr, doublereal *work, integer *iwork, @@ -800,16 +800,16 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); doublereal anorm; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); extern doublereal dlamch_(char *); logical nofact; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern doublereal dlansp_(char *, char *, integer *, doublereal *, doublereal *); - extern /* Subroutine */ int dspcon_(char *, integer *, doublereal *, + extern /* Subroutine */ void dspcon_(char *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *), dsprfs_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, @@ -866,7 +866,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); 
xerbla_("DSPSVX", &i__1, (ftnlen)6); - return 0; + return; } if (nofact) { @@ -881,7 +881,7 @@ f"> */ if (*info > 0) { *rcond = 0.; - return 0; + return; } } @@ -911,7 +911,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of DSPSVX */ diff --git a/lapack-netlib/SRC/dsptrd.c b/lapack-netlib/SRC/dsptrd.c index 9e0821bbe0..d613b57990 100644 --- a/lapack-netlib/SRC/dsptrd.c +++ b/lapack-netlib/SRC/dsptrd.c @@ -665,7 +665,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dsptrd_(char *uplo, integer *n, doublereal *ap, +/* Subroutine */ void dsptrd_(char *uplo, integer *n, doublereal *ap, doublereal *d__, doublereal *e, doublereal *tau, integer *info) { /* System generated locals */ @@ -675,20 +675,21 @@ f"> */ extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *, integer *); doublereal taui; - extern /* Subroutine */ int dspr2_(char *, integer *, doublereal *, + extern /* Subroutine */ void dspr2_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *); integer i__; doublereal alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), dspmv_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer i1; logical upper; integer ii; - extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *, - integer *, doublereal *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlarfg_(integer *, doublereal *, doublereal *, + integer *, doublereal *); + extern int xerbla_(char *, integer *, ftnlen); integer i1i1; @@ -720,13 +721,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSPTRD", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if 
possible */ if (*n <= 0) { - return 0; + return; } if (upper) { @@ -828,7 +829,7 @@ f"> */ d__[*n] = ap[ii]; } - return 0; + return; /* End of DSPTRD */ diff --git a/lapack-netlib/SRC/dsptrf.c b/lapack-netlib/SRC/dsptrf.c index 61b6c7491f..6040c539dd 100644 --- a/lapack-netlib/SRC/dsptrf.c +++ b/lapack-netlib/SRC/dsptrf.c @@ -672,7 +672,7 @@ f"> */ /* > J. Lewis, Boeing Computer Services Company */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dsptrf_(char *uplo, integer *n, doublereal *ap, integer * +/* Subroutine */ void dsptrf_(char *uplo, integer *n, doublereal *ap, integer * ipiv, integer *info) { /* System generated locals */ @@ -681,14 +681,14 @@ f"> */ /* Local variables */ integer imax, jmax; - extern /* Subroutine */ int dspr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dspr_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *); integer i__, j, k; doublereal t, alpha; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); integer kstep; logical upper; @@ -729,7 +729,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSPTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. 
*/ @@ -1168,7 +1168,7 @@ f"> */ } L110: - return 0; + return; /* End of DSPTRF */ diff --git a/lapack-netlib/SRC/dsptri.c b/lapack-netlib/SRC/dsptri.c index eef8a578ce..2ecb43c469 100644 --- a/lapack-netlib/SRC/dsptri.c +++ b/lapack-netlib/SRC/dsptri.c @@ -624,7 +624,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsptri_(char *uplo, integer *n, doublereal *ap, integer * +/* Subroutine */ void dsptri_(char *uplo, integer *n, doublereal *ap, integer * ipiv, doublereal *work, integer *info) { /* System generated locals */ @@ -638,11 +638,11 @@ f"> */ integer j, k; doublereal t; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); integer kstep; - extern /* Subroutine */ int dspmv_(char *, integer *, doublereal *, + extern /* Subroutine */ void dspmv_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *); logical upper; @@ -680,13 +680,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSPTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -698,7 +698,7 @@ f"> */ kp = *n * (*n + 1) / 2; for (*info = *n; *info >= 1; --(*info)) { if (ipiv[*info] > 0 && ap[kp] == 0.) { - return 0; + return; } kp -= *info; /* L10: */ @@ -711,7 +711,7 @@ f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (ipiv[*info] > 0 && ap[kp] == 0.) 
{ - return 0; + return; } kp = kp + *n - *info + 1; /* L20: */ @@ -952,7 +952,7 @@ f"> */ ; } - return 0; + return; /* End of DSPTRI */ diff --git a/lapack-netlib/SRC/dsptrs.c b/lapack-netlib/SRC/dsptrs.c index 759e14f705..c31a7c7966 100644 --- a/lapack-netlib/SRC/dsptrs.c +++ b/lapack-netlib/SRC/dsptrs.c @@ -630,7 +630,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsptrs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void dsptrs_(char *uplo, integer *n, integer *nrhs, doublereal *ap, integer *ipiv, doublereal *b, integer *ldb, integer * info) { @@ -639,16 +639,16 @@ f"> */ doublereal d__1; /* Local variables */ - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); doublereal akm1k; integer j, k; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); doublereal denom; - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); @@ -690,13 +690,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSPTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -1003,7 +1003,7 @@ f"> */ ; } - return 0; + return; /* End of DSPTRS */ diff --git a/lapack-netlib/SRC/dstebz.c b/lapack-netlib/SRC/dstebz.c index 973b1faf60..608ebe5100 100644 --- a/lapack-netlib/SRC/dstebz.c +++ b/lapack-netlib/SRC/dstebz.c @@ -789,7 
+789,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dstebz_(char *range, char *order, integer *n, doublereal +/* Subroutine */ void dstebz_(char *range, char *order, integer *n, doublereal *vl, doublereal *vu, integer *il, integer *iu, doublereal *abstol, doublereal *d__, doublereal *e, integer *m, integer *nsplit, doublereal *w, integer *iblock, integer *isplit, doublereal *work, @@ -815,7 +815,7 @@ f"> */ integer ibegin; doublereal gu; integer iw; - extern /* Subroutine */ int dlaebz_(integer *, integer *, integer *, + extern /* Subroutine */ void dlaebz_(integer *, integer *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, @@ -901,7 +901,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSTEBZ", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize error flags */ @@ -914,7 +914,7 @@ f"> */ *m = 0; if (*n == 0) { - return 0; + return; } /* Simplifications: */ @@ -948,7 +948,7 @@ f"> */ iblock[1] = 1; *m = 1; } - return 0; + return; } /* Compute Splitting Points */ @@ -1061,7 +1061,7 @@ f"> */ if (nwl < 0 || nwl >= *n || nwu < 1 || nwu > *n) { *info = 4; - return 0; + return; } } else { @@ -1364,7 +1364,7 @@ f"> */ if (toofew) { *info += 2; } - return 0; + return; /* End of DSTEBZ */ diff --git a/lapack-netlib/SRC/dstedc.c b/lapack-netlib/SRC/dstedc.c index ef2eeabe89..cd3ba9a665 100644 --- a/lapack-netlib/SRC/dstedc.c +++ b/lapack-netlib/SRC/dstedc.c @@ -705,7 +705,7 @@ f"> */ /* > Modified by Francoise Tisseur, University of Tennessee */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dstedc_(char *compz, integer *n, doublereal *d__, +/* Subroutine */ void dstedc_(char *compz, integer *n, doublereal *d__, doublereal *e, doublereal *z__, integer 
*ldz, doublereal *work, integer *lwork, integer *iwork, integer *liwork, integer *info) { @@ -717,19 +717,19 @@ f"> */ doublereal tiny; integer i__, j, k, m; doublereal p; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); integer lwmin; - extern /* Subroutine */ int dlaed0_(integer *, integer *, integer *, + extern /* Subroutine */ void dlaed0_(integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer start, ii; extern doublereal dlamch_(char *); - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *), @@ -740,10 +740,10 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer finish; extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *); - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *), dlasrt_(char *, integer *, doublereal *, integer *); integer liwmin, icompz; - extern /* Subroutine */ int dsteqr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsteqr_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *); doublereal orgnrm; logical lquery; @@ -806,10 +806,10 @@ f"> */ lwmin = *n 
- 1 << 1; } else { lgn = (integer) (log((doublereal) (*n)) / log(2.)); - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } if (icompz == 1) { @@ -837,21 +837,21 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSTEDC", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { if (icompz != 0) { z__[z_dim1 + 1] = 1.; } - return 0; + return; } /* If the following conditional clause is removed, then the routine */ @@ -1033,7 +1033,7 @@ f"> */ work[1] = (doublereal) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of DSTEDC */ diff --git a/lapack-netlib/SRC/dstegr.c b/lapack-netlib/SRC/dstegr.c index 592c8b5971..d64c439ae4 100644 --- a/lapack-netlib/SRC/dstegr.c +++ b/lapack-netlib/SRC/dstegr.c @@ -772,7 +772,7 @@ f"> */ /* > Christof Voemel, LBNL/NERSC, USA \n */ /* ===================================================================== */ -/* Subroutine */ int dstegr_(char *jobz, char *range, integer *n, doublereal * +/* Subroutine */ void dstegr_(char *jobz, char *range, integer *n, doublereal * d__, doublereal *e, doublereal *vl, doublereal *vu, integer *il, integer *iu, doublereal *abstol, integer *m, doublereal *w, doublereal *z__, integer *ldz, integer *isuppz, doublereal *work, @@ -782,7 +782,7 @@ f"> */ integer z_dim1, z_offset; /* Local variables */ - extern /* Subroutine */ int dstemr_(char *, char *, integer *, doublereal + extern /* Subroutine */ void dstemr_(char *, char *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, integer *, logical *, doublereal *, integer *, integer *, integer @@ -818,6 +818,6 @@ f"> */ /* End of DSTEGR */ - return 0; + return; } /* dstegr_ */ diff --git a/lapack-netlib/SRC/dstein.c b/lapack-netlib/SRC/dstein.c 
index 22798e355a..a916d8ad17 100644 --- a/lapack-netlib/SRC/dstein.c +++ b/lapack-netlib/SRC/dstein.c @@ -688,7 +688,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dstein_(integer *n, doublereal *d__, doublereal *e, +/* Subroutine */ void dstein_(integer *n, doublereal *d__, doublereal *e, integer *m, doublereal *w, integer *iblock, integer *isplit, doublereal *z__, integer *ldz, doublereal *work, integer *iwork, integer *ifail, integer *info) @@ -704,28 +704,29 @@ f"> */ integer jmax; extern doublereal dnrm2_(integer *, doublereal *, integer *); integer i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); integer iseed[4], gpind, iinfo; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer b1; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer j1; doublereal ortol; integer indrv1, indrv2, indrv3, indrv4, indrv5, bn; extern doublereal dlamch_(char *); - extern /* Subroutine */ int dlagtf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlagtf_(integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer * , integer *); doublereal xj; extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dlagts_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dlagts_( integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer nrmchk; - extern /* 
Subroutine */ int dlarnv_(integer *, integer *, integer *, + extern /* Subroutine */ void dlarnv_(integer *, integer *, integer *, doublereal *); integer blksiz; doublereal onenrm, dtpcrt, pertol, scl, eps, sep, nrm, tol; @@ -791,16 +792,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSTEIN", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *m == 0) { - return 0; + return; } else if (*n == 1) { z__[z_dim1 + 1] = 1.; - return 0; + return; } /* Get machine constants. */ @@ -1015,7 +1016,7 @@ f"> */ ; } - return 0; + return; /* End of DSTEIN */ diff --git a/lapack-netlib/SRC/dstemr.c b/lapack-netlib/SRC/dstemr.c index fdafc23fe9..42b58bf61a 100644 --- a/lapack-netlib/SRC/dstemr.c +++ b/lapack-netlib/SRC/dstemr.c @@ -833,7 +833,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int dstemr_(char *jobz, char *range, integer *n, doublereal * +/* Subroutine */ void dstemr_(char *jobz, char *range, integer *n, doublereal * d__, doublereal *e, doublereal *vl, doublereal *vu, integer *il, integer *iu, integer *m, doublereal *w, doublereal *z__, integer *ldz, integer *nzc, integer *isuppz, logical *tryrac, doublereal *work, @@ -848,24 +848,24 @@ f"> */ doublereal rmin, rmax; integer itmp; doublereal tnrm; - extern /* Subroutine */ int dlae2_(doublereal *, doublereal *, doublereal + extern /* Subroutine */ void dlae2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); integer inde2, itmp2; doublereal rtol1, rtol2; integer i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal scale; integer indgp; extern logical lsame_(char *, char *); integer iinfo, iindw, ilast; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ 
void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); integer lwmin; logical wantz; doublereal r1, r2; - extern /* Subroutine */ int dlaev2_(doublereal *, doublereal *, + extern /* Subroutine */ void dlaev2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); integer jj; @@ -876,7 +876,7 @@ f"> */ integer ibegin, iindbl; doublereal sn, wl; logical valeig; - extern /* Subroutine */ int dlarrc_(char *, integer *, doublereal *, + extern /* Subroutine */ void dlarrc_(char *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, integer *, integer *), dlarre_(char *, integer *, doublereal *, doublereal *, integer *, integer *, @@ -886,14 +886,15 @@ f"> */ doublereal *, doublereal *, doublereal *, integer *, integer *); integer wbegin; doublereal safmin, wu; - extern /* Subroutine */ int dlarrj_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlarrj_(integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal bignum; integer inderr, iindwk, indgrs, offset; extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *); - extern /* Subroutine */ int dlarrr_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlarrr_(integer *, doublereal *, doublereal *, integer *), dlarrv_(integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, @@ -1027,16 +1028,16 @@ f"> */ i__1 = -(*info); xerbla_("DSTEMR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery || zquery) { - return 0; + return; } /* Handle N = 0, 
1, and 2 cases immediately */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -1054,7 +1055,7 @@ f"> */ isuppz[1] = 1; isuppz[2] = 1; } - return 0; + return; } if (*n == 2) { @@ -1202,7 +1203,7 @@ f"> */ work[indgrs], &pivmin, &work[indwrk], &iwork[iindwk], &iinfo); if (iinfo != 0) { *info = abs(iinfo) + 10; - return 0; + return; } /* Note that if RANGE .NE. 'V', DLARRE computes bounds on the desired */ /* part of the spectrum. All desired eigenvalues are contained in */ @@ -1219,7 +1220,7 @@ f"> */ iindwk], &iinfo); if (iinfo != 0) { *info = abs(iinfo) + 20; - return 0; + return; } } else { /* DLARRE computes eigenvalues of the (shifted) root representation */ @@ -1288,7 +1289,7 @@ f"> */ dlasrt_("I", m, &w[1], &iinfo); if (iinfo != 0) { *info = 3; - return 0; + return; } } else { i__1 = *m - 1; @@ -1325,7 +1326,7 @@ f"> */ work[1] = (doublereal) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of DSTEMR */ diff --git a/lapack-netlib/SRC/dsteqr.c b/lapack-netlib/SRC/dsteqr.c index 95c0524cc1..d1140bea61 100644 --- a/lapack-netlib/SRC/dsteqr.c +++ b/lapack-netlib/SRC/dsteqr.c @@ -648,7 +648,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsteqr_(char *compz, integer *n, doublereal *d__, +/* Subroutine */ void dsteqr_(char *compz, integer *n, doublereal *d__, doublereal *e, doublereal *z__, integer *ldz, doublereal *work, integer *info) { @@ -658,19 +658,19 @@ f"> */ /* Local variables */ integer lend, jtot; - extern /* Subroutine */ int dlae2_(doublereal *, doublereal *, doublereal + extern /* Subroutine */ void dlae2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal b, c__, f, g; integer i__, j, k, l, m; doublereal p, r__, s; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dlasr_(char *, char *, char *, integer *, + extern /* Subroutine */ void dlasr_(char *, char *, char *, integer *, 
integer *, doublereal *, doublereal *, doublereal *, integer *); doublereal anorm; - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); integer l1; - extern /* Subroutine */ int dlaev2_(doublereal *, doublereal *, + extern /* Subroutine */ void dlaev2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); integer lendm1, lendp1; @@ -678,17 +678,17 @@ f"> */ integer ii; extern doublereal dlamch_(char *); integer mm, iscale; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *); doublereal safmin; - extern /* Subroutine */ int dlartg_(doublereal *, doublereal *, + extern /* Subroutine */ void dlartg_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal safmax; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *); - extern /* Subroutine */ int dlasrt_(char *, integer *, doublereal *, + extern /* Subroutine */ void dlasrt_(char *, integer *, doublereal *, integer *); integer lendsv; doublereal ssfmin; @@ -741,20 +741,20 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSTEQR", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { if (icompz == 2) { z__[z_dim1 + 1] = 1.; } - return 0; + return; } /* Determine the unit roundoff and over/underflow thresholds. 
*/ @@ -1168,7 +1168,7 @@ f"> */ } L190: - return 0; + return; /* End of DSTEQR */ diff --git a/lapack-netlib/SRC/dsterf.c b/lapack-netlib/SRC/dsterf.c index 168d3c940d..090c50b61c 100644 --- a/lapack-netlib/SRC/dsterf.c +++ b/lapack-netlib/SRC/dsterf.c @@ -601,7 +601,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsterf_(integer *n, doublereal *d__, doublereal *e, +/* Subroutine */ void dsterf_(integer *n, doublereal *d__, doublereal *e, integer *info) { /* System generated locals */ @@ -613,7 +613,7 @@ f"> */ integer lend; doublereal rmax; integer jtot; - extern /* Subroutine */ int dlae2_(doublereal *, doublereal *, doublereal + extern /* Subroutine */ void dlae2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal c__; integer i__, l, m; @@ -623,14 +623,14 @@ f"> */ doublereal bb; extern doublereal dlamch_(char *); integer iscale; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); doublereal oldgam, safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal safmax; extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *); - extern /* Subroutine */ int dlasrt_(char *, integer *, doublereal *, + extern /* Subroutine */ void dlasrt_(char *, integer *, doublereal *, integer *); integer lendsv; doublereal ssfmin; @@ -664,10 +664,10 @@ f"> */ *info = -1; i__1 = -(*info); xerbla_("DSTERF", &i__1, (ftnlen)6); - return 0; + return; } if (*n <= 1) { - return 0; + return; } /* Determine the unit roundoff for this environment. 
*/ @@ -998,7 +998,7 @@ f"> */ dlasrt_("I", n, &d__[1], info); L180: - return 0; + return; /* End of DSTERF */ diff --git a/lapack-netlib/SRC/dstev.c b/lapack-netlib/SRC/dstev.c index e1dfae0268..00ab2cfe0b 100644 --- a/lapack-netlib/SRC/dstev.c +++ b/lapack-netlib/SRC/dstev.c @@ -630,7 +630,7 @@ atrices */ /* > \ingroup doubleOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int dstev_(char *jobz, integer *n, doublereal *d__, +/* Subroutine */ void dstev_(char *jobz, integer *n, doublereal *d__, doublereal *e, doublereal *z__, integer *ldz, doublereal *work, integer *info) { @@ -641,7 +641,7 @@ atrices */ /* Local variables */ integer imax; doublereal rmin, rmax, tnrm; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); @@ -652,7 +652,7 @@ atrices */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *); - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *), dsteqr_(char *, integer *, doublereal *, doublereal * , doublereal *, integer *, doublereal *, integer *); doublereal smlnum, eps; @@ -692,20 +692,20 @@ atrices */ if (*info != 0) { i__1 = -(*info); xerbla_("DSTEV ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { if (wantz) { z__[z_dim1 + 1] = 1.; } - return 0; + return; } /* Get machine constants. 
*/ @@ -755,7 +755,7 @@ atrices */ dscal_(&imax, &d__1, &d__[1], &c__1); } - return 0; + return; /* End of DSTEV */ diff --git a/lapack-netlib/SRC/dstevd.c b/lapack-netlib/SRC/dstevd.c index efd4540a49..cf2f30eacc 100644 --- a/lapack-netlib/SRC/dstevd.c +++ b/lapack-netlib/SRC/dstevd.c @@ -676,7 +676,7 @@ f"> */ /* > \ingroup doubleOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int dstevd_(char *jobz, integer *n, doublereal *d__, +/* Subroutine */ void dstevd_(char *jobz, integer *n, doublereal *d__, doublereal *e, doublereal *z__, integer *ldz, doublereal *work, integer *lwork, integer *iwork, integer *liwork, integer *info) { @@ -686,7 +686,7 @@ f"> */ /* Local variables */ doublereal rmin, rmax, tnrm; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); @@ -694,14 +694,14 @@ f"> */ logical wantz; extern doublereal dlamch_(char *); integer iscale; - extern /* Subroutine */ int dstedc_(char *, integer *, doublereal *, + extern /* Subroutine */ void dstedc_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, integer *); doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *); - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *); integer liwmin; doublereal smlnum; @@ -765,22 +765,22 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSTEVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { if (wantz) { z__[z_dim1 + 1] = 1.; } 
- return 0; + return; } /* Get machine constants. */ @@ -829,7 +829,7 @@ f"> */ work[1] = (doublereal) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of DSTEVD */ diff --git a/lapack-netlib/SRC/dstevr.c b/lapack-netlib/SRC/dstevr.c index 5e1eacdd26..e5fbbdae43 100644 --- a/lapack-netlib/SRC/dstevr.c +++ b/lapack-netlib/SRC/dstevr.c @@ -820,7 +820,7 @@ f"> */ /* > California at Berkeley, USA \n */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dstevr_(char *jobz, char *range, integer *n, doublereal * +/* Subroutine */ void dstevr_(char *jobz, char *range, integer *n, doublereal * d__, doublereal *e, doublereal *vl, doublereal *vu, integer *il, integer *iu, doublereal *abstol, integer *m, doublereal *w, doublereal *z__, integer *ldz, integer *isuppz, doublereal *work, @@ -836,12 +836,12 @@ f"> */ logical test; doublereal tnrm; integer itmp1, i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); char order[1]; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); integer lwmin; @@ -858,12 +858,12 @@ f"> */ doublereal bignum; extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *); integer indisp; - extern /* Subroutine */ int dstein_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dstein_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *), dsterf_(integer *, doublereal *, doublereal *, integer *); integer indiwo; - extern /* Subroutine */ int dstebz_(char *, char *, integer *, doublereal + extern /* 
Subroutine */ void dstebz_(char *, char *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), @@ -961,16 +961,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSTEVR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -986,7 +986,7 @@ f"> */ if (wantz) { z__[z_dim1 + 1] = 1.; } - return 0; + return; } /* Get machine constants. */ @@ -1145,7 +1145,7 @@ f"> */ work[1] = (doublereal) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of DSTEVR */ diff --git a/lapack-netlib/SRC/dstevx.c b/lapack-netlib/SRC/dstevx.c index 413be6220c..1febd23eb7 100644 --- a/lapack-netlib/SRC/dstevx.c +++ b/lapack-netlib/SRC/dstevx.c @@ -740,7 +740,7 @@ f"> */ /* > \ingroup doubleOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int dstevx_(char *jobz, char *range, integer *n, doublereal * +/* Subroutine */ void dstevx_(char *jobz, char *range, integer *n, doublereal * d__, doublereal *e, doublereal *vl, doublereal *vu, integer *il, integer *iu, doublereal *abstol, integer *m, doublereal *w, doublereal *z__, integer *ldz, doublereal *work, integer *iwork, @@ -756,12 +756,12 @@ f"> */ logical test; doublereal tnrm; integer itmp1, i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); char order[1]; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); logical wantz; @@ -775,17 +775,17 @@ 
f"> */ doublereal bignum; extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *); integer indisp; - extern /* Subroutine */ int dstein_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dstein_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *), dsterf_(integer *, doublereal *, doublereal *, integer *); integer indiwo; - extern /* Subroutine */ int dstebz_(char *, char *, integer *, doublereal + extern /* Subroutine */ void dstebz_(char *, char *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); integer indwrk; - extern /* Subroutine */ int dsteqr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsteqr_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer nsplit; doublereal smlnum, eps, vll, vuu, tmp1; @@ -848,14 +848,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSTEVX", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -871,7 +871,7 @@ f"> */ if (wantz) { z__[z_dim1 + 1] = 1.; } - return 0; + return; } /* Get machine constants. */ @@ -1017,7 +1017,7 @@ f"> */ } } - return 0; + return; /* End of DSTEVX */ diff --git a/lapack-netlib/SRC/dstevx.f b/lapack-netlib/SRC/dstevx.f index f93c1d3b75..e0cd07da05 100644 --- a/lapack-netlib/SRC/dstevx.f +++ b/lapack-netlib/SRC/dstevx.f @@ -248,7 +248,7 @@ SUBROUTINE DSTEVX( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, ABSTOL, * .. Local Scalars .. 
LOGICAL ALLEIG, INDEIG, TEST, VALEIG, WANTZ CHARACTER ORDER - INTEGER I, IMAX, INDIBL, INDISP, INDIWO, INDWRK, + INTEGER I, IMAX, INDISP, INDIWO, INDWRK, $ ISCALE, ITMP1, J, JJ, NSPLIT DOUBLE PRECISION BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, SMLNUM, $ TMP1, TNRM, VLL, VUU @@ -399,15 +399,14 @@ SUBROUTINE DSTEVX( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, ABSTOL, ORDER = 'E' END IF INDWRK = 1 - INDIBL = 1 - INDISP = INDIBL + N + INDISP = 1 + N INDIWO = INDISP + N CALL DSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTOL, D, E, M, - $ NSPLIT, W, IWORK( INDIBL ), IWORK( INDISP ), + $ NSPLIT, W, IWORK( 1 ), IWORK( INDISP ), $ WORK( INDWRK ), IWORK( INDIWO ), INFO ) * IF( WANTZ ) THEN - CALL DSTEIN( N, D, E, M, W, IWORK( INDIBL ), IWORK( INDISP ), + CALL DSTEIN( N, D, E, M, W, IWORK( 1 ), IWORK( INDISP ), $ Z, LDZ, WORK( INDWRK ), IWORK( INDIWO ), IFAIL, $ INFO ) END IF @@ -439,11 +438,11 @@ SUBROUTINE DSTEVX( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, ABSTOL, 30 CONTINUE * IF( I.NE.0 ) THEN - ITMP1 = IWORK( INDIBL+I-1 ) + ITMP1 = IWORK( 1 + I-1 ) W( I ) = W( J ) - IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 ) + IWORK( 1 + I-1 ) = IWORK( 1 + J-1 ) W( J ) = TMP1 - IWORK( INDIBL+J-1 ) = ITMP1 + IWORK( 1 + J-1 ) = ITMP1 CALL DSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) IF( INFO.NE.0 ) THEN ITMP1 = IFAIL( I ) diff --git a/lapack-netlib/SRC/dsycon.c b/lapack-netlib/SRC/dsycon.c index c9554631e1..751cd076d4 100644 --- a/lapack-netlib/SRC/dsycon.c +++ b/lapack-netlib/SRC/dsycon.c @@ -642,7 +642,7 @@ f"> */ /* > \ingroup doubleSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsycon_(char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ void dsycon_(char *uplo, integer *n, doublereal *a, integer * lda, integer *ipiv, doublereal *anorm, doublereal *rcond, doublereal * work, integer *iwork, integer *info) { @@ -654,11 +654,11 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* 
Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, - integer *, doublereal *, integer *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, + integer *, doublereal *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; - extern /* Subroutine */ int dsytrs_(char *, integer *, integer *, + extern /* Subroutine */ void dsytrs_(char *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); @@ -697,7 +697,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -705,9 +705,9 @@ f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm <= 0.) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -718,7 +718,7 @@ f"> */ for (i__ = *n; i__ >= 1; --i__) { if (ipiv[i__] > 0 && a[i__ + i__ * a_dim1] == 0.) { - return 0; + return; } /* L10: */ } @@ -729,7 +729,7 @@ f"> */ i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { if (ipiv[i__] > 0 && a[i__ + i__ * a_dim1] == 0.) { - return 0; + return; } /* L20: */ } @@ -755,7 +755,7 @@ f"> */ *rcond = 1. 
/ ainvnm / *anorm; } - return 0; + return; /* End of DSYCON */ diff --git a/lapack-netlib/SRC/dsycon_3.c b/lapack-netlib/SRC/dsycon_3.c index 4aff37b54a..9f9cb0039d 100644 --- a/lapack-netlib/SRC/dsycon_3.c +++ b/lapack-netlib/SRC/dsycon_3.c @@ -683,7 +683,7 @@ static integer c__1 = 1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsycon_3_(char *uplo, integer *n, doublereal *a, +/* Subroutine */ void dsycon_3_(char *uplo, integer *n, doublereal *a, integer *lda, doublereal *e, integer *ipiv, doublereal *anorm, doublereal *rcond, doublereal *work, integer *iwork, integer *info) { @@ -692,16 +692,16 @@ static integer c__1 = 1; /* Local variables */ integer kase; - extern /* Subroutine */ int dsytrs_3_(char *, integer *, integer *, + extern /* Subroutine */ void dsytrs_3_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer i__; extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, - integer *, doublereal *, integer *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, + integer *, doublereal *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; @@ -740,7 +740,7 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("DSYCON_3", &i__1, (ftnlen)8); - return 0; + return; } /* Quick return if possible */ @@ -748,9 +748,9 @@ static integer c__1 = 1; *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm <= 0.) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -761,7 +761,7 @@ static integer c__1 = 1; for (i__ = *n; i__ >= 1; --i__) { if (ipiv[i__] > 0 && a[i__ + i__ * a_dim1] == 0.) 
{ - return 0; + return; } } } else { @@ -771,7 +771,7 @@ static integer c__1 = 1; i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { if (ipiv[i__] > 0 && a[i__ + i__ * a_dim1] == 0.) { - return 0; + return; } } } @@ -796,7 +796,7 @@ static integer c__1 = 1; *rcond = 1. / ainvnm / *anorm; } - return 0; + return; /* End of DSYCON_3 */ diff --git a/lapack-netlib/SRC/dsycon_rook.c b/lapack-netlib/SRC/dsycon_rook.c index 5d7da8bdc7..c339fc39c2 100644 --- a/lapack-netlib/SRC/dsycon_rook.c +++ b/lapack-netlib/SRC/dsycon_rook.c @@ -656,7 +656,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsycon_rook_(char *uplo, integer *n, doublereal *a, +/* Subroutine */ void dsycon_rook_(char *uplo, integer *n, doublereal *a, integer *lda, integer *ipiv, doublereal *anorm, doublereal *rcond, doublereal *work, integer *iwork, integer *info) { @@ -664,16 +664,16 @@ rook.f"> */ integer a_dim1, a_offset, i__1; /* Local variables */ - extern /* Subroutine */ int dsytrs_rook_(char *, integer *, integer *, + extern /* Subroutine */ void dsytrs_rook_(char *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); integer kase, i__; extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, - integer *, doublereal *, integer *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, + integer *, doublereal *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; @@ -711,7 +711,7 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYCON_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Quick return if possible */ @@ -719,9 +719,9 @@ rook.f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm <= 0.) 
{ - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -732,7 +732,7 @@ rook.f"> */ for (i__ = *n; i__ >= 1; --i__) { if (ipiv[i__] > 0 && a[i__ + i__ * a_dim1] == 0.) { - return 0; + return; } /* L10: */ } @@ -743,7 +743,7 @@ rook.f"> */ i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { if (ipiv[i__] > 0 && a[i__ + i__ * a_dim1] == 0.) { - return 0; + return; } /* L20: */ } @@ -769,7 +769,7 @@ rook.f"> */ *rcond = 1. / ainvnm / *anorm; } - return 0; + return; /* End of DSYCON_ROOK */ diff --git a/lapack-netlib/SRC/dsyconv.c b/lapack-netlib/SRC/dsyconv.c index 38a830639d..192321ef64 100644 --- a/lapack-netlib/SRC/dsyconv.c +++ b/lapack-netlib/SRC/dsyconv.c @@ -623,7 +623,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doubleSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsyconv_(char *uplo, char *way, integer *n, doublereal * +/* Subroutine */ void dsyconv_(char *uplo, char *way, integer *n, doublereal * a, integer *lda, integer *ipiv, doublereal *e, integer *info) { /* System generated locals */ @@ -672,13 +672,13 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("DSYCONV", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -880,7 +880,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } } - return 0; + return; /* End of DSYCONV */ diff --git a/lapack-netlib/SRC/dsyconvf.c b/lapack-netlib/SRC/dsyconvf.c index d3a5898708..3947a6b56a 100644 --- a/lapack-netlib/SRC/dsyconvf.c +++ b/lapack-netlib/SRC/dsyconvf.c @@ -715,7 +715,7 @@ f.f"> */ /* > */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsyconvf_(char *uplo, char *way, integer *n, doublereal * +/* 
Subroutine */ void dsyconvf_(char *uplo, char *way, integer *n, doublereal * a, integer *lda, doublereal *e, integer *ipiv, integer *info) { /* System generated locals */ @@ -724,7 +724,7 @@ f.f"> */ /* Local variables */ integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); logical upper; integer ip; @@ -765,13 +765,13 @@ f.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYCONVF", &i__1, (ftnlen)8); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -1072,7 +1072,7 @@ f.f"> */ /* End A is LOWER */ } - return 0; + return; /* End of DSYCONVF */ diff --git a/lapack-netlib/SRC/dsyconvf_rook.c b/lapack-netlib/SRC/dsyconvf_rook.c index 5c546aeb3d..49b4fcc190 100644 --- a/lapack-netlib/SRC/dsyconvf_rook.c +++ b/lapack-netlib/SRC/dsyconvf_rook.c @@ -706,7 +706,7 @@ f_rook.f"> */ /* > */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsyconvf_rook_(char *uplo, char *way, integer *n, +/* Subroutine */ void dsyconvf_rook_(char *uplo, char *way, integer *n, doublereal *a, integer *lda, doublereal *e, integer *ipiv, integer * info) { @@ -716,7 +716,7 @@ f_rook.f"> */ /* Local variables */ integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); logical upper; integer ip; @@ -758,13 +758,13 @@ f_rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYCONVF_ROOK", &i__1, (ftnlen)13); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -1063,7 +1063,7 @@ f_rook.f"> */ /* End A is LOWER */ } - return 0; + return; /* End of DSYCONVF_ROOK */ diff --git 
a/lapack-netlib/SRC/dsyequb.c b/lapack-netlib/SRC/dsyequb.c index a9d8d00cc2..84cb498869 100644 --- a/lapack-netlib/SRC/dsyequb.c +++ b/lapack-netlib/SRC/dsyequb.c @@ -644,7 +644,7 @@ static integer c__1 = 1; /* > Tech report version: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.3.1679 */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dsyequb_(char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ void dsyequb_(char *uplo, integer *n, doublereal *a, integer * lda, doublereal *s, doublereal *scond, doublereal *amax, doublereal * work, integer *info) { @@ -665,7 +665,7 @@ static integer c__1 = 1; logical up; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; - extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); doublereal smlnum, avg, std, tol; @@ -700,7 +700,7 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("DSYEQUB", &i__1, (ftnlen)7); - return 0; + return; } up = lsame_(uplo, "U"); *amax = 0.; @@ -709,7 +709,7 @@ static integer c__1 = 1; if (*n == 0) { *scond = 1.; - return 0; + return; } i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { @@ -826,7 +826,7 @@ static integer c__1 = 1; d__ = c1 * c1 - c0 * 4 * c2; if (d__ <= 0.) 
{ *info = -1; - return 0; + return; } si = c0 * -2 / (c1 + sqrt(d__)); d__ = si - s[i__]; @@ -883,6 +883,6 @@ static integer c__1 = 1; } *scond = f2cmax(smin,smlnum) / f2cmin(smax,bignum); - return 0; + return; } /* dsyequb_ */ diff --git a/lapack-netlib/SRC/dsyev.c b/lapack-netlib/SRC/dsyev.c index 3c0a8c2423..50801fc310 100644 --- a/lapack-netlib/SRC/dsyev.c +++ b/lapack-netlib/SRC/dsyev.c @@ -649,7 +649,7 @@ ices */ /* > \ingroup doubleSYeigen */ /* ===================================================================== */ -/* Subroutine */ int dsyev_(char *jobz, char *uplo, integer *n, doublereal *a, +/* Subroutine */ void dsyev_(char *jobz, char *uplo, integer *n, doublereal *a, integer *lda, doublereal *w, doublereal *work, integer *lwork, integer *info) { @@ -662,7 +662,7 @@ ices */ doublereal anrm; integer imax; doublereal rmin, rmax; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); @@ -671,7 +671,7 @@ ices */ integer nb; extern doublereal dlamch_(char *); integer iscale; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); doublereal safmin; @@ -680,12 +680,12 @@ ices */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; integer indtau; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *); extern doublereal dlansy_(char *, char *, integer *, doublereal *, integer *, doublereal *); integer indwrk; - extern /* Subroutine */ int dorgtr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dorgtr_(char *, integer *, doublereal *, integer *, doublereal *, doublereal *, 
integer *, integer *), dsteqr_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *), dsytrd_(char *, integer *, doublereal *, integer *, doublereal *, @@ -749,15 +749,15 @@ ices */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYEV ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -766,7 +766,7 @@ ices */ if (wantz) { a[a_dim1 + 1] = 1.; } - return 0; + return; } /* Get machine constants. */ @@ -831,7 +831,7 @@ ices */ work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DSYEV */ diff --git a/lapack-netlib/SRC/dsyev_2stage.c b/lapack-netlib/SRC/dsyev_2stage.c index a9fc841c9d..0b9f7283c4 100644 --- a/lapack-netlib/SRC/dsyev_2stage.c +++ b/lapack-netlib/SRC/dsyev_2stage.c @@ -702,7 +702,7 @@ SY matrices */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsyev_2stage_(char *jobz, char *uplo, integer *n, +/* Subroutine */ void dsyev_2stage_(char *jobz, char *uplo, integer *n, doublereal *a, integer *lda, doublereal *w, doublereal *work, integer *lwork, integer *info) { @@ -716,12 +716,12 @@ SY matrices */ integer *, integer *, integer *); integer imax; doublereal anrm, rmin, rmax; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int dsytrd_2stage_(char *, char *, integer *, + extern /* Subroutine */ void dsytrd_2stage_(char *, char *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer lhtrd, lwmin; @@ -731,19 +731,19 @@ SY matrices */ integer ib, kd; extern doublereal dlamch_(char *); integer iscale; - extern /* 
Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; integer indtau; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *); extern doublereal dlansy_(char *, char *, integer *, doublereal *, integer *, doublereal *); integer indwrk; - extern /* Subroutine */ int dorgtr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dorgtr_(char *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dsteqr_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer llwork; @@ -807,15 +807,15 @@ SY matrices */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYEV_2STAGE ", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -824,7 +824,7 @@ SY matrices */ if (wantz) { a[a_dim1 + 1] = 1.; } - return 0; + return; } /* Get machine constants. */ @@ -872,7 +872,7 @@ SY matrices */ } else { /* Not available in this release, and argument checking should not */ /* let it getting here */ - return 0; + return; dorgtr_(uplo, n, &a[a_offset], lda, &work[indtau], &work[indwrk], & llwork, &iinfo); dsteqr_(jobz, n, &w[1], &work[inde], &a[a_offset], lda, &work[indtau], @@ -895,7 +895,7 @@ SY matrices */ work[1] = (doublereal) lwmin; - return 0; + return; /* End of DSYEV_2STAGE */ diff --git a/lapack-netlib/SRC/dsyevd.c b/lapack-netlib/SRC/dsyevd.c index 7e1810f1d9..16f71d9b63 100644 --- a/lapack-netlib/SRC/dsyevd.c +++ b/lapack-netlib/SRC/dsyevd.c @@ -699,7 +699,7 @@ f"> */ /* > Modified description of INFO. 
Sven, 16 Feb 05. \n */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dsyevd_(char *jobz, char *uplo, integer *n, doublereal * +/* Subroutine */ void dsyevd_(char *jobz, char *uplo, integer *n, doublereal * a, integer *lda, doublereal *w, doublereal *work, integer *lwork, integer *iwork, integer *liwork, integer *info) { @@ -711,7 +711,7 @@ f"> */ integer inde; doublereal anrm, rmin, rmax; integer lopt; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); @@ -720,7 +720,7 @@ f"> */ integer indwk2, llwrk2; extern doublereal dlamch_(char *); integer iscale; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dstedc_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, @@ -733,12 +733,12 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; integer indtau; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *); extern doublereal dlansy_(char *, char *, integer *, doublereal *, integer *, doublereal *); integer indwrk, liwmin; - extern /* Subroutine */ int dormtr_(char *, char *, char *, integer *, + extern /* Subroutine */ void dormtr_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *), dsytrd_(char *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, @@ -820,15 +820,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYEVD", &i__1, (ftnlen)6); 
- return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -836,7 +836,7 @@ f"> */ if (wantz) { a[a_dim1 + 1] = 1.; } - return 0; + return; } /* Get machine constants. */ @@ -901,7 +901,7 @@ f"> */ work[1] = (doublereal) lopt; iwork[1] = liopt; - return 0; + return; /* End of DSYEVD */ diff --git a/lapack-netlib/SRC/dsyevd.f b/lapack-netlib/SRC/dsyevd.f index edbe896fe8..eaaecd8d98 100644 --- a/lapack-netlib/SRC/dsyevd.f +++ b/lapack-netlib/SRC/dsyevd.f @@ -257,7 +257,7 @@ SUBROUTINE DSYEVD( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, IWORK, LWMIN = 2*N + 1 END IF LOPT = MAX( LWMIN, 2*N + - $ ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) ) + $ N*ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) ) LIOPT = LIWMIN END IF WORK( 1 ) = LOPT diff --git a/lapack-netlib/SRC/dsyevd_2stage.c b/lapack-netlib/SRC/dsyevd_2stage.c index ba5cdb7fc9..a2fdcc74d0 100644 --- a/lapack-netlib/SRC/dsyevd_2stage.c +++ b/lapack-netlib/SRC/dsyevd_2stage.c @@ -746,7 +746,7 @@ static doublereal c_b27 = 1.; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsyevd_2stage_(char *jobz, char *uplo, integer *n, +/* Subroutine */ void dsyevd_2stage_(char *jobz, char *uplo, integer *n, doublereal *a, integer *lda, doublereal *w, doublereal *work, integer *lwork, integer *iwork, integer *liwork, integer *info) { @@ -759,12 +759,12 @@ static doublereal c_b27 = 1.; extern integer ilaenv2stage_(integer *, char *, char *, integer *, integer *, integer *, integer *); doublereal anrm, rmin, rmax; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int dsytrd_2stage_(char *, char *, integer *, + extern /* Subroutine */ void dsytrd_2stage_(char *, char *, 
integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer lhtrd, lwmin; @@ -774,7 +774,7 @@ static doublereal c_b27 = 1.; integer indwk2, ib, llwrk2, kd; extern doublereal dlamch_(char *); integer iscale; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dstedc_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, @@ -785,12 +785,12 @@ static doublereal c_b27 = 1.; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; integer indtau; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *); extern doublereal dlansy_(char *, char *, integer *, doublereal *, integer *, doublereal *); integer indwrk, liwmin; - extern /* Subroutine */ int dormtr_(char *, char *, char *, integer *, + extern /* Subroutine */ void dormtr_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer llwork; @@ -873,15 +873,15 @@ static doublereal c_b27 = 1.; if (*info != 0) { i__1 = -(*info); xerbla_("DSYEVD_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -889,7 +889,7 @@ static doublereal c_b27 = 1.; if (wantz) { a[a_dim1 + 1] = 1.; } - return 0; + return; } /* Get machine constants. 
*/ @@ -941,7 +941,7 @@ static doublereal c_b27 = 1.; } else { /* Not available in this release, and argument checking should not */ /* let it getting here */ - return 0; + return; dstedc_("I", n, &w[1], &work[inde], &work[indwrk], n, &work[indwk2], & llwrk2, &iwork[1], liwork, info); dormtr_("L", uplo, "N", n, n, &a[a_offset], lda, &work[indtau], &work[ @@ -959,7 +959,7 @@ static doublereal c_b27 = 1.; work[1] = (doublereal) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of DSYEVD_2STAGE */ diff --git a/lapack-netlib/SRC/dsyevr.c b/lapack-netlib/SRC/dsyevr.c index a15b9d195e..6f55b9f42e 100644 --- a/lapack-netlib/SRC/dsyevr.c +++ b/lapack-netlib/SRC/dsyevr.c @@ -851,7 +851,7 @@ f"> */ /* > California at Berkeley, USA \n */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dsyevr_(char *jobz, char *range, char *uplo, integer *n, +/* Subroutine */ void dsyevr_(char *jobz, char *range, char *uplo, integer *n, doublereal *a, integer *lda, doublereal *vl, doublereal *vu, integer * il, integer *iu, doublereal *abstol, integer *m, doublereal *w, doublereal *z__, integer *ldz, integer *isuppz, doublereal *work, @@ -867,14 +867,14 @@ f"> */ integer imax; doublereal rmin, rmax; integer i__, j, inddd, indee; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); integer iinfo; char order[1]; integer indwk; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); integer lwmin; @@ -890,14 +890,14 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal abstll, bignum; integer indtau, indisp; - extern /* Subroutine */ int dstein_(integer *, 
doublereal *, doublereal *, + extern /* Subroutine */ void dstein_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *), dsterf_(integer *, doublereal *, doublereal *, integer *); integer indiwo, indwkn; extern doublereal dlansy_(char *, char *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dstebz_(char *, char *, integer *, doublereal + extern /* Subroutine */ void dstebz_(char *, char *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), @@ -908,12 +908,12 @@ f"> */ *); integer liwmin; logical tryrac; - extern /* Subroutine */ int dormtr_(char *, char *, char *, integer *, + extern /* Subroutine */ void dormtr_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer llwrkn, llwork, nsplit; doublereal smlnum; - extern /* Subroutine */ int dsytrd_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsytrd_(char *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *); integer lwkopt; @@ -1014,9 +1014,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYEVR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -1024,7 +1024,7 @@ f"> */ *m = 0; if (*n == 0) { work[1] = 1.; - return 0; + return; } if (*n == 1) { @@ -1043,7 +1043,7 @@ f"> */ isuppz[1] = 1; isuppz[2] = 1; } - return 0; + return; } /* Get machine constants. 
*/ @@ -1255,7 +1255,7 @@ f"> */ work[1] = (doublereal) lwkopt; iwork[1] = liwmin; - return 0; + return; /* End of DSYEVR */ diff --git a/lapack-netlib/SRC/dsyevr_2stage.c b/lapack-netlib/SRC/dsyevr_2stage.c index 51e0ad3c77..a706e53f6c 100644 --- a/lapack-netlib/SRC/dsyevr_2stage.c +++ b/lapack-netlib/SRC/dsyevr_2stage.c @@ -898,7 +898,7 @@ static integer c_n1 = -1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsyevr_2stage_(char *jobz, char *range, char *uplo, +/* Subroutine */ void dsyevr_2stage_(char *jobz, char *range, char *uplo, integer *n, doublereal *a, integer *lda, doublereal *vl, doublereal * vu, integer *il, integer *iu, doublereal *abstol, integer *m, doublereal *w, doublereal *z__, integer *ldz, integer *isuppz, @@ -917,14 +917,14 @@ static integer c_n1 = -1; integer imax; doublereal rmin, rmax; integer i__, j, inddd, indee; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); integer iinfo; char order[1]; integer indwk, lhtrd; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *), dsytrd_2stage_(char *, char *, integer *, doublereal *, integer *, doublereal *, doublereal *, @@ -945,14 +945,14 @@ static integer c_n1 = -1; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal abstll, bignum; integer indtau, indisp; - extern /* Subroutine */ int dstein_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dstein_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *), dsterf_(integer *, 
doublereal *, doublereal *, integer *); integer indiwo, indwkn; extern doublereal dlansy_(char *, char *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dstebz_(char *, char *, integer *, doublereal + extern /* Subroutine */ void dstebz_(char *, char *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), @@ -963,7 +963,7 @@ static integer c_n1 = -1; *); integer liwmin; logical tryrac; - extern /* Subroutine */ int dormtr_(char *, char *, char *, integer *, + extern /* Subroutine */ void dormtr_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer llwrkn, llwork, nsplit; @@ -1066,9 +1066,9 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("DSYEVR_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -1076,7 +1076,7 @@ static integer c_n1 = -1; *m = 0; if (*n == 0) { work[1] = 1.; - return 0; + return; } if (*n == 1) { @@ -1095,7 +1095,7 @@ static integer c_n1 = -1; isuppz[1] = 1; isuppz[2] = 1; } - return 0; + return; } /* Get machine constants. 
*/ @@ -1311,7 +1311,7 @@ static integer c_n1 = -1; work[1] = (doublereal) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of DSYEVR_2STAGE */ diff --git a/lapack-netlib/SRC/dsyevx.c b/lapack-netlib/SRC/dsyevx.c index 0ef0796464..83927ddec3 100644 --- a/lapack-netlib/SRC/dsyevx.c +++ b/lapack-netlib/SRC/dsyevx.c @@ -766,7 +766,7 @@ f"> */ /* > \ingroup doubleSYeigen */ /* ===================================================================== */ -/* Subroutine */ int dsyevx_(char *jobz, char *range, char *uplo, integer *n, +/* Subroutine */ void dsyevx_(char *jobz, char *range, char *uplo, integer *n, doublereal *a, integer *lda, doublereal *vl, doublereal *vu, integer * il, integer *iu, doublereal *abstol, integer *m, doublereal *w, doublereal *z__, integer *ldz, doublereal *work, integer *lwork, @@ -783,13 +783,13 @@ f"> */ doublereal rmin, rmax; logical test; integer itmp1, i__, j, indee; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); integer iinfo; char order[1]; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); logical lower, wantz; @@ -798,7 +798,7 @@ f"> */ logical alleig, indeig; integer iscale, indibl; logical valeig; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *); doublereal safmin; extern integer ilaenv_(integer *, char *, char *, integer *, integer *, @@ -806,19 +806,19 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal abstll, bignum; integer indtau, indisp; - extern /* Subroutine */ int dstein_(integer *, doublereal 
*, doublereal *, + extern /* Subroutine */ void dstein_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *), dsterf_(integer *, doublereal *, doublereal *, integer *); integer indiwo, indwkn; extern doublereal dlansy_(char *, char *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dstebz_(char *, char *, integer *, doublereal + extern /* Subroutine */ void dstebz_(char *, char *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); integer indwrk, lwkmin; - extern /* Subroutine */ int dorgtr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dorgtr_(char *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dsteqr_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *), dormtr_(char *, char *, char *, integer *, integer *, doublereal * @@ -826,7 +826,7 @@ f"> */ integer *, integer *); integer llwrkn, llwork, nsplit; doublereal smlnum; - extern /* Subroutine */ int dsytrd_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsytrd_(char *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *); integer lwkopt; @@ -921,16 +921,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYEVX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -946,7 +946,7 @@ f"> */ if (wantz) { z__[z_dim1 + 1] = 1.; } - return 0; + return; } /* Get machine constants. 
*/ @@ -1130,7 +1130,7 @@ f"> */ work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DSYEVX */ diff --git a/lapack-netlib/SRC/dsyevx_2stage.c b/lapack-netlib/SRC/dsyevx_2stage.c index bf58bc6ca5..b73bdddfe7 100644 --- a/lapack-netlib/SRC/dsyevx_2stage.c +++ b/lapack-netlib/SRC/dsyevx_2stage.c @@ -816,7 +816,7 @@ static integer c__4 = 4; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsyevx_2stage_(char *jobz, char *range, char *uplo, +/* Subroutine */ void dsyevx_2stage_(char *jobz, char *range, char *uplo, integer *n, doublereal *a, integer *lda, doublereal *vl, doublereal * vu, integer *il, integer *iu, doublereal *abstol, integer *m, doublereal *w, doublereal *z__, integer *ldz, doublereal *work, @@ -835,17 +835,17 @@ static integer c__4 = 4; doublereal rmin, rmax; logical test; integer itmp1, i__, j, indee; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); integer iinfo; char order[1]; - extern /* Subroutine */ int dsytrd_2stage_(char *, char *, integer *, + extern /* Subroutine */ void dsytrd_2stage_(char *, char *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer lhtrd; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); integer lwmin; @@ -857,25 +857,25 @@ static integer c__4 = 4; logical alleig, indeig; integer iscale, indibl; logical valeig; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer 
*); doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal abstll, bignum; integer indtau, indisp; - extern /* Subroutine */ int dstein_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dstein_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *), dsterf_(integer *, doublereal *, doublereal *, integer *); integer indiwo, indwkn; extern doublereal dlansy_(char *, char *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dstebz_(char *, char *, integer *, doublereal + extern /* Subroutine */ void dstebz_(char *, char *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); integer indwrk; - extern /* Subroutine */ int dorgtr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dorgtr_(char *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dsteqr_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *), dormtr_(char *, char *, char *, integer *, integer *, doublereal * @@ -978,16 +978,16 @@ static integer c__4 = 4; if (*info != 0) { i__1 = -(*info); xerbla_("DSYEVX_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -1003,7 +1003,7 @@ static integer c__4 = 4; if (wantz) { z__[z_dim1 + 1] = 1.; } - return 0; + return; } /* Get machine constants. 
*/ @@ -1190,7 +1190,7 @@ static integer c__4 = 4; work[1] = (doublereal) lwmin; - return 0; + return; /* End of DSYEVX_2STAGE */ diff --git a/lapack-netlib/SRC/dsygs2.c b/lapack-netlib/SRC/dsygs2.c index 4d7c07bc89..da9ba7b8e8 100644 --- a/lapack-netlib/SRC/dsygs2.c +++ b/lapack-netlib/SRC/dsygs2.c @@ -643,7 +643,7 @@ f"> */ /* > \ingroup doubleSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsygs2_(integer *itype, char *uplo, integer *n, +/* Subroutine */ void dsygs2_(integer *itype, char *uplo, integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, integer * info) { @@ -652,17 +652,17 @@ f"> */ doublereal d__1; /* Local variables */ - extern /* Subroutine */ int dsyr2_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsyr2_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer k; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical upper; - extern /* Subroutine */ int dtrmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtrmv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *), dtrsv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *); doublereal ct; @@ -706,7 +706,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYGS2", &i__1, (ftnlen)6); - return 0; + return; } if (*itype == 1) { @@ -848,7 +848,7 @@ f"> */ } } } - return 0; + return; /* End of DSYGS2 */ diff --git a/lapack-netlib/SRC/dsygst.c b/lapack-netlib/SRC/dsygst.c index 49bb202cf4..b18805b890 100644 --- 
a/lapack-netlib/SRC/dsygst.c +++ b/lapack-netlib/SRC/dsygst.c @@ -645,7 +645,7 @@ f"> */ /* > \ingroup doubleSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsygst_(integer *itype, char *uplo, integer *n, +/* Subroutine */ void dsygst_(integer *itype, char *uplo, integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, integer * info) { @@ -655,20 +655,20 @@ f"> */ /* Local variables */ integer k; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrmm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), dsymm_( char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); logical upper; - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), dsygs2_( integer *, char *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer kb; - extern /* Subroutine */ int dsyr2k_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dsyr2k_(char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer nb; @@ -713,13 +713,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYGST", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine the block size for this environment. 
*/ @@ -897,7 +897,7 @@ f"> */ } } } - return 0; + return; /* End of DSYGST */ diff --git a/lapack-netlib/SRC/dsygv.c b/lapack-netlib/SRC/dsygv.c index 22de5e4757..21792c93c2 100644 --- a/lapack-netlib/SRC/dsygv.c +++ b/lapack-netlib/SRC/dsygv.c @@ -689,7 +689,7 @@ static doublereal c_b16 = 1.; /* > \ingroup doubleSYeigen */ /* ===================================================================== */ -/* Subroutine */ int dsygv_(integer *itype, char *jobz, char *uplo, integer * +/* Subroutine */ void dsygv_(integer *itype, char *jobz, char *uplo, integer * n, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *w, doublereal *work, integer *lwork, integer *info) { @@ -699,15 +699,15 @@ static doublereal c_b16 = 1.; /* Local variables */ integer neig; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrmm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); char trans[1]; - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical upper; - extern /* Subroutine */ int dsyev_(char *, char *, integer *, doublereal * + extern /* Subroutine */ void dsyev_(char *, char *, integer *, doublereal * , integer *, doublereal *, doublereal *, integer *, integer *); logical wantz; integer nb; @@ -717,7 +717,7 @@ static doublereal c_b16 = 1.; extern /* Subroutine */ int dpotrf_(char *, integer *, doublereal *, integer *, integer *); integer lwkmin; - extern /* Subroutine */ int dsygst_(integer *, char *, integer *, + extern /* Subroutine */ void dsygst_(integer *, char *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer lwkopt; logical lquery; @@ -783,15 +783,15 @@ static doublereal c_b16 = 1.; if (*info != 0) 
{ i__1 = -(*info); xerbla_("DSYGV ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. */ @@ -799,7 +799,7 @@ static doublereal c_b16 = 1.; dpotrf_(uplo, n, &b[b_offset], ldb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. */ @@ -846,7 +846,7 @@ static doublereal c_b16 = 1.; } work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DSYGV */ diff --git a/lapack-netlib/SRC/dsygv_2stage.c b/lapack-netlib/SRC/dsygv_2stage.c index 341534833f..92e12102ec 100644 --- a/lapack-netlib/SRC/dsygv_2stage.c +++ b/lapack-netlib/SRC/dsygv_2stage.c @@ -743,7 +743,7 @@ stage.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsygv_2stage_(integer *itype, char *jobz, char *uplo, +/* Subroutine */ void dsygv_2stage_(integer *itype, char *jobz, char *uplo, integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *w, doublereal *work, integer *lwork, integer *info) { @@ -754,26 +754,27 @@ stage.f"> */ integer neig; extern integer ilaenv2stage_(integer *, char *, char *, integer *, integer *, integer *, integer *); - extern /* Subroutine */ int dsyev_2stage_(char *, char *, integer *, + extern /* Subroutine */ void dsyev_2stage_(char *, char *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); extern logical lsame_(char *, char *); integer lhtrd; - extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrmm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer lwmin; char trans[1]; - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, 
char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical upper; integer lwtrd; logical wantz; integer ib, kd; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dpotrf_( - char *, integer *, doublereal *, integer *, integer *), - dsygst_(integer *, char *, integer *, doublereal *, integer *, + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int dpotrf_( + char *, integer *, doublereal *, integer *, integer *); + extern void dsygst_(integer *, char *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); logical lquery; @@ -839,15 +840,15 @@ stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYGV_2STAGE ", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. */ @@ -855,7 +856,7 @@ stage.f"> */ dpotrf_(uplo, n, &b[b_offset], ldb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. */ @@ -903,7 +904,7 @@ stage.f"> */ } work[1] = (doublereal) lwmin; - return 0; + return; /* End of DSYGV_2STAGE */ diff --git a/lapack-netlib/SRC/dsygvd.c b/lapack-netlib/SRC/dsygvd.c index d839ccb683..828399c971 100644 --- a/lapack-netlib/SRC/dsygvd.c +++ b/lapack-netlib/SRC/dsygvd.c @@ -739,7 +739,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. 
of Kentucky, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dsygvd_(integer *itype, char *jobz, char *uplo, integer * +/* Subroutine */ void dsygvd_(integer *itype, char *jobz, char *uplo, integer * n, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *w, doublereal *work, integer *lwork, integer *iwork, integer *liwork, integer *info) @@ -751,20 +751,21 @@ f"> */ /* Local variables */ integer lopt; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrmm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer lwmin; char trans[1]; integer liopt; - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical upper, wantz; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dpotrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int dpotrf_( char *, integer *, doublereal *, integer *, integer *); integer liwmin; - extern /* Subroutine */ int dsyevd_(char *, char *, integer *, doublereal + extern /* Subroutine */ void dsyevd_(char *, char *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *, integer *, integer *), dsygst_(integer *, char *, integer *, doublereal *, integer *, doublereal *, integer *, @@ -842,15 +843,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYGVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. 
*/ @@ -858,7 +859,7 @@ f"> */ dpotrf_(uplo, n, &b[b_offset], ldb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. */ @@ -910,7 +911,7 @@ f"> */ work[1] = (doublereal) lopt; iwork[1] = liopt; - return 0; + return; /* End of DSYGVD */ diff --git a/lapack-netlib/SRC/dsygvd.f b/lapack-netlib/SRC/dsygvd.f index 61134bedce..3b38665a75 100644 --- a/lapack-netlib/SRC/dsygvd.f +++ b/lapack-netlib/SRC/dsygvd.f @@ -330,8 +330,8 @@ SUBROUTINE DSYGVD( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W, WORK, CALL DSYGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) CALL DSYEVD( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, IWORK, LIWORK, $ INFO ) - LOPT = MAX( DBLE( LOPT ), DBLE( WORK( 1 ) ) ) - LIOPT = MAX( DBLE( LIOPT ), DBLE( IWORK( 1 ) ) ) + LOPT = INT( MAX( DBLE( LOPT ), DBLE( WORK( 1 ) ) ) ) + LIOPT = INT( MAX( DBLE( LIOPT ), DBLE( IWORK( 1 ) ) ) ) * IF( WANTZ .AND. INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/dsygvx.c b/lapack-netlib/SRC/dsygvx.c index fdb5ee8a54..e5478eea79 100644 --- a/lapack-netlib/SRC/dsygvx.c +++ b/lapack-netlib/SRC/dsygvx.c @@ -810,7 +810,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. 
of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int dsygvx_(integer *itype, char *jobz, char *range, char * +/* Subroutine */ void dsygvx_(integer *itype, char *jobz, char *range, char * uplo, integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *vl, doublereal *vu, integer *il, integer *iu, doublereal *abstol, integer *m, doublereal *w, doublereal *z__, @@ -822,11 +822,11 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrmm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); char trans[1]; - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical upper, wantz; @@ -838,11 +838,11 @@ f"> */ extern /* Subroutine */ int dpotrf_(char *, integer *, doublereal *, integer *, integer *); integer lwkmin; - extern /* Subroutine */ int dsygst_(integer *, char *, integer *, + extern /* Subroutine */ void dsygst_(integer *, char *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int dsyevx_(char *, char *, char *, integer *, + extern /* Subroutine */ void dsyevx_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, integer @@ -936,16 +936,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYGVX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } /* Form a 
Cholesky factorization of B. */ @@ -953,7 +953,7 @@ f"> */ dpotrf_(uplo, n, &b[b_offset], ldb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. */ @@ -1004,7 +1004,7 @@ f"> */ work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DSYGVX */ diff --git a/lapack-netlib/SRC/dsyrfs.c b/lapack-netlib/SRC/dsyrfs.c index af90ab7226..2503d5f241 100644 --- a/lapack-netlib/SRC/dsyrfs.c +++ b/lapack-netlib/SRC/dsyrfs.c @@ -705,7 +705,7 @@ f"> */ /* > \ingroup doubleSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsyrfs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void dsyrfs_(char *uplo, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal *af, integer *ldaf, integer * ipiv, doublereal *b, integer *ldb, doublereal *x, integer *ldx, doublereal *ferr, doublereal *berr, doublereal *work, integer *iwork, @@ -723,12 +723,12 @@ f"> */ doublereal s; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer count; logical upper; - extern /* Subroutine */ int dsymv_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsymv_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); @@ -738,7 +738,7 @@ f"> */ doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal lstres; - extern /* Subroutine */ int dsytrs_(char *, integer *, integer *, + extern /* Subroutine */ void dsytrs_(char *, integer *, integer *, doublereal *, integer *, integer 
*, doublereal *, integer *, integer *); doublereal eps; @@ -795,7 +795,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -807,7 +807,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -1004,7 +1004,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of DSYRFS */ diff --git a/lapack-netlib/SRC/dsyrfsx.c b/lapack-netlib/SRC/dsyrfsx.c index d24dfc537d..05218fe35f 100644 --- a/lapack-netlib/SRC/dsyrfsx.c +++ b/lapack-netlib/SRC/dsyrfsx.c @@ -811,7 +811,7 @@ static integer c__1 = 1; /* > \ingroup doubleSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsyrfsx_(char *uplo, char *equed, integer *n, integer * +/* Subroutine */ void dsyrfsx_(char *uplo, char *equed, integer *n, integer * nrhs, doublereal *a, integer *lda, doublereal *af, integer *ldaf, integer *ipiv, doublereal *s, doublereal *b, integer *ldb, doublereal *x, integer *ldx, doublereal *rcond, doublereal *berr, integer * @@ -827,7 +827,7 @@ static integer c__1 = 1; /* Local variables */ doublereal illrcond_thresh__, unstable_thresh__; - extern /* Subroutine */ int dla_syrfsx_extended_(integer *, char *, + extern /* Subroutine */ void dla_syrfsx_extended_(integer *, char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, logical *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, @@ -848,7 +848,7 @@ static integer c__1 = 1; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern doublereal dlansy_(char *, char *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dsycon_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsycon_(char *, integer *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, 
integer *); extern integer ilaprec_(char *); @@ -966,7 +966,7 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("DSYRFSX", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible. */ @@ -989,7 +989,7 @@ static integer c__1 = 1; err_bnds_comp__[j + err_bnds_comp_dim1 * 3] = 1.; } } - return 0; + return; } /* Default to failure. */ @@ -1130,7 +1130,7 @@ static integer c__1 = 1; } } - return 0; + return; /* End of DSYRFSX */ diff --git a/lapack-netlib/SRC/dsysv.c b/lapack-netlib/SRC/dsysv.c index aea218b670..802c9c4ccf 100644 --- a/lapack-netlib/SRC/dsysv.c +++ b/lapack-netlib/SRC/dsysv.c @@ -683,7 +683,7 @@ static integer c_n1 = -1; /* > \ingroup doubleSYsolve */ /* ===================================================================== */ -/* Subroutine */ int dsysv_(char *uplo, integer *n, integer *nrhs, doublereal +/* Subroutine */ void dsysv_(char *uplo, integer *n, integer *nrhs, doublereal *a, integer *lda, integer *ipiv, doublereal *b, integer *ldb, doublereal *work, integer *lwork, integer *info) { @@ -692,12 +692,13 @@ static integer c_n1 = -1; /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dsytrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dsytrf_( char *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int dsytrs_(char *, integer *, integer *, + extern /* Subroutine */ void dsytrs_(char *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dsytrs2_(char *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, @@ -756,9 +757,9 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("DSYSV ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = U*D*U**T or A = 
L*D*L**T. */ @@ -788,7 +789,7 @@ static integer c_n1 = -1; work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DSYSV */ diff --git a/lapack-netlib/SRC/dsysv.f b/lapack-netlib/SRC/dsysv.f index a6305e13c3..ed6629ad95 100644 --- a/lapack-netlib/SRC/dsysv.f +++ b/lapack-netlib/SRC/dsysv.f @@ -223,7 +223,7 @@ SUBROUTINE DSYSV( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, LWKOPT = 1 ELSE CALL DSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/dsysv_aa.c b/lapack-netlib/SRC/dsysv_aa.c index 64e89c82bf..e01611764f 100644 --- a/lapack-netlib/SRC/dsysv_aa.c +++ b/lapack-netlib/SRC/dsysv_aa.c @@ -674,7 +674,7 @@ a.f"> */ /* > \ingroup doubleSYsolve */ /* ===================================================================== */ -/* Subroutine */ int dsysv_aa_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void dsysv_aa_(char *uplo, integer *n, integer *nrhs, doublereal *a, integer *lda, integer *ipiv, doublereal *b, integer * ldb, doublereal *work, integer *lwork, integer *info) { @@ -684,11 +684,12 @@ a.f"> */ /* Local variables */ extern logical lsame_(char *, char *); integer lwkopt_sytrf__, lwkopt_sytrs__; - extern /* Subroutine */ int dsytrf_aa_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsytrf_aa_(char *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dsytrs_aa_(char *, integer *, integer *, doublereal *, integer * , integer *, doublereal *, integer *, doublereal *, integer *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); integer lwkopt; logical lquery; @@ -749,9 +750,9 @@ a.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYSV_AA ", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = U**T*T*U or A = L*T*L**T. 
*/ @@ -768,7 +769,7 @@ a.f"> */ work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DSYSV_AA */ diff --git a/lapack-netlib/SRC/dsysv_aa_2stage.c b/lapack-netlib/SRC/dsysv_aa_2stage.c index 2b5177832d..16e3dcbc89 100644 --- a/lapack-netlib/SRC/dsysv_aa_2stage.c +++ b/lapack-netlib/SRC/dsysv_aa_2stage.c @@ -700,7 +700,7 @@ a_2stage.f"> */ /* > \ingroup doubleSYsolve */ /* ===================================================================== */ -/* Subroutine */ int dsysv_aa_2stage_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void dsysv_aa_2stage_(char *uplo, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal *tb, integer *ltb, integer * ipiv, integer *ipiv2, doublereal *b, integer *ldb, doublereal *work, integer *lwork, integer *info) @@ -709,7 +709,7 @@ a_2stage.f"> */ integer a_dim1, a_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int dsytrf_aa_2stage_(char *, integer *, + extern /* Subroutine */ void dsytrf_aa_2stage_(char *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, doublereal *, integer *, integer *), dsytrs_aa_2stage_(char *, integer *, integer *, doublereal *, @@ -776,9 +776,9 @@ a_2stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYSV_AA_2STAGE", &i__1, (ftnlen)15); - return 0; + return; } else if (wquery || tquery) { - return 0; + return; } @@ -797,7 +797,7 @@ a_2stage.f"> */ work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DSYSV_AA_2STAGE */ diff --git a/lapack-netlib/SRC/dsysv_rk.c b/lapack-netlib/SRC/dsysv_rk.c index ff765dafcb..76c3fd48b5 100644 --- a/lapack-netlib/SRC/dsysv_rk.c +++ b/lapack-netlib/SRC/dsysv_rk.c @@ -740,7 +740,7 @@ k.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsysv_rk_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void dsysv_rk_(char *uplo, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal 
*e, integer *ipiv, doublereal *b, integer *ldb, doublereal *work, integer *lwork, integer *info) { @@ -748,13 +748,14 @@ k.f"> */ integer a_dim1, a_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int dsytrs_3_(char *, integer *, integer *, + extern /* Subroutine */ void dsytrs_3_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dsytrf_rk_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsytrf_rk_(char *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); integer lwkopt; logical lquery; @@ -812,9 +813,9 @@ k.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYSV_RK ", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = P*U*D*(U**T)*(P**T) or */ @@ -834,7 +835,7 @@ k.f"> */ work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DSYSV_RK */ diff --git a/lapack-netlib/SRC/dsysv_rk.f b/lapack-netlib/SRC/dsysv_rk.f index 05d8f7d3ff..db8fd36ddd 100644 --- a/lapack-netlib/SRC/dsysv_rk.f +++ b/lapack-netlib/SRC/dsysv_rk.f @@ -280,7 +280,7 @@ SUBROUTINE DSYSV_RK( UPLO, N, NRHS, A, LDA, E, IPIV, B, LDB, LWKOPT = 1 ELSE CALL DSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/dsysv_rook.c b/lapack-netlib/SRC/dsysv_rook.c index c63bc088b3..7d813426f0 100644 --- a/lapack-netlib/SRC/dsysv_rook.c +++ b/lapack-netlib/SRC/dsysv_rook.c @@ -717,7 +717,7 @@ ook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsysv_rook_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void 
dsysv_rook_(char *uplo, integer *n, integer *nrhs, doublereal *a, integer *lda, integer *ipiv, doublereal *b, integer * ldb, doublereal *work, integer *lwork, integer *info) { @@ -725,7 +725,7 @@ ook.f"> */ integer a_dim1, a_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int dsytrf_rook_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsytrf_rook_(char *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *) , dsytrs_rook_(char *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); @@ -787,9 +787,9 @@ ook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYSV_ROOK ", &i__1, (ftnlen)11); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = U*D*U**T or A = L*D*L**T. */ @@ -808,7 +808,7 @@ ook.f"> */ work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DSYSV_ROOK */ diff --git a/lapack-netlib/SRC/dsysv_rook.f b/lapack-netlib/SRC/dsysv_rook.f index 6ebb52eae4..85f2933092 100644 --- a/lapack-netlib/SRC/dsysv_rook.f +++ b/lapack-netlib/SRC/dsysv_rook.f @@ -256,7 +256,7 @@ SUBROUTINE DSYSV_ROOK( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, LWKOPT = 1 ELSE CALL DSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/dsysvx.c b/lapack-netlib/SRC/dsysvx.c index 0f91ca5885..da5c5ebe35 100644 --- a/lapack-netlib/SRC/dsysvx.c +++ b/lapack-netlib/SRC/dsysvx.c @@ -796,7 +796,7 @@ f"> */ /* > \ingroup doubleSYsolve */ /* ===================================================================== */ -/* Subroutine */ int dsysvx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void dsysvx_(char *fact, char *uplo, integer *n, integer * nrhs, doublereal *a, integer *lda, doublereal *af, integer *ldaf, integer *ipiv, doublereal *b, integer *ldb, doublereal *x, integer * ldx, doublereal *rcond, 
doublereal *ferr, doublereal *berr, @@ -812,14 +812,14 @@ f"> */ integer nb; extern doublereal dlamch_(char *); logical nofact; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); extern doublereal dlansy_(char *, char *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dsycon_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsycon_(char *, integer *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *), dsyrfs_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, @@ -829,7 +829,7 @@ f"> */ doublereal *, integer *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int dsytrs_(char *, integer *, integer *, + extern /* Subroutine */ void dsytrs_(char *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); @@ -910,9 +910,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYSVX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } if (nofact) { @@ -927,7 +927,7 @@ f"> */ if (*info > 0) { *rcond = 0.; - return 0; + return; } } @@ -961,7 +961,7 @@ f"> */ work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DSYSVX */ diff --git a/lapack-netlib/SRC/dsysvxx.c b/lapack-netlib/SRC/dsysvxx.c index 01fc0e7d4d..66978788a3 100644 --- a/lapack-netlib/SRC/dsysvxx.c +++ b/lapack-netlib/SRC/dsysvxx.c @@ -1011,7 +1011,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doubleSYsolve */ /* 
===================================================================== */ -/* Subroutine */ int dsysvxx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void dsysvxx_(char *fact, char *uplo, integer *n, integer * nrhs, doublereal *a, integer *lda, doublereal *af, integer *ldaf, integer *ipiv, char *equed, doublereal *s, doublereal *b, integer * ldb, doublereal *x, integer *ldx, doublereal *rcond, doublereal * @@ -1035,15 +1035,15 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ logical equil, rcequ; extern doublereal dlamch_(char *); logical nofact; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal bignum; integer infequ; - extern /* Subroutine */ int dlaqsy_(char *, integer *, doublereal *, + extern /* Subroutine */ void dlaqsy_(char *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, char *); doublereal smlnum; - extern /* Subroutine */ int dsytrf_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsytrf_(char *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dlascl2_(integer *, integer *, doublereal *, doublereal *, integer *), dsytrs_(char *, integer *, integer *, doublereal *, @@ -1163,7 +1163,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("DSYSVXX", &i__1, (ftnlen)7); - return 0; + return; } if (equil) { @@ -1208,7 +1208,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rpvgrw = dla_syrpvgrw_(uplo, n, info, &a[a_offset], lda, & af[af_offset], ldaf, &ipiv[1], &work[1]); } - return 0; + return; } } @@ -1240,7 +1240,7 @@ static 
inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ dlascl2_(n, nrhs, &s[1], &x[x_offset], ldx); } - return 0; + return; /* End of DSYSVXX */ diff --git a/lapack-netlib/SRC/dsyswapr.c b/lapack-netlib/SRC/dsyswapr.c index 55c158c977..e239f7858f 100644 --- a/lapack-netlib/SRC/dsyswapr.c +++ b/lapack-netlib/SRC/dsyswapr.c @@ -616,7 +616,7 @@ r.f"> */ /* > \ingroup doubleSYauxiliary */ /* ===================================================================== */ -/* Subroutine */ int dsyswapr_(char *uplo, integer *n, doublereal *a, integer +/* Subroutine */ void dsyswapr_(char *uplo, integer *n, doublereal *a, integer *lda, integer *i1, integer *i2) { /* System generated locals */ @@ -625,7 +625,7 @@ r.f"> */ /* Local variables */ integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); logical upper; doublereal tmp; @@ -712,6 +712,6 @@ r.f"> */ } } - return 0; + return; } /* dsyswapr_ */ diff --git a/lapack-netlib/SRC/dsyswapr.f b/lapack-netlib/SRC/dsyswapr.f index c60ccbefc3..93f6195f22 100644 --- a/lapack-netlib/SRC/dsyswapr.f +++ b/lapack-netlib/SRC/dsyswapr.f @@ -57,16 +57,14 @@ *> *> \param[in,out] A *> \verbatim -*> A is DOUBLE PRECISION array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers -*> used to obtain the factor U or L as computed by DSYTRF. -*> -*> On exit, if INFO = 0, the (symmetric) inverse of the original -*> matrix. If UPLO = 'U', the upper triangular part of the -*> inverse is formed and the part of A below the diagonal is not -*> referenced; if UPLO = 'L' the lower triangular part of the -*> inverse is formed and the part of A above the diagonal is -*> not referenced. +*> A is DOUBLE PRECISION array, dimension (LDA,*) +*> On entry, the N-by-N matrix A. 
On exit, the permuted matrix +*> where the rows I1 and I2 and columns I1 and I2 are interchanged. +*> If UPLO = 'U', the interchanges are applied to the upper +*> triangular part and the strictly lower triangular part of A is +*> not referenced; if UPLO = 'L', the interchanges are applied to +*> the lower triangular part and the part of A above the diagonal +*> is not referenced. *> \endverbatim *> *> \param[in] LDA @@ -109,14 +107,13 @@ SUBROUTINE DSYSWAPR( UPLO, N, A, LDA, I1, I2) INTEGER I1, I2, LDA, N * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, N ) + DOUBLE PRECISION A( LDA, * ) * * ===================================================================== * * .. * .. Local Scalars .. LOGICAL UPPER - INTEGER I DOUBLE PRECISION TMP * * .. External Functions .. @@ -143,19 +140,12 @@ SUBROUTINE DSYSWAPR( UPLO, N, A, LDA, I1, I2) A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1,I1+I) - A(I1,I1+I)=A(I1+I,I2) - A(I1+I,I2)=TMP - END DO + CALL DSWAP( I2-I1-1, A(I1,I1+1), LDA, A(I1+1,I2), 1 ) * * third swap * - swap row I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I1,I) - A(I1,I)=A(I2,I) - A(I2,I)=TMP - END DO + IF ( I2.LT.N ) + $ CALL DSWAP( N-I2, A(I1,I2+1), LDA, A(I2,I2+1), LDA ) * ELSE * @@ -171,19 +161,12 @@ SUBROUTINE DSYSWAPR( UPLO, N, A, LDA, I1, I2) A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1+I,I1) - A(I1+I,I1)=A(I2,I1+I) - A(I2,I1+I)=TMP - END DO + CALL DSWAP( I2-I1-1, A(I1+1,I1), 1, A(I2,I1+1), LDA ) * * third swap * - swap col I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I,I1) - A(I,I1)=A(I,I2) - A(I,I2)=TMP - END DO + IF ( I2.LT.N ) + $ CALL DSWAP( N-I2, A(I2+1,I1), 1, A(I2+1,I2), 1 ) * ENDIF END SUBROUTINE DSYSWAPR diff --git a/lapack-netlib/SRC/dsytd2.c b/lapack-netlib/SRC/dsytd2.c index a717bfd7e3..48c357648a 100644 --- a/lapack-netlib/SRC/dsytd2.c +++ b/lapack-netlib/SRC/dsytd2.c @@ -689,7 +689,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* 
Subroutine */ int dsytd2_(char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ void dsytd2_(char *uplo, integer *n, doublereal *a, integer * lda, doublereal *d__, doublereal *e, doublereal *tau, integer *info) { /* System generated locals */ @@ -699,19 +699,20 @@ f"> */ extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *, integer *); doublereal taui; - extern /* Subroutine */ int dsyr2_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsyr2_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer i__; doublereal alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical upper; - extern /* Subroutine */ int dsymv_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsymv_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dlarfg_(integer *, doublereal *, - doublereal *, integer *, doublereal *), xerbla_(char *, integer *, ftnlen + doublereal *, integer *, doublereal *); + extern int xerbla_(char *, integer *, ftnlen ); @@ -747,13 +748,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYTD2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } if (upper) { @@ -856,7 +857,7 @@ f"> */ d__[*n] = a[*n + *n * a_dim1]; } - return 0; + return; /* End of DSYTD2 */ diff --git a/lapack-netlib/SRC/dsytf2.c b/lapack-netlib/SRC/dsytf2.c index e30a6d498e..421bc62d7e 100644 --- a/lapack-netlib/SRC/dsytf2.c +++ b/lapack-netlib/SRC/dsytf2.c @@ -708,7 +708,7 @@ f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsytf2_(char *uplo, integer *n, doublereal *a, integer * +/* 
Subroutine */ void dsytf2_(char *uplo, integer *n, doublereal *a, integer * lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -717,14 +717,14 @@ f"> */ /* Local variables */ integer imax, jmax; - extern /* Subroutine */ int dsyr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsyr_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer i__, j, k; doublereal t, alpha; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); integer kstep; logical upper; @@ -767,7 +767,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYTF2", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. 
*/ @@ -1168,7 +1168,7 @@ f"> */ L70: - return 0; + return; /* End of DSYTF2 */ diff --git a/lapack-netlib/SRC/dsytf2_rk.c b/lapack-netlib/SRC/dsytf2_rk.c index adfbdca98d..0cf7a77264 100644 --- a/lapack-netlib/SRC/dsytf2_rk.c +++ b/lapack-netlib/SRC/dsytf2_rk.c @@ -755,7 +755,7 @@ rk.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsytf2_rk_(char *uplo, integer *n, doublereal *a, +/* Subroutine */ void dsytf2_rk_(char *uplo, integer *n, doublereal *a, integer *lda, doublereal *e, integer *ipiv, integer *info) { /* System generated locals */ @@ -765,16 +765,16 @@ rk.f"> */ /* Local variables */ logical done; integer imax, jmax; - extern /* Subroutine */ int dsyr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsyr_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer i__, j, k, p; doublereal t, alpha; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); doublereal dtemp, sfmin; integer itemp; - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); integer kstep; logical upper; @@ -819,7 +819,7 @@ rk.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYTF2_RK", &i__1, (ftnlen)9); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. 
*/ @@ -1534,7 +1534,7 @@ rk.f"> */ ; } - return 0; + return; /* End of DSYTF2_RK */ diff --git a/lapack-netlib/SRC/dsytf2_rook.c b/lapack-netlib/SRC/dsytf2_rook.c index a549180ef1..a6e93428d6 100644 --- a/lapack-netlib/SRC/dsytf2_rook.c +++ b/lapack-netlib/SRC/dsytf2_rook.c @@ -708,7 +708,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsytf2_rook_(char *uplo, integer *n, doublereal *a, +/* Subroutine */ void dsytf2_rook_(char *uplo, integer *n, doublereal *a, integer *lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -718,16 +718,16 @@ rook.f"> */ /* Local variables */ logical done; integer imax, jmax; - extern /* Subroutine */ int dsyr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsyr_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer i__, j, k, p; doublereal t, alpha; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); doublereal dtemp, sfmin; integer itemp; - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); integer kstep; logical upper; @@ -771,7 +771,7 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYTF2_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. 
*/ @@ -1390,7 +1390,7 @@ rook.f"> */ L70: - return 0; + return; /* End of DSYTF2_ROOK */ diff --git a/lapack-netlib/SRC/dsytrd.c b/lapack-netlib/SRC/dsytrd.c index a4eca3bbb9..da0310281c 100644 --- a/lapack-netlib/SRC/dsytrd.c +++ b/lapack-netlib/SRC/dsytrd.c @@ -710,7 +710,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dsytrd_(char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ void dsytrd_(char *uplo, integer *n, doublereal *a, integer * lda, doublereal *d__, doublereal *e, doublereal *tau, doublereal * work, integer *lwork, integer *info) { @@ -722,14 +722,15 @@ f"> */ extern logical lsame_(char *, char *); integer nbmin, iinfo; logical upper; - extern /* Subroutine */ int dsytd2_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsytd2_(char *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *), dsyr2k_(char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer nb, kk, nx; - extern /* Subroutine */ int dlatrd_(char *, integer *, integer *, + extern /* Subroutine */ void dlatrd_(char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; @@ -784,16 +785,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYTRD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { work[1] = 1.; - return 0; + return; } nx = *n; @@ -920,7 +921,7 @@ f"> */ } work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DSYTRD */ diff --git 
a/lapack-netlib/SRC/dsytrd_2stage.c b/lapack-netlib/SRC/dsytrd_2stage.c index eaa12cf6ae..100864b635 100644 --- a/lapack-netlib/SRC/dsytrd_2stage.c +++ b/lapack-netlib/SRC/dsytrd_2stage.c @@ -740,7 +740,7 @@ static integer c__4 = 4; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dsytrd_2stage_(char *vect, char *uplo, integer *n, +/* Subroutine */ void dsytrd_2stage_(char *vect, char *uplo, integer *n, doublereal *a, integer *lda, doublereal *d__, doublereal *e, doublereal *tau, doublereal *hous2, integer *lhous2, doublereal *work, integer *lwork, integer *info) @@ -753,12 +753,12 @@ static integer c__4 = 4; extern integer ilaenv2stage_(integer *, char *, char *, integer *, integer *, integer *, integer *); integer lwrk; - extern /* Subroutine */ int dsytrd_sb2st_(char *, char *, char *, + extern /* Subroutine */ void dsytrd_sb2st_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer wpos; - extern /* Subroutine */ int dsytrd_sy2sb_(char *, integer *, integer *, + extern /* Subroutine */ void dsytrd_sy2sb_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); extern logical lsame_(char *, char *); @@ -827,16 +827,16 @@ static integer c__4 = 4; if (*info != 0) { i__1 = -(*info); xerbla_("DSYTRD_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { work[1] = 1.; - return 0; + return; } /* Determine pointer position */ @@ -850,20 +850,20 @@ static integer c__4 = 4; if (*info != 0) { i__1 = -(*info); xerbla_("DSYTRD_SY2SB", &i__1, (ftnlen)12); - return 0; + return; } dsytrd_sb2st_("Y", vect, uplo, n, &kd, &work[abpos], &ldab, &d__[1], &e[ 1], &hous2[1], lhous2, &work[wpos], &lwrk, info); if (*info != 0) { i__1 = 
-(*info); xerbla_("DSYTRD_SB2ST", &i__1, (ftnlen)12); - return 0; + return; } hous2[1] = (doublereal) lhmin; work[1] = (doublereal) lwmin; - return 0; + return; /* End of DSYTRD_2STAGE */ diff --git a/lapack-netlib/SRC/dsytrd_sb2st.c b/lapack-netlib/SRC/dsytrd_sb2st.c index 37f1bb4f97..e8f2bf0a55 100644 --- a/lapack-netlib/SRC/dsytrd_sb2st.c +++ b/lapack-netlib/SRC/dsytrd_sb2st.c @@ -746,7 +746,7 @@ sb2st.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dsytrd_sb2st_(char *stage1, char *vect, char *uplo, +/* Subroutine */ void dsytrd_sb2st_(char *stage1, char *vect, char *uplo, integer *n, integer *kd, doublereal *ab, integer *ldab, doublereal * d__, doublereal *e, doublereal *hous, integer *lhous, doublereal * work, integer *lwork, integer *info) @@ -765,17 +765,18 @@ sb2st.f"> */ integer lhmin, sidev, sizea, shift, stind, colpt, lwmin, awpos; logical wantq, upper; integer grsiz, ttype, stepercol, ed, ib, st, abdpos; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *), dlaset_(char *, integer *, integer *, doublereal *, doublereal *, - doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer thgrid; - extern /* Subroutine */ int dsb2st_kernels_(char *, logical *, integer *, + extern /* Subroutine */ void dsb2st_kernels_(char *, logical *, integer *, integer *, integer *, integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *); integer thgrnb, indtau, ofdpos, blklastind; - extern /* Subroutine */ int mecago_(); + extern /* Subroutine */ void mecago_(); logical lquery, afters1; integer lda, tid, ldv, stt, sweepid, nbtiles, sizetau, thgrsiz; @@ -843,9 +844,9 @@ sb2st.f"> */ if (*info != 0) { i__1 = 
-(*info); xerbla_("DSYTRD_SB2ST", &i__1, (ftnlen)12); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -853,7 +854,7 @@ sb2st.f"> */ if (*n == 0) { hous[1] = 1.; work[1] = 1.; - return 0; + return; } /* Determine pointer position */ @@ -906,7 +907,7 @@ sb2st.f"> */ hous[1] = 1.; work[1] = 1.; - return 0; + return; } /* Case KD=1: */ @@ -942,7 +943,7 @@ sb2st.f"> */ hous[1] = 1.; work[1] = 1.; - return 0; + return; } /* Main code start here. */ @@ -1061,7 +1062,7 @@ sb2st.f"> */ hous[1] = (doublereal) lhmin; work[1] = (doublereal) lwmin; - return 0; + return; /* End of DSYTRD_SB2ST */ diff --git a/lapack-netlib/SRC/dsytrd_sy2sb.c b/lapack-netlib/SRC/dsytrd_sy2sb.c index f272d47528..b98cecf64f 100644 --- a/lapack-netlib/SRC/dsytrd_sy2sb.c +++ b/lapack-netlib/SRC/dsytrd_sy2sb.c @@ -761,7 +761,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dsytrd_sy2sb_(char *uplo, integer *n, integer *kd, +/* Subroutine */ void dsytrd_sy2sb_(char *uplo, integer *n, integer *kd, doublereal *a, integer *lda, doublereal *ab, integer *ldab, doublereal *tau, doublereal *work, integer *lwork, integer *info) { @@ -773,30 +773,31 @@ f"> */ extern integer ilaenv2stage_(integer *, char *, char *, integer *, integer *, integer *, integer *); integer tpos, wpos, s1pos, s2pos, i__, j; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer lwmin; - extern /* Subroutine */ int dsymm_(char *, char *, integer *, integer *, 
+ extern /* Subroutine */ void dsymm_(char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); logical upper; - extern /* Subroutine */ int dsyr2k_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dsyr2k_(char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer lk, pk, pn, lt; - extern /* Subroutine */ int dgelqf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgelqf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); integer lw; - extern /* Subroutine */ int dgeqrf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dgeqrf_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *), dlarft_(char *, char *, integer *, integer *, doublereal *, - integer *, doublereal *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen), dlaset_(char *, integer *, + integer *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *); integer ls1; logical lquery; @@ -852,10 +853,10 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYTRD_SY2SB", &i__1, (ftnlen)12); - return 0; + return; } else if (lquery) { work[1] = (doublereal) lwmin; - return 0; + return; } /* Quick return if possible */ @@ -884,7 +885,7 @@ f"> */ } } work[1] = 1.; - return 0; + return; } /* Determine the pointer position for the workspace */ @@ -1076,7 +1077,7 @@ f"> */ } work[1] = (doublereal) lwmin; - return 0; + return; /* End of DSYTRD_SY2SB */ diff --git a/lapack-netlib/SRC/dsytrf.c b/lapack-netlib/SRC/dsytrf.c index 250095d91d..f2078bdb13 100644 --- a/lapack-netlib/SRC/dsytrf.c +++ b/lapack-netlib/SRC/dsytrf.c @@ -697,7 +697,7 @@ f"> */ /* > 
\endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dsytrf_(char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ void dsytrf_(char *uplo, integer *n, doublereal *a, integer * lda, integer *ipiv, doublereal *work, integer *lwork, integer *info) { /* System generated locals */ @@ -708,13 +708,13 @@ f"> */ extern logical lsame_(char *, char *); integer nbmin, iinfo; logical upper; - extern /* Subroutine */ int dsytf2_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsytf2_(char *, integer *, doublereal *, integer *, integer *, integer *); integer kb, nb; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int dlasyf_(char *, integer *, integer *, integer + extern /* Subroutine */ void dlasyf_(char *, integer *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); integer ldwork, lwkopt; @@ -767,9 +767,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYTRF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } nbmin = 2; @@ -896,7 +896,7 @@ f"> */ L40: work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DSYTRF */ diff --git a/lapack-netlib/SRC/dsytrf_aa.c b/lapack-netlib/SRC/dsytrf_aa.c index eafab6749b..1f56e2f7c6 100644 --- a/lapack-netlib/SRC/dsytrf_aa.c +++ b/lapack-netlib/SRC/dsytrf_aa.c @@ -648,7 +648,7 @@ aa.f"> */ /* > \ingroup doubleSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsytrf_aa_(char *uplo, integer *n, doublereal *a, +/* Subroutine */ void dsytrf_aa_(char *uplo, integer *n, doublereal *a, integer *lda, integer *ipiv, doublereal *work, integer *lwork, integer *info) { @@ -658,12 +658,12 @@ aa.f"> */ /* Local variables */ integer j; doublereal 
alpha; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dgemm_(char *, char *, integer *, integer *, integer * , doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dlasyf_aa_(char *, integer *, integer *, + extern /* Subroutine */ void dlasyf_aa_(char *, integer *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *), dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, @@ -730,19 +730,19 @@ aa.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYTRF_AA", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return */ if (*n == 0) { - return 0; + return; } ipiv[1] = 1; if (*n == 1) { - return 0; + return; } /* Adjust block size based on the workspace size */ @@ -1030,7 +1030,7 @@ aa.f"> */ } L20: - return 0; + return; /* End of DSYTRF_AA */ diff --git a/lapack-netlib/SRC/dsytrf_aa_2stage.c b/lapack-netlib/SRC/dsytrf_aa_2stage.c index e66d6b2f3b..24b213c709 100644 --- a/lapack-netlib/SRC/dsytrf_aa_2stage.c +++ b/lapack-netlib/SRC/dsytrf_aa_2stage.c @@ -676,7 +676,7 @@ aa_2stage.f"> */ /* > \ingroup doubleSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsytrf_aa_2stage_(char *uplo, integer *n, doublereal *a, +/* Subroutine */ void dsytrf_aa_2stage_(char *uplo, integer *n, doublereal *a, integer *lda, doublereal *tb, integer *ltb, integer *ipiv, integer * ipiv2, doublereal *work, integer *lwork, integer *info) { @@ -685,12 +685,12 @@ aa_2stage.f"> */ /* Local variables */ integer ldtb, i__, j, k; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, 
integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *), dtrsm_(char *, char *, char *, char * , integer *, integer *, doublereal *, doublereal *, integer *, @@ -698,15 +698,16 @@ aa_2stage.f"> */ integer i1; logical upper; integer i2, jb, kb, nb, td, nt; - extern /* Subroutine */ int dgbtrf_(integer *, integer *, integer *, - integer *, doublereal *, integer *, integer *, integer *), - dgetrf_(integer *, integer *, doublereal *, integer *, integer *, - integer *), dlacpy_(char *, integer *, integer *, doublereal *, - integer *, doublereal *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void dgbtrf_(integer *, integer *, integer *, + integer *, doublereal *, integer *, integer *, integer *); + extern int dgetrf_(integer *, integer *, doublereal *, integer *, integer *, + integer *); + extern void dlacpy_(char *, integer *, integer *, doublereal *, + integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *), dsygst_(integer *, char *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); @@ -755,7 +756,7 @@ aa_2stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYTRF_AA_2STAGE", &i__1, (ftnlen)16); - return 0; + return; } /* Answer the query */ @@ -771,13 +772,13 @@ aa_2stage.f"> */ } } if (tquery || wquery) { - return 0; + return; } /* 
Quick return */ if (*n == 0) { - return 0; + return; } /* Determine the number of the block size */ @@ -1232,7 +1233,7 @@ aa_2stage.f"> */ /* Factor the band matrix */ dgbtrf_(n, n, &nb, &nb, &tb[1], &ldtb, &ipiv2[1], info); - return 0; + return; /* End of DSYTRF_AA_2STAGE */ diff --git a/lapack-netlib/SRC/dsytrf_rk.c b/lapack-netlib/SRC/dsytrf_rk.c index ac483dd72a..dcb9aeefff 100644 --- a/lapack-netlib/SRC/dsytrf_rk.c +++ b/lapack-netlib/SRC/dsytrf_rk.c @@ -774,7 +774,7 @@ rk.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsytrf_rk_(char *uplo, integer *n, doublereal *a, +/* Subroutine */ void dsytrf_rk_(char *uplo, integer *n, doublereal *a, integer *lda, doublereal *e, integer *ipiv, doublereal *work, integer *lwork, integer *info) { @@ -783,14 +783,14 @@ rk.f"> */ /* Local variables */ integer i__, k; - extern /* Subroutine */ int dsytf2_rk_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsytf2_rk_(char *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); extern logical lsame_(char *, char *); integer nbmin, iinfo; - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *); logical upper; - extern /* Subroutine */ int dlasyf_rk_(char *, integer *, integer *, + extern /* Subroutine */ void dlasyf_rk_(char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer kb, nb, ip; @@ -848,9 +848,9 @@ rk.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYTRF_RK", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } nbmin = 2; @@ -1036,7 +1036,7 @@ rk.f"> */ } work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DSYTRF_RK */ diff --git a/lapack-netlib/SRC/dsytrf_rook.c b/lapack-netlib/SRC/dsytrf_rook.c index 
ab01cd076b..57f8e4e64e 100644 --- a/lapack-netlib/SRC/dsytrf_rook.c +++ b/lapack-netlib/SRC/dsytrf_rook.c @@ -723,7 +723,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsytrf_rook_(char *uplo, integer *n, doublereal *a, +/* Subroutine */ void dsytrf_rook_(char *uplo, integer *n, doublereal *a, integer *lda, integer *ipiv, doublereal *work, integer *lwork, integer *info) { @@ -742,7 +742,7 @@ rook.f"> */ integer ldwork, lwkopt; logical lquery; integer iws; - extern /* Subroutine */ int dsytf2_rook_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsytf2_rook_(char *, integer *, doublereal *, integer *, integer *, integer *), dlasyf_rook_(char *, integer *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); @@ -795,9 +795,9 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYTRF_ROOK", &i__1, (ftnlen)11); - return 0; + return; } else if (lquery) { - return 0; + return; } nbmin = 2; @@ -927,7 +927,7 @@ rook.f"> */ L40: work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DSYTRF_ROOK */ diff --git a/lapack-netlib/SRC/dsytri.c b/lapack-netlib/SRC/dsytri.c index e5c6a1c004..0311e9e1eb 100644 --- a/lapack-netlib/SRC/dsytri.c +++ b/lapack-netlib/SRC/dsytri.c @@ -629,7 +629,7 @@ f"> */ /* > \ingroup doubleSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsytri_(char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ void dsytri_(char *uplo, integer *n, doublereal *a, integer * lda, integer *ipiv, doublereal *work, integer *info) { /* System generated locals */ @@ -643,12 +643,12 @@ f"> */ integer k; doublereal t; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), 
dswap_(integer *, doublereal *, integer *, doublereal *, integer *); integer kstep; logical upper; - extern /* Subroutine */ int dsymv_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsymv_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); doublereal ak; @@ -688,13 +688,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -705,7 +705,7 @@ f"> */ for (*info = *n; *info >= 1; --(*info)) { if (ipiv[*info] > 0 && a[*info + *info * a_dim1] == 0.) { - return 0; + return; } /* L10: */ } @@ -716,7 +716,7 @@ f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (ipiv[*info] > 0 && a[*info + *info * a_dim1] == 0.) { - return 0; + return; } /* L20: */ } @@ -941,7 +941,7 @@ f"> */ ; } - return 0; + return; /* End of DSYTRI */ diff --git a/lapack-netlib/SRC/dsytri2.c b/lapack-netlib/SRC/dsytri2.c index 2a05e8ea2a..9f453cc054 100644 --- a/lapack-netlib/SRC/dsytri2.c +++ b/lapack-netlib/SRC/dsytri2.c @@ -641,14 +641,14 @@ static integer c_n1 = -1; /* > \ingroup doubleSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsytri2_(char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ void dsytri2_(char *uplo, integer *n, doublereal *a, integer * lda, integer *ipiv, doublereal *work, integer *lwork, integer *info) { /* System generated locals */ integer a_dim1, a_offset, i__1; /* Local variables */ - extern /* Subroutine */ int dsytri2x_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsytri2x_(char *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); extern logical lsame_(char *, char *); integer nbmax; @@ -656,7 +656,7 @@ static integer c_n1 = -1; extern /* Subroutine */ int 
xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int dsytri_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsytri_(char *, integer *, doublereal *, integer *, integer *, doublereal *, integer *); logical lquery; integer minsize; @@ -709,13 +709,13 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("DSYTRI2", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { work[1] = (doublereal) minsize; - return 0; + return; } if (*n == 0) { - return 0; + return; } if (nbmax >= *n) { dsytri_(uplo, n, &a[a_offset], lda, &ipiv[1], &work[1], info); @@ -723,7 +723,7 @@ static integer c_n1 = -1; dsytri2x_(uplo, n, &a[a_offset], lda, &ipiv[1], &work[1], &nbmax, info); } - return 0; + return; /* End of DSYTRI2 */ diff --git a/lapack-netlib/SRC/dsytri2x.c b/lapack-netlib/SRC/dsytri2x.c index 11e3ffc475..6bab331f06 100644 --- a/lapack-netlib/SRC/dsytri2x.c +++ b/lapack-netlib/SRC/dsytri2x.c @@ -634,7 +634,7 @@ x.f"> */ /* > \ingroup doubleSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsytri2x_(char *uplo, integer *n, doublereal *a, integer +/* Subroutine */ void dsytri2x_(char *uplo, integer *n, doublereal *a, integer *lda, integer *ipiv, doublereal *work, integer *nb, integer *info) { /* System generated locals */ @@ -643,17 +643,17 @@ x.f"> */ /* Local variables */ integer invd; doublereal akkp1; - extern /* Subroutine */ int dsyswapr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsyswapr_(char *, integer *, doublereal *, integer *, integer *, integer *); doublereal d__; integer i__, j, k; doublereal t; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, 
doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrmm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer count; @@ -662,11 +662,12 @@ x.f"> */ integer u11; doublereal u11_i_j__; integer ip; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dtrtri_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int dtrtri_( char *, char *, integer *, doublereal *, integer *, integer *); integer nnb, cut; doublereal akp1; - extern /* Subroutine */ int dsyconv_(char *, char *, integer *, + extern /* Subroutine */ void dsyconv_(char *, char *, integer *, doublereal *, integer *, integer *, doublereal *, integer *); doublereal u01_ip1_j__, u11_ip1_j__; @@ -708,10 +709,10 @@ x.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYTRI2X", &i__1, (ftnlen)8); - return 0; + return; } if (*n == 0) { - return 0; + return; } /* Convert A */ @@ -728,7 +729,7 @@ x.f"> */ for (*info = *n; *info >= 1; --(*info)) { if (ipiv[*info] > 0 && a[*info + *info * a_dim1] == 0.) { - return 0; + return; } } } else { @@ -738,7 +739,7 @@ x.f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (ipiv[*info] > 0 && a[*info + *info * a_dim1] == 0.) 
{ - return 0; + return; } } } @@ -1210,7 +1211,7 @@ x.f"> */ } } - return 0; + return; /* End of DSYTRI2X */ diff --git a/lapack-netlib/SRC/dsytri_3.c b/lapack-netlib/SRC/dsytri_3.c index 716a91169e..8ce325fea9 100644 --- a/lapack-netlib/SRC/dsytri_3.c +++ b/lapack-netlib/SRC/dsytri_3.c @@ -683,7 +683,7 @@ static integer c_n1 = -1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsytri_3_(char *uplo, integer *n, doublereal *a, +/* Subroutine */ void dsytri_3_(char *uplo, integer *n, doublereal *a, integer *lda, doublereal *e, integer *ipiv, doublereal *work, integer *lwork, integer *info) { @@ -692,7 +692,7 @@ static integer c_n1 = -1; /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int dsytri_3x_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsytri_3x_(char *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); logical upper; @@ -749,16 +749,16 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("DSYTRI_3", &i__1, (ftnlen)8); - return 0; + return; } else if (lquery) { work[1] = (doublereal) lwkopt; - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } dsytri_3x_(uplo, n, &a[a_offset], lda, &e[1], &ipiv[1], &work[1], &nb, @@ -766,7 +766,7 @@ static integer c_n1 = -1; work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DSYTRI_3 */ diff --git a/lapack-netlib/SRC/dsytri_3x.c b/lapack-netlib/SRC/dsytri_3x.c index 669af1c68f..1d7a23f9f6 100644 --- a/lapack-netlib/SRC/dsytri_3x.c +++ b/lapack-netlib/SRC/dsytri_3x.c @@ -673,7 +673,7 @@ static doublereal c_b14 = 0.; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsytri_3x_(char *uplo, integer *n, doublereal *a, +/* Subroutine */ void dsytri_3x_(char *uplo, integer *n, doublereal *a, integer *lda, doublereal *e, 
integer *ipiv, doublereal *work, integer *nb, integer *info) { @@ -683,16 +683,16 @@ static doublereal c_b14 = 0.; /* Local variables */ integer invd; doublereal akkp1; - extern /* Subroutine */ int dsyswapr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsyswapr_(char *, integer *, doublereal *, integer *, integer *, integer *); doublereal d__; integer i__, j, k; doublereal t; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrmm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical upper; @@ -745,10 +745,10 @@ static doublereal c_b14 = 0.; if (*info != 0) { i__1 = -(*info); xerbla_("DSYTRI_3X", &i__1, (ftnlen)9); - return 0; + return; } if (*n == 0) { - return 0; + return; } /* Workspace got Non-diag elements of D */ @@ -766,7 +766,7 @@ static doublereal c_b14 = 0.; for (*info = *n; *info >= 1; --(*info)) { if (ipiv[*info] > 0 && a[*info + *info * a_dim1] == 0.) { - return 0; + return; } } } else { @@ -776,7 +776,7 @@ static doublereal c_b14 = 0.; i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (ipiv[*info] > 0 && a[*info + *info * a_dim1] == 0.) 
{ - return 0; + return; } } } @@ -1259,7 +1259,7 @@ static doublereal c_b14 = 0.; } - return 0; + return; /* End of DSYTRI_3X */ diff --git a/lapack-netlib/SRC/dsytri_rook.c b/lapack-netlib/SRC/dsytri_rook.c index df6cce4151..16adecc7d6 100644 --- a/lapack-netlib/SRC/dsytri_rook.c +++ b/lapack-netlib/SRC/dsytri_rook.c @@ -644,7 +644,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsytri_rook_(char *uplo, integer *n, doublereal *a, +/* Subroutine */ void dsytri_rook_(char *uplo, integer *n, doublereal *a, integer *lda, integer *ipiv, doublereal *work, integer *info) { /* System generated locals */ @@ -658,12 +658,12 @@ rook.f"> */ integer k; doublereal t; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); integer kstep; logical upper; - extern /* Subroutine */ int dsymv_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsymv_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); doublereal ak; @@ -703,13 +703,13 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYTRI_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -720,7 +720,7 @@ rook.f"> */ for (*info = *n; *info >= 1; --(*info)) { if (ipiv[*info] > 0 && a[*info + *info * a_dim1] == 0.) { - return 0; + return; } /* L10: */ } @@ -731,7 +731,7 @@ rook.f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (ipiv[*info] > 0 && a[*info + *info * a_dim1] == 0.) 
{ - return 0; + return; } /* L20: */ } @@ -1030,7 +1030,7 @@ rook.f"> */ ; } - return 0; + return; /* End of DSYTRI_ROOK */ diff --git a/lapack-netlib/SRC/dsytrs.c b/lapack-netlib/SRC/dsytrs.c index 922f4c593e..69903ec36f 100644 --- a/lapack-netlib/SRC/dsytrs.c +++ b/lapack-netlib/SRC/dsytrs.c @@ -635,7 +635,7 @@ f"> */ /* > \ingroup doubleSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsytrs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void dsytrs_(char *uplo, integer *n, integer *nrhs, doublereal *a, integer *lda, integer *ipiv, doublereal *b, integer * ldb, integer *info) { @@ -644,16 +644,16 @@ f"> */ doublereal d__1; /* Local variables */ - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); doublereal akm1k; integer j, k; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); doublereal denom; - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); @@ -699,13 +699,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -1003,7 +1003,7 @@ f"> */ ; } - return 0; + return; /* End of DSYTRS */ diff --git a/lapack-netlib/SRC/dsytrs2.c b/lapack-netlib/SRC/dsytrs2.c index a9c03d96dc..62a8f545ae 100644 --- a/lapack-netlib/SRC/dsytrs2.c +++ b/lapack-netlib/SRC/dsytrs2.c @@ -644,7 
+644,7 @@ static doublereal c_b10 = 1.; /* > \ingroup doubleSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsytrs2_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void dsytrs2_(char *uplo, integer *n, integer *nrhs, doublereal *a, integer *lda, integer *ipiv, doublereal *b, integer * ldb, doublereal *work, integer *info) { @@ -655,12 +655,12 @@ static doublereal c_b10 = 1.; /* Local variables */ doublereal akm1k; integer i__, j, k; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); doublereal denom; integer iinfo; - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *), dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); @@ -669,7 +669,7 @@ static doublereal c_b10 = 1.; integer kp; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal akm1, bkm1; - extern /* Subroutine */ int dsyconv_(char *, char *, integer *, + extern /* Subroutine */ void dsyconv_(char *, char *, integer *, doublereal *, integer *, integer *, doublereal *, integer *); @@ -709,13 +709,13 @@ static doublereal c_b10 = 1.; if (*info != 0) { i__1 = -(*info); xerbla_("DSYTRS2", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* Convert A */ @@ -900,7 +900,7 @@ static doublereal c_b10 = 1.; dsyconv_(uplo, "R", n, &a[a_offset], lda, &ipiv[1], &work[1], &iinfo); - return 0; + return; /* End of DSYTRS2 */ diff --git a/lapack-netlib/SRC/dsytrs_3.c b/lapack-netlib/SRC/dsytrs_3.c index dd8198422f..ee48613c4b 100644 --- a/lapack-netlib/SRC/dsytrs_3.c +++ b/lapack-netlib/SRC/dsytrs_3.c @@ -677,7 
+677,7 @@ static doublereal c_b9 = 1.; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsytrs_3_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void dsytrs_3_(char *uplo, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal *e, integer *ipiv, doublereal *b, integer *ldb, integer *info) { @@ -688,11 +688,11 @@ static doublereal c_b9 = 1.; /* Local variables */ doublereal akm1k; integer i__, j, k; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); doublereal denom; - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *), dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); @@ -739,13 +739,13 @@ static doublereal c_b9 = 1.; if (*info != 0) { i__1 = -(*info); xerbla_("DSYTRS_3", &i__1, (ftnlen)8); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -897,7 +897,7 @@ static doublereal c_b9 = 1.; } - return 0; + return; /* End of DSYTRS_3 */ diff --git a/lapack-netlib/SRC/dsytrs_aa.c b/lapack-netlib/SRC/dsytrs_aa.c index ece9814336..bbf2252590 100644 --- a/lapack-netlib/SRC/dsytrs_aa.c +++ b/lapack-netlib/SRC/dsytrs_aa.c @@ -644,7 +644,7 @@ aa.f"> */ /* > \ingroup doubleSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsytrs_aa_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void dsytrs_aa_(char *uplo, integer *n, integer *nrhs, doublereal *a, integer *lda, integer *ipiv, doublereal *b, integer * ldb, doublereal *work, integer *lwork, integer *info) { @@ -654,16 +654,16 @@ aa.f"> */ /* Local 
variables */ integer k; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dswap_(integer *, doublereal *, integer *, doublereal *, integer *), dgtsv_(integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *) , dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical upper; integer kp; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer lwkopt; logical lquery; @@ -712,17 +712,17 @@ aa.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYTRS_AA", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { lwkopt = *n * 3 - 2; work[1] = (doublereal) lwkopt; - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -854,7 +854,7 @@ aa.f"> */ } - return 0; + return; /* End of DSYTRS_AA */ diff --git a/lapack-netlib/SRC/dsytrs_aa_2stage.c b/lapack-netlib/SRC/dsytrs_aa_2stage.c index 5a98a3c60d..3153ef806e 100644 --- a/lapack-netlib/SRC/dsytrs_aa_2stage.c +++ b/lapack-netlib/SRC/dsytrs_aa_2stage.c @@ -653,7 +653,7 @@ aa_2stage.f"> */ /* > \ingroup doubleSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int dsytrs_aa_2stage_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void dsytrs_aa_2stage_(char *uplo, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal *tb, integer *ltb, integer * ipiv, integer *ipiv2, doublereal *b, integer *ldb, integer *info) { @@ -663,15 +663,16 @@ aa_2stage.f"> */ /* Local variables */ integer ldtb; 
extern logical lsame_(char *, char *); - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical upper; integer nb; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dgbtrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dgbtrs_( char *, integer *, integer *, integer *, integer *, doublereal *, - integer *, integer *, doublereal *, integer *, integer *), - dlaswp_(integer *, doublereal *, integer *, integer *, integer *, + integer *, integer *, doublereal *, integer *, integer *); + extern int dlaswp_(integer *, doublereal *, integer *, integer *, integer *, integer *, integer *); @@ -715,13 +716,13 @@ aa_2stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYTRS_AA_2STAGE", &i__1, (ftnlen)16); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* Read NB and compute LDTB */ @@ -806,7 +807,7 @@ aa_2stage.f"> */ } } - return 0; + return; /* End of DSYTRS_AA_2STAGE */ diff --git a/lapack-netlib/SRC/dsytrs_rook.c b/lapack-netlib/SRC/dsytrs_rook.c index 87eae958f6..af9b1f43a2 100644 --- a/lapack-netlib/SRC/dsytrs_rook.c +++ b/lapack-netlib/SRC/dsytrs_rook.c @@ -650,7 +650,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dsytrs_rook_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void dsytrs_rook_(char *uplo, integer *n, integer *nrhs, doublereal *a, integer *lda, integer *ipiv, doublereal *b, integer * ldb, integer *info) { @@ -659,16 +659,16 @@ rook.f"> */ doublereal d__1; /* Local variables */ - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, 
doublereal *, integer *); doublereal akm1k; integer j, k; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); doublereal denom; - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dswap_(integer *, doublereal *, integer *, doublereal *, integer *); @@ -714,13 +714,13 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DSYTRS_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -1046,7 +1046,7 @@ rook.f"> */ ; } - return 0; + return; /* End of DSYTRS_ROOK */ diff --git a/lapack-netlib/SRC/dtbcon.c b/lapack-netlib/SRC/dtbcon.c index 279e546ed8..0c5807e784 100644 --- a/lapack-netlib/SRC/dtbcon.c +++ b/lapack-netlib/SRC/dtbcon.c @@ -655,7 +655,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dtbcon_(char *norm, char *uplo, char *diag, integer *n, +/* Subroutine */ void dtbcon_(char *norm, char *uplo, char *diag, integer *n, integer *kd, doublereal *ab, integer *ldab, doublereal *rcond, doublereal *work, integer *iwork, integer *info) { @@ -668,21 +668,22 @@ f"> */ doublereal scale; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int drscl_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void drscl_(integer *, doublereal *, doublereal *, integer *); doublereal anorm; logical upper; doublereal xnorm; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, 
integer *, integer *); extern doublereal dlamch_(char *); integer ix; extern integer idamax_(integer *, doublereal *, integer *); extern doublereal dlantb_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *); - extern /* Subroutine */ int dlatbs_(char *, char *, char *, char *, + extern /* Subroutine */ void dlatbs_(char *, char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, - doublereal *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; logical onenrm; char normin[1]; @@ -730,14 +731,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTBCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *rcond = 1.; - return 0; + return; } *rcond = 0.; @@ -802,7 +803,7 @@ f"> */ } L20: - return 0; + return; /* End of DTBCON */ diff --git a/lapack-netlib/SRC/dtbrfs.c b/lapack-netlib/SRC/dtbrfs.c index 3817e092d4..19ac8469c1 100644 --- a/lapack-netlib/SRC/dtbrfs.c +++ b/lapack-netlib/SRC/dtbrfs.c @@ -701,7 +701,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dtbrfs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void dtbrfs_(char *uplo, char *trans, char *diag, integer *n, integer *kd, integer *nrhs, doublereal *ab, integer *ldab, doublereal *b, integer *ldb, doublereal *x, integer *ldx, doublereal *ferr, doublereal *berr, doublereal *work, integer *iwork, integer *info) @@ -718,14 +718,14 @@ f"> */ doublereal s; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int dtbmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtbmv_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *), dcopy_(integer *, doublereal *, integer * , doublereal 
*, integer *), dtbsv_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *), daxpy_(integer *, doublereal * , doublereal *, integer *, doublereal *, integer *); logical upper; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *); doublereal xk; @@ -793,7 +793,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTBRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -805,7 +805,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } if (notran) { @@ -1091,7 +1091,7 @@ f"> */ /* L250: */ } - return 0; + return; /* End of DTBRFS */ diff --git a/lapack-netlib/SRC/dtbtrs.c b/lapack-netlib/SRC/dtbtrs.c index 9a36a3b57a..6da24c48f6 100644 --- a/lapack-netlib/SRC/dtbtrs.c +++ b/lapack-netlib/SRC/dtbtrs.c @@ -658,7 +658,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dtbtrs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void dtbtrs_(char *uplo, char *trans, char *diag, integer *n, integer *kd, integer *nrhs, doublereal *ab, integer *ldab, doublereal *b, integer *ldb, integer *info) { @@ -668,7 +668,7 @@ f"> */ /* Local variables */ integer j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dtbsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtbsv_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *); logical upper; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -719,13 +719,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTBTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check for singularity. 
*/ @@ -735,7 +735,7 @@ f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (ab[*kd + 1 + *info * ab_dim1] == 0.) { - return 0; + return; } /* L10: */ } @@ -743,7 +743,7 @@ f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (ab[*info * ab_dim1 + 1] == 0.) { - return 0; + return; } /* L20: */ } @@ -760,7 +760,7 @@ f"> */ /* L30: */ } - return 0; + return; /* End of DTBTRS */ diff --git a/lapack-netlib/SRC/dtfsm.c b/lapack-netlib/SRC/dtfsm.c index 068c9afcab..bc37a7ccce 100644 --- a/lapack-netlib/SRC/dtfsm.c +++ b/lapack-netlib/SRC/dtfsm.c @@ -790,7 +790,7 @@ static doublereal c_b27 = 1.; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dtfsm_(char *transr, char *side, char *uplo, char *trans, +/* Subroutine */ void dtfsm_(char *transr, char *side, char *uplo, char *trans, char *diag, integer *m, integer *n, doublereal *alpha, doublereal *a, doublereal *b, integer *ldb) { @@ -800,13 +800,13 @@ static doublereal c_b27 = 1.; /* Local variables */ integer info, i__, j, k; logical normaltransr; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); logical lside; extern logical lsame_(char *, char *); logical lower; - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer m1, m2, n1, n2; @@ -857,13 +857,13 @@ static doublereal c_b27 = 1.; if (info != 0) { i__1 = -info; xerbla_("DTFSM ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return when ( (N.EQ.0).OR.(M.EQ.0) ) */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Quick return when ALPHA.EQ.(0D+0) */ @@ -878,7 +878,7 @@ static 
doublereal c_b27 = 1.; } /* L20: */ } - return 0; + return; } if (lside) { @@ -1528,7 +1528,7 @@ static doublereal c_b27 = 1.; } } - return 0; + return; /* End of DTFSM */ diff --git a/lapack-netlib/SRC/dtftri.c b/lapack-netlib/SRC/dtftri.c index 64f4989785..86b63defa4 100644 --- a/lapack-netlib/SRC/dtftri.c +++ b/lapack-netlib/SRC/dtftri.c @@ -715,7 +715,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtftri_(char *transr, char *uplo, char *diag, integer *n, +/* Subroutine */ void dtftri_(char *transr, char *uplo, char *diag, integer *n, doublereal *a, integer *info) { /* System generated locals */ @@ -725,7 +725,7 @@ f"> */ integer k; logical normaltransr; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrmm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical lower; @@ -763,13 +763,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTFTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* If N is odd, set NISODD = .TRUE. 
*/ @@ -811,7 +811,7 @@ f"> */ dtrtri_("L", diag, &n1, a, n, info); if (*info > 0) { - return 0; + return; } dtrmm_("R", "L", "N", diag, &n2, &n1, &c_b13, a, n, &a[n1], n); dtrtri_("U", diag, &n2, &a[*n], n, info) @@ -820,7 +820,7 @@ f"> */ *info += n1; } if (*info > 0) { - return 0; + return; } dtrmm_("L", "U", "T", diag, &n2, &n1, &c_b18, &a[*n], n, &a[ n1], n); @@ -834,7 +834,7 @@ f"> */ dtrtri_("L", diag, &n1, &a[n2], n, info) ; if (*info > 0) { - return 0; + return; } dtrmm_("L", "L", "T", diag, &n1, &n2, &c_b13, &a[n2], n, a, n); dtrtri_("U", diag, &n2, &a[n1], n, info) @@ -843,7 +843,7 @@ f"> */ *info += n1; } if (*info > 0) { - return 0; + return; } dtrmm_("R", "U", "N", diag, &n1, &n2, &c_b18, &a[n1], n, a, n); @@ -860,7 +860,7 @@ f"> */ dtrtri_("U", diag, &n1, a, &n1, info); if (*info > 0) { - return 0; + return; } dtrmm_("L", "U", "N", diag, &n1, &n2, &c_b13, a, &n1, &a[n1 * n1], &n1); @@ -869,7 +869,7 @@ f"> */ *info += n1; } if (*info > 0) { - return 0; + return; } dtrmm_("R", "L", "T", diag, &n1, &n2, &c_b18, &a[1], &n1, &a[ n1 * n1], &n1); @@ -881,7 +881,7 @@ f"> */ dtrtri_("U", diag, &n1, &a[n2 * n2], &n2, info); if (*info > 0) { - return 0; + return; } dtrmm_("R", "U", "T", diag, &n2, &n1, &c_b13, &a[n2 * n2], & n2, a, &n2); @@ -890,7 +890,7 @@ f"> */ *info += n1; } if (*info > 0) { - return 0; + return; } dtrmm_("L", "L", "N", diag, &n2, &n1, &c_b18, &a[n1 * n2], & n2, a, &n2); @@ -915,7 +915,7 @@ f"> */ i__1 = *n + 1; dtrtri_("L", diag, &k, &a[1], &i__1, info); if (*info > 0) { - return 0; + return; } i__1 = *n + 1; i__2 = *n + 1; @@ -927,7 +927,7 @@ f"> */ *info += k; } if (*info > 0) { - return 0; + return; } i__1 = *n + 1; i__2 = *n + 1; @@ -944,7 +944,7 @@ f"> */ i__1 = *n + 1; dtrtri_("L", diag, &k, &a[k + 1], &i__1, info); if (*info > 0) { - return 0; + return; } i__1 = *n + 1; i__2 = *n + 1; @@ -956,7 +956,7 @@ f"> */ *info += k; } if (*info > 0) { - return 0; + return; } i__1 = *n + 1; i__2 = *n + 1; @@ -975,7 +975,7 @@ f"> */ dtrtri_("U", 
diag, &k, &a[k], &k, info); if (*info > 0) { - return 0; + return; } dtrmm_("L", "U", "N", diag, &k, &k, &c_b13, &a[k], &k, &a[k * (k + 1)], &k); @@ -984,7 +984,7 @@ f"> */ *info += k; } if (*info > 0) { - return 0; + return; } dtrmm_("R", "L", "T", diag, &k, &k, &c_b18, a, &k, &a[k * (k + 1)], &k) @@ -997,7 +997,7 @@ f"> */ dtrtri_("U", diag, &k, &a[k * (k + 1)], &k, info); if (*info > 0) { - return 0; + return; } dtrmm_("R", "U", "T", diag, &k, &k, &c_b13, &a[k * (k + 1)], & k, a, &k); @@ -1006,7 +1006,7 @@ f"> */ *info += k; } if (*info > 0) { - return 0; + return; } dtrmm_("L", "L", "N", diag, &k, &k, &c_b18, &a[k * k], &k, a, &k); @@ -1014,7 +1014,7 @@ f"> */ } } - return 0; + return; /* End of DTFTRI */ diff --git a/lapack-netlib/SRC/dtfttp.c b/lapack-netlib/SRC/dtfttp.c index 68f32ab0b6..d1dd7891ea 100644 --- a/lapack-netlib/SRC/dtfttp.c +++ b/lapack-netlib/SRC/dtfttp.c @@ -697,7 +697,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtfttp_(char *transr, char *uplo, integer *n, doublereal +/* Subroutine */ void dtfttp_(char *transr, char *uplo, integer *n, doublereal *arf, doublereal *ap, integer *info) { /* System generated locals */ @@ -738,13 +738,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTFTTP", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -753,7 +753,7 @@ f"> */ } else { ap[0] = arf[0]; } - return 0; + return; } /* Size of array ARF(0:NT-1) */ @@ -1057,7 +1057,7 @@ f"> */ } - return 0; + return; /* End of DTFTTP */ diff --git a/lapack-netlib/SRC/dtfttr.c b/lapack-netlib/SRC/dtfttr.c index 6d169bb234..1054894830 100644 --- a/lapack-netlib/SRC/dtfttr.c +++ b/lapack-netlib/SRC/dtfttr.c @@ -706,7 +706,7 @@ f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dtfttr_(char *transr, char *uplo, 
integer *n, doublereal +/* Subroutine */ void dtfttr_(char *transr, char *uplo, integer *n, doublereal *arf, doublereal *a, integer *lda, integer *info) { /* System generated locals */ @@ -755,7 +755,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTFTTR", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -764,7 +764,7 @@ f"> */ if (*n == 1) { a[0] = arf[0]; } - return 0; + return; } /* Size of array ARF(0:nt-1) */ @@ -1034,7 +1034,7 @@ f"> */ } - return 0; + return; /* End of DTFTTR */ diff --git a/lapack-netlib/SRC/dtgevc.c b/lapack-netlib/SRC/dtgevc.c index 3f093a4e6c..66b43dacac 100644 --- a/lapack-netlib/SRC/dtgevc.c +++ b/lapack-netlib/SRC/dtgevc.c @@ -812,7 +812,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtgevc_(char *side, char *howmny, logical *select, +/* Subroutine */ void dtgevc_(char *side, char *howmny, logical *select, integer *n, doublereal *s, integer *lds, doublereal *p, integer *ldp, doublereal *vl, integer *ldvl, doublereal *vr, integer *ldvr, integer *mm, integer *m, doublereal *work, integer *info) @@ -826,7 +826,7 @@ f"> */ integer ibeg, ieig, iend; doublereal dmin__, temp, xmax, sump[4] /* was [2][2] */, sums[4] /* was [2][2] */; - extern /* Subroutine */ int dlag2_(doublereal *, integer *, doublereal *, + extern /* Subroutine */ void dlag2_(doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal cim2a, cim2b, cre2a, cre2b, temp2, bdiag[2]; @@ -836,7 +836,7 @@ f"> */ integer iside; doublereal sbeta; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); logical il2by2; @@ -845,12 +845,12 @@ f"> */ logical 
compl; doublereal anorm, bnorm; logical compr; - extern /* Subroutine */ int dlaln2_(logical *, integer *, integer *, + extern /* Subroutine */ void dlaln2_(logical *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal * , doublereal *, integer *, doublereal *, doublereal *, integer *); doublereal temp2i; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); doublereal temp2r; integer ja; logical ilabad, ilbbad; @@ -865,7 +865,7 @@ f"> */ doublereal bcoefr; integer jw, nw; doublereal salfar, safmin; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *); doublereal xscale, bignum; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -953,7 +953,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTGEVC", &i__1, (ftnlen)6); - return 0; + return; } /* Count the number of eigenvectors to be computed */ @@ -1022,14 +1022,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTGEVC", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *m = im; if (*n == 0) { - return 0; + return; } /* Machine Constants */ @@ -1221,7 +1221,7 @@ f"> */ bcoefi = -bcoefi; if (bcoefi == 0.) { *info = je; - return 0; + return; } /* Scale to avoid over/underflow */ @@ -1632,7 +1632,7 @@ f"> */ temp2, &bcoefi); if (bcoefi == 0.) 
{ *info = je - 1; - return 0; + return; } /* Scale to avoid over/underflow */ @@ -1962,7 +1962,7 @@ f"> */ } } - return 0; + return; /* End of DTGEVC */ diff --git a/lapack-netlib/SRC/dtgex2.c b/lapack-netlib/SRC/dtgex2.c index c8b1361031..f4a077d355 100644 --- a/lapack-netlib/SRC/dtgex2.c +++ b/lapack-netlib/SRC/dtgex2.c @@ -740,7 +740,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtgex2_(logical *wantq, logical *wantz, integer *n, +/* Subroutine */ void dtgex2_(logical *wantq, logical *wantz, integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal * q, integer *ldq, doublereal *z__, integer *ldz, integer *j1, integer * n1, integer *n2, doublereal *work, integer *lwork, integer *info) @@ -755,23 +755,23 @@ f"> */ doublereal ddum; integer idum; doublereal taul[4], dsum; - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); doublereal taur[4], scpy[16] /* was [4][4] */, tcpy[16] /* was [4][4] */, f, g; integer i__, m; doublereal s[16] /* was [4][4] */, t[16] /* was [4][4] */; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal scale, bqra21, brqa21; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); doublereal licop[16] /* was [4][4] */; integer linfo; doublereal ircop[16] /* was [4][4] */, dnorm; integer iwork[4]; - extern /* Subroutine */ int dlagv2_(doublereal *, integer *, doublereal *, + extern /* Subroutine */ void dlagv2_(doublereal *, integer *, doublereal *, integer 
*, doublereal *, doublereal *, doublereal *, doublereal * , doublereal *, doublereal *, doublereal *), dgeqr2_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, @@ -786,7 +786,7 @@ f"> */ integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); doublereal be[2], ai[2]; - extern /* Subroutine */ int dtgsy2_(char *, integer *, integer *, integer + extern /* Subroutine */ void dtgsy2_(char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, @@ -794,7 +794,7 @@ f"> */ doublereal ar[2], sa, sb, li[16] /* was [4][4] */; extern doublereal dlamch_(char *); doublereal dscale, ir[16] /* was [4][4] */, ss, ws; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *), dlartg_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *), dlaset_(char *, integer *, integer *, doublereal *, @@ -836,10 +836,10 @@ f"> */ /* Quick return if possible */ if (*n <= 1 || *n1 <= 0 || *n2 <= 0) { - return 0; + return; } if (*n1 > *n || *j1 + *n1 > *n) { - return 0; + return; } m = *n1 + *n2; /* Computing MAX */ @@ -849,7 +849,7 @@ f"> */ /* Computing MAX */ i__1 = 1, i__2 = *n * m, i__1 = f2cmax(i__1,i__2), i__2 = m * m << 1; work[1] = (doublereal) f2cmax(i__1,i__2); - return 0; + return; } weak = FALSE_; @@ -988,7 +988,7 @@ f"> */ /* Exit with INFO = 0 if swap was successfully performed. */ - return 0; + return; } else { @@ -1286,7 +1286,7 @@ f"> */ /* Exit with INFO = 0 if swap was successfully performed. 
*/ - return 0; + return; } @@ -1295,7 +1295,7 @@ f"> */ L70: *info = 1; - return 0; + return; /* End of DTGEX2 */ diff --git a/lapack-netlib/SRC/dtgexc.c b/lapack-netlib/SRC/dtgexc.c index caa0acfc5c..bc0765a6fb 100644 --- a/lapack-netlib/SRC/dtgexc.c +++ b/lapack-netlib/SRC/dtgexc.c @@ -733,7 +733,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtgexc_(logical *wantq, logical *wantz, integer *n, +/* Subroutine */ void dtgexc_(logical *wantq, logical *wantz, integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal * q, integer *ldq, doublereal *z__, integer *ldz, integer *ifst, integer *ilst, doublereal *work, integer *lwork, integer *info) @@ -744,10 +744,11 @@ f"> */ /* Local variables */ integer here, lwmin; - extern /* Subroutine */ int dtgex2_(logical *, logical *, integer *, + extern /* Subroutine */ void dtgex2_(logical *, logical *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, integer - *, doublereal *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + *, doublereal *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer nbnext; logical lquery; integer nbf, nbl; @@ -814,15 +815,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTGEXC", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n <= 1) { - return 0; + return; } /* Determine the first row of the specified block and find out */ @@ -855,7 +856,7 @@ f"> */ } } if (*ifst == *ilst) { - return 0; + return; } if (*ifst < *ilst) { @@ -890,7 +891,7 @@ f"> */ &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } here += nbnext; @@ -919,7 +920,7 @@ f"> */ nbnext, &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } if (nbnext == 1) { @@ -930,7 
+931,7 @@ f"> */ &c__1, &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } ++here; @@ -950,7 +951,7 @@ f"> */ here, &c__1, &nbnext, &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } here += 2; } else { @@ -962,7 +963,7 @@ f"> */ here, &c__1, &c__1, &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } ++here; dtgex2_(wantq, wantz, n, &a[a_offset], lda, &b[b_offset], @@ -970,7 +971,7 @@ f"> */ here, &c__1, &c__1, &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } ++here; } @@ -1003,7 +1004,7 @@ f"> */ &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } here -= nbnext; @@ -1032,7 +1033,7 @@ f"> */ c__1, &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } if (nbnext == 1) { @@ -1043,7 +1044,7 @@ f"> */ nbnext, &c__1, &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } --here; } else { @@ -1063,7 +1064,7 @@ f"> */ i__1, &c__2, &c__1, &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } here += -2; } else { @@ -1075,7 +1076,7 @@ f"> */ here, &c__1, &c__1, &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } --here; dtgex2_(wantq, wantz, n, &a[a_offset], lda, &b[b_offset], @@ -1083,7 +1084,7 @@ f"> */ here, &c__1, &c__1, &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } --here; } @@ -1095,7 +1096,7 @@ f"> */ } *ilst = here; work[1] = (doublereal) lwmin; - return 0; + return; /* End of DTGEXC */ diff --git a/lapack-netlib/SRC/dtgsen.c b/lapack-netlib/SRC/dtgsen.c index b65a290f69..1d2e658e25 100644 --- a/lapack-netlib/SRC/dtgsen.c +++ b/lapack-netlib/SRC/dtgsen.c @@ -964,7 +964,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtgsen_(integer *ijob, logical *wantq, logical *wantz, +/* Subroutine */ void 
dtgsen_(integer *ijob, logical *wantq, logical *wantz, logical *select, integer *n, doublereal *a, integer *lda, doublereal * b, integer *ldb, doublereal *alphar, doublereal *alphai, doublereal * beta, doublereal *q, integer *ldq, doublereal *z__, integer *ldz, @@ -983,7 +983,7 @@ f"> */ integer ierr; doublereal dsum; logical swap; - extern /* Subroutine */ int dlag2_(doublereal *, integer *, doublereal *, + extern /* Subroutine */ void dlag2_(doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); integer i__, k, isave[3]; @@ -991,7 +991,7 @@ f"> */ integer lwmin; logical wantp; integer n1, n2; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); logical wantd1, wantd2; integer kk; @@ -999,15 +999,16 @@ f"> */ doublereal dscale; integer ks; doublereal rdscal; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen), dtgexc_(logical *, logical *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dtgexc_(logical *, logical *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, doublereal *, integer *, integer *), dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); integer liwmin; - extern /* Subroutine */ int dtgsyl_(char *, integer *, integer *, integer + extern /* Subroutine */ void dtgsyl_(char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, 
doublereal *, @@ -1072,7 +1073,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTGSEN", &i__1, (ftnlen)6); - return 0; + return; } /* Get machine constants */ @@ -1154,9 +1155,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTGSEN", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible. */ @@ -1451,7 +1452,7 @@ f"> */ work[1] = (doublereal) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of DTGSEN */ diff --git a/lapack-netlib/SRC/dtgsja.c b/lapack-netlib/SRC/dtgsja.c index 4902e29f88..158ee2805e 100644 --- a/lapack-netlib/SRC/dtgsja.c +++ b/lapack-netlib/SRC/dtgsja.c @@ -893,7 +893,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtgsja_(char *jobu, char *jobv, char *jobq, integer *m, +/* Subroutine */ void dtgsja_(char *jobu, char *jobv, char *jobq, integer *m, integer *p, integer *n, integer *k, integer *l, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *tola, doublereal *tolb, doublereal *alpha, doublereal *beta, doublereal *u, @@ -906,14 +906,14 @@ f"> */ doublereal d__1; /* Local variables */ - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer kcallmycycle, i__, j; doublereal gamma; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); doublereal a1; logical initq; @@ -922,15 +922,15 @@ f"> */ doublereal b2, b3; logical wantu, wantv; doublereal error, ssmin; - extern /* Subroutine */ int 
dlags2_(logical *, doublereal *, doublereal *, + extern /* Subroutine */ void dlags2_(logical *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *), dlapll_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *), dlartg_( doublereal *, doublereal *, doublereal *, doublereal *, doublereal *), dlaset_(char *, integer *, integer *, doublereal *, - doublereal *, doublereal *, integer *), xerbla_(char *, - integer *, ftnlen); + doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); // extern integer myhuge_(doublereal *); doublereal csq, csu, csv, snq, rwk, snu, snv; @@ -1006,7 +1006,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTGSJA", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize U, V and Q, if necessary */ @@ -1245,7 +1245,7 @@ f"> */ L100: *ncallmycycle = kcallmycycle; - return 0; + return; /* End of DTGSJA */ diff --git a/lapack-netlib/SRC/dtgsna.c b/lapack-netlib/SRC/dtgsna.c index 1470f1f666..010b0a8909 100644 --- a/lapack-netlib/SRC/dtgsna.c +++ b/lapack-netlib/SRC/dtgsna.c @@ -897,7 +897,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtgsna_(char *job, char *howmny, logical *select, +/* Subroutine */ void dtgsna_(char *job, char *howmny, logical *select, integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *vl, integer *ldvl, doublereal *vr, integer *ldvr, doublereal *s, doublereal *dif, integer *mm, integer *m, doublereal * @@ -919,7 +919,7 @@ f"> */ doublereal lnrm; integer ilst; doublereal rnrm; - extern /* Subroutine */ int dlag2_(doublereal *, integer *, doublereal *, + extern /* Subroutine */ void dlag2_(doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); extern 
doublereal dnrm2_(integer *, doublereal *, integer *); @@ -927,7 +927,7 @@ f"> */ integer i__, k; doublereal scale; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); doublereal uhavi, uhbvi, tmpii, c1, c2; @@ -943,15 +943,16 @@ f"> */ doublereal alphai; integer iz; doublereal alphar; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen), dtgexc_(logical *, logical *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dtgexc_(logical *, logical *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, doublereal *, integer *, integer *); logical wantbh, wantdf, somcon; doublereal alprqt; - extern /* Subroutine */ int dtgsyl_(char *, integer *, integer *, integer + extern /* Subroutine */ void dtgsyl_(char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, @@ -1071,15 +1072,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTGSNA", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1289,7 +1290,7 @@ f"> */ ; } work[1] = (doublereal) lwmin; - return 0; + return; /* End of DTGSNA */ diff --git a/lapack-netlib/SRC/dtgsy2.c b/lapack-netlib/SRC/dtgsy2.c index 040e9d1bb5..fc2f271aa1 100644 --- a/lapack-netlib/SRC/dtgsy2.c 
+++ b/lapack-netlib/SRC/dtgsy2.c @@ -789,7 +789,7 @@ f"> */ /* > Umea University, S-901 87 Umea, Sweden. */ /* ===================================================================== */ -/* Subroutine */ int dtgsy2_(char *trans, integer *ijob, integer *m, integer * +/* Subroutine */ void dtgsy2_(char *trans, integer *ijob, integer *m, integer * n, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *c__, integer *ldc, doublereal *d__, integer *ldd, doublereal *e, integer *lde, doublereal *f, integer *ldf, doublereal * @@ -801,18 +801,18 @@ f"> */ d_offset, e_dim1, e_offset, f_dim1, f_offset, i__1, i__2, i__3; /* Local variables */ - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer ierr, zdim, ipiv[8], jpiv[8], i__, j, k, p, q; doublereal alpha; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dgemm_(char *, char *, integer *, integer *, integer * , doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); doublereal z__[64] /* was [8][8] */; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), daxpy_(integer @@ -821,13 +821,13 @@ f"> */ integer *, integer *, doublereal *), dgetc2_(integer *, doublereal *, integer *, integer *, integer *, integer *); integer ie, je, mb, nb, ii, jj, is, js; - extern /* Subroutine */ int dlatdf_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dlatdf_(integer *, integer *, 
doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *); doublereal scaloc; - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, - doublereal *, doublereal *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, + doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical notran; doublereal rhs[8]; integer isp1, jsp1; @@ -900,7 +900,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTGSY2", &i__1, (ftnlen)6); - return 0; + return; } /* Determine block structure of A */ @@ -1704,7 +1704,7 @@ f"> */ } } - return 0; + return; /* End of DTGSY2 */ diff --git a/lapack-netlib/SRC/dtgsyl.c b/lapack-netlib/SRC/dtgsyl.c index 0b5ce9f765..3493e716fa 100644 --- a/lapack-netlib/SRC/dtgsyl.c +++ b/lapack-netlib/SRC/dtgsyl.c @@ -816,7 +816,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtgsyl_(char *trans, integer *ijob, integer *m, integer * +/* Subroutine */ void dtgsyl_(char *trans, integer *ijob, integer *m, integer * n, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal *c__, integer *ldc, doublereal *d__, integer *ldd, doublereal *e, integer *lde, doublereal *f, integer *ldf, doublereal * @@ -831,14 +831,14 @@ f"> */ /* Local variables */ doublereal dsum; integer ppqq, i__, j, k, p, q; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dgemm_(char *, char *, integer *, integer *, integer * , doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); integer ifunc, linfo, lwmin; doublereal scale2; - extern /* Subroutine */ int dtgsy2_(char *, integer *, integer *, integer + extern /* 
Subroutine */ void dtgsy2_(char *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, @@ -847,7 +847,7 @@ f"> */ doublereal dscale; integer is, js, pq; doublereal scaloc; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *), dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *); @@ -949,9 +949,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTGSYL", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -963,7 +963,7 @@ f"> */ *dif = 0.; } } - return 0; + return; } /* Determine optimal block sizes MB and NB */ @@ -1026,7 +1026,7 @@ f"> */ /* L30: */ } - return 0; + return; } /* Determine block structure of A */ @@ -1293,7 +1293,7 @@ f"> */ work[1] = (doublereal) lwmin; - return 0; + return; /* End of DTGSYL */ diff --git a/lapack-netlib/SRC/dtpcon.c b/lapack-netlib/SRC/dtpcon.c index 3032ece514..7beee5211d 100644 --- a/lapack-netlib/SRC/dtpcon.c +++ b/lapack-netlib/SRC/dtpcon.c @@ -642,7 +642,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dtpcon_(char *norm, char *uplo, char *diag, integer *n, +/* Subroutine */ void dtpcon_(char *norm, char *uplo, char *diag, integer *n, doublereal *ap, doublereal *rcond, doublereal *work, integer *iwork, integer *info) { @@ -655,12 +655,12 @@ f"> */ doublereal scale; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int drscl_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void drscl_(integer *, doublereal *, doublereal *, integer *); doublereal anorm; logical upper; 
doublereal xnorm; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *); integer ix; @@ -669,7 +669,7 @@ f"> */ extern doublereal dlantp_(char *, char *, char *, integer *, doublereal *, doublereal *); doublereal ainvnm; - extern /* Subroutine */ int dlatps_(char *, char *, char *, char *, + extern /* Subroutine */ void dlatps_(char *, char *, char *, char *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *); logical onenrm; @@ -712,14 +712,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTPCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *rcond = 1.; - return 0; + return; } *rcond = 0.; @@ -782,7 +782,7 @@ f"> */ } L20: - return 0; + return; /* End of DTPCON */ diff --git a/lapack-netlib/SRC/dtplqt.c b/lapack-netlib/SRC/dtplqt.c index 5b5e0de6aa..0c3d1b1c93 100644 --- a/lapack-netlib/SRC/dtplqt.c +++ b/lapack-netlib/SRC/dtplqt.c @@ -697,7 +697,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtplqt_(integer *m, integer *n, integer *l, integer *mb, +/* Subroutine */ void dtplqt_(integer *m, integer *n, integer *l, integer *mb, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal * t, integer *ldt, doublereal *work, integer *info) { @@ -707,7 +707,8 @@ f"> */ /* Local variables */ integer i__, iinfo, ib, lb, nb; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dtprfb_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dtprfb_( char *, char *, char *, char *, integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, @@ -759,13 +760,13 @@ f"> */ if (*info != 
0) { i__1 = -(*info); xerbla_("DTPLQT", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } i__1 = *m; @@ -799,7 +800,7 @@ f"> */ a_dim1], lda, &b[i__ + ib + b_dim1], ldb, &work[1], &i__4); } } - return 0; + return; /* End of DTPLQT */ diff --git a/lapack-netlib/SRC/dtplqt2.c b/lapack-netlib/SRC/dtplqt2.c index 52b7bc45fc..cac1311db6 100644 --- a/lapack-netlib/SRC/dtplqt2.c +++ b/lapack-netlib/SRC/dtplqt2.c @@ -693,7 +693,7 @@ is composed of a triangular block and a pentagonal block, using the compact WY r /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtplqt2_(integer *m, integer *n, integer *l, doublereal * +/* Subroutine */ void dtplqt2_(integer *m, integer *n, integer *l, doublereal * a, integer *lda, doublereal *b, integer *ldb, doublereal *t, integer * ldt, integer *info) { @@ -702,19 +702,20 @@ is composed of a triangular block and a pentagonal block, using the compact WY r i__3; /* Local variables */ - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer i__, j, p; doublereal alpha; - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dtrmv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *); integer mp, np; - extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *, - integer *, doublereal *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlarfg_(integer *, doublereal *, doublereal *, + integer *, doublereal *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK 
computational routine (version 3.7.1) -- */ @@ -757,13 +758,13 @@ is composed of a triangular block and a pentagonal block, using the compact WY r if (*info != 0) { i__1 = -(*info); xerbla_("DTPLQT2", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *m == 0) { - return 0; + return; } i__1 = *m; @@ -863,6 +864,6 @@ is composed of a triangular block and a pentagonal block, using the compact WY r /* End of DTPLQT2 */ - return 0; + return; } /* dtplqt2_ */ diff --git a/lapack-netlib/SRC/dtpmlqt.c b/lapack-netlib/SRC/dtpmlqt.c index dfa0dad79a..52ee6799ea 100644 --- a/lapack-netlib/SRC/dtpmlqt.c +++ b/lapack-netlib/SRC/dtpmlqt.c @@ -724,7 +724,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtpmlqt_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void dtpmlqt_(char *side, char *trans, integer *m, integer *n, integer *k, integer *l, integer *mb, doublereal *v, integer *ldv, doublereal *t, integer *ldt, doublereal *a, integer *lda, doublereal * b, integer *ldb, doublereal *work, integer *info) @@ -740,7 +740,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern logical lsame_(char *, char *); logical right; integer ib, lb, nb, kf; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dtprfb_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dtprfb_( char *, char *, char *, char *, integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, @@ -812,12 +813,12 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("DTPMLQT", &i__1, (ftnlen)7); - return 0; + return; } if (*m == 0 || *n == 0 || *k == 0) { - 
return 0; + return; } if (left && notran) { @@ -906,7 +907,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of DTPMLQT */ diff --git a/lapack-netlib/SRC/dtpmqrt.c b/lapack-netlib/SRC/dtpmqrt.c index 1f3a511ae6..78b7a4e4f9 100644 --- a/lapack-netlib/SRC/dtpmqrt.c +++ b/lapack-netlib/SRC/dtpmqrt.c @@ -724,7 +724,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtpmqrt_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void dtpmqrt_(char *side, char *trans, integer *m, integer *n, integer *k, integer *l, integer *nb, doublereal *v, integer *ldv, doublereal *t, integer *ldt, doublereal *a, integer *lda, doublereal * b, integer *ldb, doublereal *work, integer *info) @@ -740,7 +740,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern logical lsame_(char *, char *); logical right; integer ib, lb, mb, kf; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dtprfb_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dtprfb_( char *, char *, char *, char *, integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, @@ -814,12 +815,12 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("DTPMQRT", &i__1, (ftnlen)7); - return 0; + return; } if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && tran) { @@ -908,7 +909,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of DTPMQRT */ diff --git a/lapack-netlib/SRC/dtpqrt.c b/lapack-netlib/SRC/dtpqrt.c index ab4ea2367e..3d44f2b166 100644 --- 
a/lapack-netlib/SRC/dtpqrt.c +++ b/lapack-netlib/SRC/dtpqrt.c @@ -697,7 +697,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtpqrt_(integer *m, integer *n, integer *l, integer *nb, +/* Subroutine */ void dtpqrt_(integer *m, integer *n, integer *l, integer *nb, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal * t, integer *ldt, doublereal *work, integer *info) { @@ -707,7 +707,8 @@ f"> */ /* Local variables */ integer i__, iinfo, ib, lb, mb; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dtprfb_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dtprfb_( char *, char *, char *, char *, integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, @@ -759,13 +760,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTPQRT", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } i__1 = *n; @@ -799,7 +800,7 @@ f"> */ , &ib); } } - return 0; + return; /* End of DTPQRT */ diff --git a/lapack-netlib/SRC/dtpqrt2.c b/lapack-netlib/SRC/dtpqrt2.c index dbe0488d4e..d82e0a7d52 100644 --- a/lapack-netlib/SRC/dtpqrt2.c +++ b/lapack-netlib/SRC/dtpqrt2.c @@ -690,7 +690,7 @@ is composed of a triangular block and a pentagonal block, using the compact WY r /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtpqrt2_(integer *m, integer *n, integer *l, doublereal * +/* Subroutine */ void dtpqrt2_(integer *m, integer *n, integer *l, doublereal * a, integer *lda, doublereal *b, integer *ldb, doublereal *t, integer * ldt, integer *info) { @@ -699,19 +699,20 @@ is composed of a triangular block and a pentagonal block, using the compact WY r i__3; /* Local variables */ - extern /* Subroutine 
*/ int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer i__, j, p; doublereal alpha; - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), dtrmv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *); integer mp, np; - extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *, - integer *, doublereal *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlarfg_(integer *, doublereal *, doublereal *, + integer *, doublereal *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -754,13 +755,13 @@ is composed of a triangular block and a pentagonal block, using the compact WY r if (*info != 0) { i__1 = -(*info); xerbla_("DTPQRT2", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *m == 0) { - return 0; + return; } i__1 = *n; @@ -854,6 +855,6 @@ is composed of a triangular block and a pentagonal block, using the compact WY r /* End of DTPQRT2 */ - return 0; + return; } /* dtpqrt2_ */ diff --git a/lapack-netlib/SRC/dtprfb.c b/lapack-netlib/SRC/dtprfb.c index e28313a273..28c152d9e5 100644 --- a/lapack-netlib/SRC/dtprfb.c +++ b/lapack-netlib/SRC/dtprfb.c @@ -766,7 +766,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtprfb_(char *side, char *trans, char *direct, char * +/* Subroutine */ void dtprfb_(char *side, char *trans, char *direct, char * storev, integer *m, integer *n, integer *k, integer *l, doublereal *v, integer *ldv, doublereal *t, integer *ldt, doublereal *a, integer * lda, doublereal 
*b, integer *ldb, doublereal *work, integer *ldwork) @@ -778,12 +778,12 @@ f"> */ /* Local variables */ logical left, backward; integer i__, j; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); logical right; - extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrmm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer kp, mp, np; @@ -820,7 +820,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0 || *k <= 0 || *l < 0) { - return 0; + return; } if (lsame_(storev, "C")) { @@ -1471,7 +1471,7 @@ f"> */ } - return 0; + return; /* End of DTPRFB */ diff --git a/lapack-netlib/SRC/dtprfb.f b/lapack-netlib/SRC/dtprfb.f index a3fc7d6c63..c015075b3f 100644 --- a/lapack-netlib/SRC/dtprfb.f +++ b/lapack-netlib/SRC/dtprfb.f @@ -1,4 +1,4 @@ -*> \brief \b DTPRFB applies a real or complex "triangular-pentagonal" blocked reflector to a real or complex matrix, which is composed of two blocks. +*> \brief \b DTPRFB applies a real "triangular-pentagonal" block reflector to a real matrix, which is composed of two blocks. 
* * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/dtprfs.c b/lapack-netlib/SRC/dtprfs.c index 1cf9487295..3d89639f4e 100644 --- a/lapack-netlib/SRC/dtprfs.c +++ b/lapack-netlib/SRC/dtprfs.c @@ -688,7 +688,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dtprfs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void dtprfs_(char *uplo, char *trans, char *diag, integer *n, integer *nrhs, doublereal *ap, doublereal *b, integer *ldb, doublereal *x, integer *ldx, doublereal *ferr, doublereal *berr, doublereal *work, integer *iwork, integer *info) @@ -704,12 +704,12 @@ f"> */ doublereal s; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), dtpmv_(char *, char *, char *, integer *, doublereal *, doublereal *, integer *); logical upper; - extern /* Subroutine */ int dtpsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtpsv_(char *, char *, char *, integer *, doublereal *, doublereal *, integer *), dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); @@ -774,7 +774,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTPRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -786,7 +786,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } if (notran) { @@ -1061,7 +1061,7 @@ f"> */ /* L250: */ } - return 0; + return; /* End of DTPRFS */ diff --git a/lapack-netlib/SRC/dtptri.c b/lapack-netlib/SRC/dtptri.c index 272a3addde..d2bf675f0b 100644 --- a/lapack-netlib/SRC/dtptri.c +++ b/lapack-netlib/SRC/dtptri.c @@ -630,7 +630,7 @@ f"> */ /* > \endverbatim */ /* > */ /* 
===================================================================== */ -/* Subroutine */ int dtptri_(char *uplo, char *diag, integer *n, doublereal * +/* Subroutine */ void dtptri_(char *uplo, char *diag, integer *n, doublereal * ap, integer *info) { /* System generated locals */ @@ -638,10 +638,10 @@ f"> */ /* Local variables */ integer j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dtpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtpmv_(char *, char *, char *, integer *, doublereal *, doublereal *, integer *); logical upper; integer jc, jj; @@ -679,7 +679,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTPTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Check for singularity if non-unit. */ @@ -691,7 +691,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { jj += *info; if (ap[jj] == 0.) { - return 0; + return; } /* L10: */ } @@ -700,7 +700,7 @@ f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (ap[jj] == 0.) 
{ - return 0; + return; } jj = jj + *n - *info + 1; /* L20: */ @@ -762,7 +762,7 @@ f"> */ } } - return 0; + return; /* End of DTPTRI */ diff --git a/lapack-netlib/SRC/dtptrs.c b/lapack-netlib/SRC/dtptrs.c index 06c0eee9dd..b9485e5337 100644 --- a/lapack-netlib/SRC/dtptrs.c +++ b/lapack-netlib/SRC/dtptrs.c @@ -643,7 +643,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dtptrs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void dtptrs_(char *uplo, char *trans, char *diag, integer *n, integer *nrhs, doublereal *ap, doublereal *b, integer *ldb, integer * info) { @@ -654,7 +654,7 @@ f"> */ integer j; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int dtpsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtpsv_(char *, char *, char *, integer *, doublereal *, doublereal *, integer *); integer jc; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -699,13 +699,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTPTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check for singularity. */ @@ -716,7 +716,7 @@ f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (ap[jc + *info - 1] == 0.) { - return 0; + return; } jc += *info; /* L10: */ @@ -726,7 +726,7 @@ f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (ap[jc] == 0.) 
{ - return 0; + return; } jc = jc + *n - *info + 1; /* L20: */ @@ -743,7 +743,7 @@ f"> */ /* L30: */ } - return 0; + return; /* End of DTPTRS */ diff --git a/lapack-netlib/SRC/dtpttf.c b/lapack-netlib/SRC/dtpttf.c index d7d46c4040..f58742658a 100644 --- a/lapack-netlib/SRC/dtpttf.c +++ b/lapack-netlib/SRC/dtpttf.c @@ -697,7 +697,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtpttf_(char *transr, char *uplo, integer *n, doublereal +/* Subroutine */ void dtpttf_(char *transr, char *uplo, integer *n, doublereal *ap, doublereal *arf, integer *info) { /* System generated locals */ @@ -738,13 +738,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTPTTF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -753,7 +753,7 @@ f"> */ } else { arf[0] = ap[0]; } - return 0; + return; } /* Size of array ARF(0:NT-1) */ @@ -1041,7 +1041,7 @@ f"> */ } - return 0; + return; /* End of DTPTTF */ diff --git a/lapack-netlib/SRC/dtpttr.c b/lapack-netlib/SRC/dtpttr.c index 5c9abeee27..e62a648cf2 100644 --- a/lapack-netlib/SRC/dtpttr.c +++ b/lapack-netlib/SRC/dtpttr.c @@ -614,7 +614,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dtpttr_(char *uplo, integer *n, doublereal *ap, +/* Subroutine */ void dtpttr_(char *uplo, integer *n, doublereal *ap, doublereal *a, integer *lda, integer *info) { /* System generated locals */ @@ -657,7 +657,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTPTTR", &i__1, (ftnlen)6); - return 0; + return; } if (lower) { @@ -683,7 +683,7 @@ f"> */ } - return 0; + return; /* End of DTPTTR */ diff --git a/lapack-netlib/SRC/dtrcon.c b/lapack-netlib/SRC/dtrcon.c index 95742345a6..bda4ee17af 100644 --- a/lapack-netlib/SRC/dtrcon.c +++ b/lapack-netlib/SRC/dtrcon.c @@ -649,7 +649,7 @@ 
f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dtrcon_(char *norm, char *uplo, char *diag, integer *n, +/* Subroutine */ void dtrcon_(char *norm, char *uplo, char *diag, integer *n, doublereal *a, integer *lda, doublereal *rcond, doublereal *work, integer *iwork, integer *info) { @@ -662,12 +662,12 @@ f"> */ doublereal scale; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int drscl_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void drscl_(integer *, doublereal *, doublereal *, integer *); doublereal anorm; logical upper; doublereal xnorm; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *); integer ix; @@ -676,7 +676,7 @@ f"> */ extern doublereal dlantr_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *); doublereal ainvnm; - extern /* Subroutine */ int dlatrs_(char *, char *, char *, char *, + extern /* Subroutine */ void dlatrs_(char *, char *, char *, char *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *); logical onenrm; @@ -723,14 +723,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTRCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *rcond = 1.; - return 0; + return; } *rcond = 0.; @@ -793,7 +793,7 @@ f"> */ } L20: - return 0; + return; /* End of DTRCON */ diff --git a/lapack-netlib/SRC/dtrevc.c b/lapack-netlib/SRC/dtrevc.c index 7a99853931..bbaacf89d4 100644 --- a/lapack-netlib/SRC/dtrevc.c +++ b/lapack-netlib/SRC/dtrevc.c @@ -739,7 +739,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtrevc_(char *side, 
char *howmny, logical *select, +/* Subroutine */ void dtrevc_(char *side, char *howmny, logical *select, integer *n, doublereal *t, integer *ldt, doublereal *vl, integer * ldvl, doublereal *vr, integer *ldvr, integer *mm, integer *m, doublereal *work, integer *info) @@ -760,24 +760,24 @@ f"> */ logical over; doublereal vmax; integer jnxt, i__, j, k; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal scale, x[4] /* was [2][2] */; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); doublereal remax; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); logical leftv, bothv; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); doublereal vcrit; logical somev; integer j1, j2, n2; doublereal xnorm; - extern /* Subroutine */ int dlaln2_(logical *, integer *, integer *, + extern /* Subroutine */ void dlaln2_(logical *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal * , doublereal *, integer *, doublereal *, doublereal *, integer *), @@ -886,13 +886,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTREVC", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ if (*n == 0) { - return 0; + return; } /* Set the constants to control overflow. 
*/ @@ -1795,7 +1795,7 @@ f"> */ } - return 0; + return; /* End of DTREVC */ diff --git a/lapack-netlib/SRC/dtrevc3.c b/lapack-netlib/SRC/dtrevc3.c index 8b29732e95..eb7055fa29 100644 --- a/lapack-netlib/SRC/dtrevc3.c +++ b/lapack-netlib/SRC/dtrevc3.c @@ -757,7 +757,7 @@ static logical c_true = TRUE_; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtrevc3_(char *side, char *howmny, logical *select, +/* Subroutine */ void dtrevc3_(char *side, char *howmny, logical *select, integer *n, doublereal *t, integer *ldt, doublereal *vl, integer * ldvl, doublereal *vr, integer *ldvr, integer *mm, integer *m, doublereal *work, integer *lwork, integer *info) @@ -780,44 +780,44 @@ static logical c_true = TRUE_; logical over; doublereal vmax; integer jnxt, i__, j, k; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal scale, x[4] /* was [2][2] */; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); doublereal remax; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); logical leftv, bothv; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); 
doublereal vcrit; logical somev; integer j1, j2; doublereal xnorm; - extern /* Subroutine */ int dlaln2_(logical *, integer *, integer *, + extern /* Subroutine */ void dlaln2_(logical *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal * , doublereal *, integer *, doublereal *, doublereal *, integer *); integer iscomplex[128]; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); integer nb, ii, ki; extern doublereal dlamch_(char *); integer ip, is, iv; doublereal wi; extern integer idamax_(integer *, doublereal *, integer *); doublereal wr; - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, - doublereal *, doublereal *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, + doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *); doublereal bignum; logical rightv; @@ -934,15 +934,15 @@ static logical c_true = TRUE_; if (*info != 0) { i__2 = -(*info); xerbla_("DTREVC3", &i__2, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible. */ if (*n == 0) { - return 0; + return; } /* Use blocked version of back-transformation if sufficient workspace. 
*/ @@ -2055,7 +2055,7 @@ static logical c_true = TRUE_; } } - return 0; + return; /* End of DTREVC3 */ diff --git a/lapack-netlib/SRC/dtrexc.c b/lapack-netlib/SRC/dtrexc.c index 8077d94688..6dd00b9070 100644 --- a/lapack-netlib/SRC/dtrexc.c +++ b/lapack-netlib/SRC/dtrexc.c @@ -661,7 +661,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dtrexc_(char *compq, integer *n, doublereal *t, integer * +/* Subroutine */ void dtrexc_(char *compq, integer *n, doublereal *t, integer * ldt, doublereal *q, integer *ldq, integer *ifst, integer *ilst, doublereal *work, integer *info) { @@ -672,9 +672,10 @@ f"> */ integer here; extern logical lsame_(char *, char *); logical wantq; - extern /* Subroutine */ int dlaexc_(logical *, integer *, doublereal *, + extern /* Subroutine */ void dlaexc_(logical *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, integer - *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer nbnext, nbf, nbl; @@ -717,13 +718,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTREXC", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 1) { - return 0; + return; } /* Determine the first row of specified block */ @@ -757,7 +758,7 @@ f"> */ } if (*ifst == *ilst) { - return 0; + return; } if (*ifst < *ilst) { @@ -791,7 +792,7 @@ f"> */ nbf, &nbnext, &work[1], info); if (*info != 0) { *ilst = here; - return 0; + return; } here += nbnext; @@ -819,7 +820,7 @@ f"> */ c__1, &nbnext, &work[1], info); if (*info != 0) { *ilst = here; - return 0; + return; } if (nbnext == 1) { @@ -843,7 +844,7 @@ f"> */ here, &c__1, &nbnext, &work[1], info); if (*info != 0) { *ilst = here; - return 0; + return; } here += 2; } else { @@ -885,7 +886,7 @@ f"> */ nbnext, &nbf, &work[1], info); if (*info != 0) { *ilst = here; 
- return 0; + return; } here -= nbnext; @@ -913,7 +914,7 @@ f"> */ nbnext, &c__1, &work[1], info); if (*info != 0) { *ilst = here; - return 0; + return; } if (nbnext == 1) { @@ -938,7 +939,7 @@ f"> */ i__1, &c__2, &c__1, &work[1], info); if (*info != 0) { *ilst = here; - return 0; + return; } here += -2; } else { @@ -960,7 +961,7 @@ f"> */ } *ilst = here; - return 0; + return; /* End of DTREXC */ diff --git a/lapack-netlib/SRC/dtrrfs.c b/lapack-netlib/SRC/dtrrfs.c index 92d693e920..a42ee25ffa 100644 --- a/lapack-netlib/SRC/dtrrfs.c +++ b/lapack-netlib/SRC/dtrrfs.c @@ -695,7 +695,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dtrrfs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void dtrrfs_(char *uplo, char *trans, char *diag, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal *b, integer * ldb, doublereal *x, integer *ldx, doublereal *ferr, doublereal *berr, doublereal *work, integer *iwork, integer *info) @@ -712,11 +712,11 @@ f"> */ doublereal s; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); logical upper; - extern /* Subroutine */ int dtrmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtrmv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *), dtrsv_(char *, char *, char *, integer *, doublereal *, integer *, doublereal *, integer *), dlacn2_(integer *, doublereal *, doublereal *, integer *, @@ -785,7 +785,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTRRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -797,7 +797,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 
0; + return; } if (notran) { @@ -1062,7 +1062,7 @@ f"> */ /* L250: */ } - return 0; + return; /* End of DTRRFS */ diff --git a/lapack-netlib/SRC/dtrsen.c b/lapack-netlib/SRC/dtrsen.c index 41bf17be48..17d3915cab 100644 --- a/lapack-netlib/SRC/dtrsen.c +++ b/lapack-netlib/SRC/dtrsen.c @@ -825,7 +825,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtrsen_(char *job, char *compq, logical *select, integer +/* Subroutine */ void dtrsen_(char *job, char *compq, logical *select, integer *n, doublereal *t, integer *ldt, doublereal *q, integer *ldq, doublereal *wr, doublereal *wi, integer *m, doublereal *s, doublereal *sep, doublereal *work, integer *lwork, integer *iwork, integer * @@ -847,22 +847,22 @@ f"> */ logical wantq, wants; doublereal rnorm; integer n1, n2; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); integer kk; extern doublereal dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); integer nn, ks; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical wantbh; - extern /* Subroutine */ int dtrexc_(char *, integer *, doublereal *, + extern /* Subroutine */ void dtrexc_(char *, integer *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, doublereal *, integer *); integer liwmin; logical wantsp, lquery; - extern /* Subroutine */ int dtrsyl_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dtrsyl_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublereal 
*, integer *, doublereal *, integer *, doublereal *, integer *); doublereal est; @@ -974,9 +974,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTRSEN", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible. */ @@ -1113,7 +1113,7 @@ f"> */ work[1] = (doublereal) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of DTRSEN */ diff --git a/lapack-netlib/SRC/dtrsna.c b/lapack-netlib/SRC/dtrsna.c index 6be0eb8e45..4bcf3e344b 100644 --- a/lapack-netlib/SRC/dtrsna.c +++ b/lapack-netlib/SRC/dtrsna.c @@ -778,7 +778,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtrsna_(char *job, char *howmny, logical *select, +/* Subroutine */ void dtrsna_(char *job, char *howmny, logical *select, integer *n, doublereal *t, integer *ldt, doublereal *vl, integer * ldvl, doublereal *vr, integer *ldvr, doublereal *s, doublereal *sep, integer *mm, integer *m, doublereal *work, integer *ldwork, integer * @@ -810,20 +810,20 @@ f"> */ logical wants; doublereal dummy[1]; integer n2; - extern /* Subroutine */ int dlacn2_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlacn2_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); extern doublereal dlapy2_(doublereal *, doublereal *); - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); doublereal cs; extern doublereal dlamch_(char *); integer nn, ks; doublereal sn, mu; - extern /* Subroutine */ int dlacpy_(char *, integer *, integer *, - doublereal *, integer *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dlacpy_(char *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal bignum; logical wantbh; - extern /* Subroutine */ 
int dlaqtr_(logical *, logical *, integer *, + extern /* Subroutine */ void dlaqtr_(logical *, logical *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *), dtrexc_(char *, integer * , doublereal *, integer *, doublereal *, integer *, integer *, @@ -928,19 +928,19 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTRSNA", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { if (somcon) { if (! select[1]) { - return 0; + return; } } if (wants) { @@ -949,7 +949,7 @@ f"> */ if (wantsp) { sep[1] = (d__1 = t[t_dim1 + 1], abs(d__1)); } - return 0; + return; } /* Get machine constants */ @@ -1186,7 +1186,7 @@ f"> */ L60: ; } - return 0; + return; /* End of DTRSNA */ diff --git a/lapack-netlib/SRC/dtrsyl.c b/lapack-netlib/SRC/dtrsyl.c index d8a4b98760..a928346829 100644 --- a/lapack-netlib/SRC/dtrsyl.c +++ b/lapack-netlib/SRC/dtrsyl.c @@ -681,7 +681,7 @@ f"> */ /* > \ingroup doubleSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int dtrsyl_(char *trana, char *tranb, integer *isgn, integer +/* Subroutine */ void dtrsyl_(char *trana, char *tranb, integer *isgn, integer *m, integer *n, doublereal *a, integer *lda, doublereal *b, integer * ldb, doublereal *c__, integer *ldc, doublereal *scale, integer *info) { @@ -696,13 +696,13 @@ f"> */ integer ierr; doublereal smin, suml, sumr; integer j, k, l; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal x[4] /* was [2][2] */; extern logical lsame_(char *, char *); integer knext, lnext, k1, k2, l1, l2; doublereal xnorm; - extern /* Subroutine */ int dlaln2_(logical *, integer *, integer *, + extern /* Subroutine */ void dlaln2_(logical *, integer *, integer *, doublereal *, doublereal *, doublereal *, 
integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal * , doublereal *, integer *, doublereal *, doublereal *, integer *), @@ -711,7 +711,7 @@ f"> */ integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); doublereal a11, db; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); extern doublereal dlamch_(char *), dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); doublereal scaloc; @@ -770,14 +770,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTRSYL", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *scale = 1.; if (*m == 0 || *n == 0) { - return 0; + return; } /* Set constants to control overflow */ @@ -1879,7 +1879,7 @@ f"> */ } - return 0; + return; /* End of DTRSYL */ diff --git a/lapack-netlib/SRC/dtrsyl3.c b/lapack-netlib/SRC/dtrsyl3.c new file mode 100644 index 0000000000..187226281e --- /dev/null +++ b/lapack-netlib/SRC/dtrsyl3.c @@ -0,0 +1,2060 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} 
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} 
inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? 
*(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = 
"\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], 
x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b DTRSYL3 */ + +/* Definition: */ +/* =========== */ + + +/* > \par Purpose */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > DTRSYL3 solves the real Sylvester matrix equation: */ +/* > */ +/* > op(A)*X + X*op(B) = scale*C or */ +/* > op(A)*X - X*op(B) = scale*C, */ +/* > */ +/* > where op(A) = A or A**T, and A and B are both upper quasi- */ +/* > triangular. 
A is M-by-M and B is N-by-N; the right hand side C and */ +/* > the solution X are M-by-N; and scale is an output scale factor, set */ +/* > <= 1 to avoid overflow in X. */ +/* > */ +/* > A and B must be in Schur canonical form (as returned by DHSEQR), that */ +/* > is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; */ +/* > each 2-by-2 diagonal block has its diagonal elements equal and its */ +/* > off-diagonal elements of opposite sign. */ +/* > */ +/* > This is the block version of the algorithm. */ +/* > \endverbatim */ + +/* Arguments */ +/* ========= */ + +/* > \param[in] TRANA */ +/* > \verbatim */ +/* > TRANA is CHARACTER*1 */ +/* > Specifies the option op(A): */ +/* > = 'N': op(A) = A (No transpose) */ +/* > = 'T': op(A) = A**T (Transpose) */ +/* > = 'C': op(A) = A**H (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANB */ +/* > \verbatim */ +/* > TRANB is CHARACTER*1 */ +/* > Specifies the option op(B): */ +/* > = 'N': op(B) = B (No transpose) */ +/* > = 'T': op(B) = B**T (Transpose) */ +/* > = 'C': op(B) = B**H (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] ISGN */ +/* > \verbatim */ +/* > ISGN is INTEGER */ +/* > Specifies the sign in the equation: */ +/* > = +1: solve op(A)*X + X*op(B) = scale*C */ +/* > = -1: solve op(A)*X - X*op(B) = scale*C */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The order of the matrix A, and the number of rows in the */ +/* > matrices X and C. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix B, and the number of columns in the */ +/* > matrices X and C. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is DOUBLE PRECISION array, dimension (LDA,M) */ +/* > The upper quasi-triangular matrix A, in Schur canonical form. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in] B */ +/* > \verbatim */ +/* > B is DOUBLE PRECISION array, dimension (LDB,N) */ +/* > The upper quasi-triangular matrix B, in Schur canonical form. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= f2cmax(1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] C */ +/* > \verbatim */ +/* > C is DOUBLE PRECISION array, dimension (LDC,N) */ +/* > On entry, the M-by-N right hand side matrix C. */ +/* > On exit, C is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDC */ +/* > \verbatim */ +/* > LDC is INTEGER */ +/* > The leading dimension of the array C. LDC >= f2cmax(1,M) */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is DOUBLE PRECISION */ +/* > The scale factor, scale, set <= 1 to avoid overflow in X. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] IWORK */ +/* > \verbatim */ +/* > IWORK is INTEGER array, dimension (MAX(1,LIWORK)) */ +/* > On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LIWORK */ +/* > \verbatim */ +/* > IWORK is INTEGER */ +/* > The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) */ +/* > + ((N + NB - 1) / NB + 1), where NB is the optimal block size. */ +/* > */ +/* > If LIWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimension of the IWORK array, */ +/* > returns this value as the first entry of the IWORK array, and */ +/* > no error message related to LIWORK is issued by XERBLA. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[out] SWORK */ +/* > \verbatim */ +/* > SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), */ +/* > MAX(1,COLS)). */ +/* > On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS */ +/* > and SWORK(2) returns the optimal COLS. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDSWORK */ +/* > \verbatim */ +/* > LDSWORK is INTEGER */ +/* > LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) */ +/* > and NB is the optimal block size. */ +/* > */ +/* > If LDSWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the SWORK matrix, */ +/* > returns these values as the first and second entry of the SWORK */ +/* > matrix, and no error message related LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > = 1: A and B have common or very close eigenvalues; perturbed */ +/* > values were used to solve the equation (but the matrices */ +/* > A and B are unchanged). */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* References: */ +/* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of */ +/* algorithms: The triangular Sylvester equation, ACM Transactions */ +/* on Mathematical Software (TOMS), volume 29, pages 218--243. */ + +/* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel */ +/* Solution of the Triangular Sylvester Equation. Lecture Notes in */ +/* Computer Science, vol 12043, pages 82--92, Springer. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
*/ + +/* ===================================================================== */ +/* Subroutine */ void dtrsyl3_(char *trana, char *tranb, integer *isgn, + integer *m, integer *n, doublereal *a, integer *lda, doublereal *b, + integer *ldb, doublereal *c__, integer *ldc, doublereal *scale, + integer *iwork, integer *liwork, doublereal *swork, integer *ldswork, + integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, swork_dim1, + swork_offset, i__1, i__2, i__3, i__4, i__5, i__6; + doublereal d__1, d__2, d__3; + + /* Local variables */ + doublereal scal, anrm, bnrm, cnrm; + integer awrk, bwrk; + logical skip; + doublereal *wnrm, xnrm; + integer i__, j, k, l; + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, + integer *), dgemm_(char *, char *, integer *, integer *, integer * + , doublereal *, doublereal *, integer *, doublereal *, integer *, + doublereal *, doublereal *, integer *); + extern logical lsame_(char *, char *); + integer iinfo, i1, i2, j1, j2, k1, k2, l1; +// extern integer myexp_(doublereal *); + integer l2, nb, pc, jj, ll; + extern doublereal dlamch_(char *), dlange_(char *, integer *, + integer *, doublereal *, integer *, doublereal *); + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, integer *, doublereal *, + integer *, integer *); + doublereal scaloc, scamin; + extern doublereal dlarmm_(doublereal *, doublereal *, doublereal *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen ); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + doublereal bignum; + logical notrna, notrnb; + doublereal smlnum; + logical lquery; + extern /* Subroutine */ void dtrsyl_(char *, char *, integer *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, + doublereal *, integer *, doublereal *, integer *); + integer nba, nbb; + 
doublereal buf, sgn; + + +/* Decode and Test input parameters */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --iwork; + swork_dim1 = *ldswork; + swork_offset = 1 + swork_dim1 * 1; + swork -= swork_offset; + + /* Function Body */ + notrna = lsame_(trana, "N"); + notrnb = lsame_(tranb, "N"); + +/* Use the same block size for all matrices. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "DTRSYL", "", m, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + +/* Compute number of blocks in A and B */ + +/* Computing MAX */ + i__1 = 1, i__2 = (*m + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nbb = f2cmax(i__1,i__2); + +/* Compute workspace */ + + *info = 0; + lquery = *liwork == -1 || *ldswork == -1; + iwork[1] = nba + nbb + 2; + if (lquery) { + *ldswork = 2; + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + } + +/* Test the input arguments */ + + if (! notrna && ! lsame_(trana, "T") && ! lsame_( + trana, "C")) { + *info = -1; + } else if (! notrnb && ! lsame_(tranb, "T") && ! 
+ lsame_(tranb, "C")) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < f2cmax(1,*m)) { + *info = -7; + } else if (*ldb < f2cmax(1,*n)) { + *info = -9; + } else if (*ldc < f2cmax(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("DTRSYL3", &i__1, 7); + return; + } else if (lquery) { + return; + } + +/* Quick return if possible */ + + *scale = 1.; + if (*m == 0 || *n == 0) { + return; + } + + wnrm = (doublereal*)malloc(f2cmax(*m,*n)*sizeof(doublereal)); +/* Use unblocked code for small problems or if insufficient */ +/* workspaces are provided */ + + if (f2cmin(nba,nbb) == 1 || *ldswork < f2cmax(nba,nbb) || *liwork < iwork[1]) { + dtrsyl_(trana, tranb, isgn, m, n, &a[a_offset], lda, &b[b_offset], + ldb, &c__[c_offset], ldc, scale, info); + return; + } + +/* Set constants to control overflow */ + + smlnum = dlamch_("S"); + bignum = 1. / smlnum; + +/* Partition A such that 2-by-2 blocks on the diagonal are not split */ + + skip = FALSE_; + i__1 = nba; + for (i__ = 1; i__ <= i__1; ++i__) { + iwork[i__] = (i__ - 1) * nb + 1; + } + iwork[nba + 1] = *m + 1; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + l1 = iwork[k]; + l2 = iwork[k + 1] - 1; + i__2 = l2; + for (l = l1; l <= i__2; ++l) { + if (skip) { + skip = FALSE_; + mycycle_(); + } + if (l >= *m) { +/* A( M, M ) is a 1-by-1 block */ + mycycle_(); + } + if (a[l + (l + 1) * a_dim1] != 0. && a[l + 1 + l * a_dim1] != 0.) 
+ { +/* Check if 2-by-2 block is split */ + if (l + 1 == iwork[k + 1]) { + ++iwork[k + 1]; + mycycle_(); + } + skip = TRUE_; + } + } + } + iwork[nba + 1] = *m + 1; + if (iwork[nba] >= iwork[nba + 1]) { + iwork[nba] = iwork[nba + 1]; + --nba; + } + +/* Partition B such that 2-by-2 blocks on the diagonal are not split */ + + pc = nba + 1; + skip = FALSE_; + i__1 = nbb; + for (i__ = 1; i__ <= i__1; ++i__) { + iwork[pc + i__] = (i__ - 1) * nb + 1; + } + iwork[pc + nbb + 1] = *n + 1; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + l1 = iwork[pc + k]; + l2 = iwork[pc + k + 1] - 1; + i__2 = l2; + for (l = l1; l <= i__2; ++l) { + if (skip) { + skip = FALSE_; + mycycle_(); + } + if (l >= *n) { +/* B( N, N ) is a 1-by-1 block */ + mycycle_(); + } + if (b[l + (l + 1) * b_dim1] != 0. && b[l + 1 + l * b_dim1] != 0.) + { +/* Check if 2-by-2 block is split */ + if (l + 1 == iwork[pc + k + 1]) { + ++iwork[pc + k + 1]; + mycycle_(); + } + skip = TRUE_; + } + } + } + iwork[pc + nbb + 1] = *n + 1; + if (iwork[pc + nbb] >= iwork[pc + nbb + 1]) { + iwork[pc + nbb] = iwork[pc + nbb + 1]; + --nbb; + } + +/* Set local scaling factors - must never attain zero. */ + + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + i__2 = nba; + for (k = 1; k <= i__2; ++k) { + swork[k + l * swork_dim1] = 1.; + } + } + +/* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. */ +/* This scaling is to ensure compatibility with TRSYL and may get flushed. 
*/ + + buf = 1.; + +/* Compute upper bounds of blocks of A and B */ + + awrk = nbb; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nba; + for (l = k; l <= i__2; ++l) { + l1 = iwork[l]; + l2 = iwork[l + 1]; + if (notrna) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (awrk + l) * swork_dim1] = dlange_("I", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (awrk + k) * swork_dim1] = dlange_("1", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } + } + } + bwrk = nbb + nba; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[pc + k]; + k2 = iwork[pc + k + 1]; + i__2 = nbb; + for (l = k; l <= i__2; ++l) { + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + if (notrnb) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (bwrk + l) * swork_dim1] = dlange_("I", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (bwrk + k) * swork_dim1] = dlange_("1", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } + } + } + + sgn = (doublereal) (*isgn); + + if (notrna && notrnb) { + +/* Solve A*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-left corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* M L-1 */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. 
*/ +/* I=K+1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__2 = k2 - k1; + i__3 = l2 - l1; + dtrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = dlange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + for (i__ = k - 1; i__ >= 1; --i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = dlange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + dscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + dscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + dgemm_("N", "N", &i__2, &i__3, &i__4, &c_b31, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + + } + + i__2 = nbb; + for (j = l + 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = dlange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + d__1 = -sgn; + dgemm_("N", "N", &i__3, &i__4, &i__5, &d__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (! notrna && notrnb) { + +/* Solve A**T*X + ISGN*X*B = scale*C. 
*/ + +/* The (K,L)th block of X is determined starting from */ +/* upper-left corner column by column by */ + +/* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 L-1 */ +/* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] */ +/* I=1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__3 = k2 - k1; + i__4 = l2 - l1; + dtrsyl_(trana, tranb, isgn, &i__3, &i__4, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__3); + } + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__3 = k2 - k1; + i__4 = l2 - l1; + xnrm = dlange_("I", &i__3, &i__4, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__3 = nba; + for (i__ = k + 1; i__ <= i__3; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = i2 - i1; + i__5 = l2 - l1; + cnrm = dlange_("I", &i__4, &i__5, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + dscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = i2 - i1; + dscal_(&i__5, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__4 = i2 - i1; + i__5 = l2 - l1; + i__6 = k2 - k1; + dgemm_("T", "N", &i__4, &i__5, &i__6, &c_b31, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + } + + i__3 = nbb; + for (j = l + 1; j <= i__3; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = k2 - k1; + i__5 = j2 - j1; + cnrm = dlange_("I", &i__4, &i__5, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__4 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + dscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = j2 - 1; + for (jj = j1; jj <= i__4; ++jj) { + i__5 = k2 - k1; + dscal_(&i__5, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__4 = k2 - k1; + i__5 = j2 - j1; + i__6 = l2 - l1; + d__1 = -sgn; + dgemm_("N", "N", &i__4, &i__5, &i__6, &d__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (! notrna && ! notrnb) { + +/* Solve A**T*X + ISGN*X*B**T = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* top-right corner column by column by */ + +/* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 N */ +/* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. 
*/ +/* I=1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__2 = k2 - k1; + i__3 = l2 - l1; + dtrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = dlange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__2 = nba; + for (i__ = k + 1; i__ <= i__2; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = i2 - i1; + i__4 = l2 - l1; + cnrm = dlange_("I", &i__3, &i__4, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = i2 - i1; + dscal_(&i__4, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__3 = i2 - i1; + i__4 = l2 - l1; + i__5 = k2 - k1; + dgemm_("T", "N", &i__3, &i__4, &i__5, &c_b31, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + } + + i__2 = l - 1; + for (j = 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = dlange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + d__1 = -sgn; + dgemm_("N", "T", &i__3, &i__4, &i__5, &d__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (notrna && ! notrnb) { + +/* Solve A*X + ISGN*X*B**T = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-right corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) */ + +/* Where */ +/* M N */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. */ +/* I=K+1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__1 = k2 - k1; + i__2 = l2 - l1; + dtrsyl_(trana, tranb, isgn, &i__1, &i__2, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) 
{ +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__1 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__1); + } + i__1 = nbb; + for (jj = 1; jj <= i__1; ++jj) { + i__2 = nba; + for (ll = 1; ll <= i__2; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__3 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b19, &i__3); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__1 = k2 - k1; + i__2 = l2 - l1; + xnrm = dlange_("I", &i__1, &i__2, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__1 = k - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = dlange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = k2 - k1; + dscal_(&i__3, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + dscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + dgemm_("N", "N", &i__2, &i__3, &i__4, &c_b31, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + + } + + i__1 = l - 1; + for (j = 1; j <= i__1; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. 
*/ + + i__2 = k2 - k1; + i__3 = j2 - j1; + cnrm = dlange_("I", &i__2, &i__3, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + dscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__2 = j2 - 1; + for (jj = j1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + dscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__2 = k2 - k1; + i__3 = j2 - j1; + i__4 = l2 - l1; + d__1 = -sgn; + dgemm_("N", "T", &i__2, &i__3, &i__4, &d__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + + } + free(wnrm); +/* Reduce local scaling factors */ + + *scale = swork[swork_dim1 + 1]; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { +/* Computing MIN */ + d__1 = *scale, d__2 = swork[k + l * swork_dim1]; + *scale = f2cmin(d__1,d__2); + } + } + + if (*scale == 0.) { + +/* The magnitude of the largest entry of the solution is larger */ +/* than the product of BIGNUM**2 and cannot be represented in the */ +/* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to */ +/* zero and give up. */ + + iwork[1] = nba + nbb + 2; + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + return; + } + +/* Realize consistent scaling */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + scal = *scale / swork[k + l * swork_dim1]; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], &c__1); + } + } + } + } + + if (buf != 1. && buf > 0.) { + +/* Decrease SCALE as much as possible. */ + +/* Computing MIN */ + d__1 = *scale / smlnum, d__2 = 1. / buf; + scaloc = f2cmin(d__1,d__2); + buf *= scaloc; + *scale /= scaloc; + } + if (buf != 1. && buf > 0.) 
{ + +/* In case of overly aggressive scaling during the computation, */ +/* flushing of the global scale factor may be prevented by */ +/* undoing some of the scaling. This step is to ensure that */ +/* this routine flushes only scale factors that TRSYL also */ +/* flushes and be usable as a drop-in replacement. */ + +/* How much can the normwise largest entry be upscaled? */ + + scal = c__[c_dim1 + 1]; + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = *n; + for (l = 1; l <= i__2; ++l) { +/* Computing MAX */ + d__2 = scal, d__3 = (d__1 = c__[k + l * c_dim1], abs(d__1)); + scal = f2cmax(d__2,d__3); + } + } + +/* Increase BUF as close to 1 as possible and apply scaling. */ + +/* Computing MIN */ + d__1 = bignum / scal, d__2 = 1. / buf; + scaloc = f2cmin(d__1,d__2); + buf *= scaloc; + dlascl_("G", &c_n1, &c_n1, &c_b32, &scaloc, m, n, &c__[c_offset], ldc, + &iwork[1]); + } + +/* Combine with buffer scaling factor. SCALE will be flushed if */ +/* BUF is less than one here. */ + + *scale *= buf; + +/* Restore workspace dimensions */ + + iwork[1] = nba + nbb + 2; + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + + return; + +/* End of DTRSYL3 */ + +} /* dtrsyl3_ */ + diff --git a/lapack-netlib/SRC/dtrsyl3.f b/lapack-netlib/SRC/dtrsyl3.f new file mode 100644 index 0000000000..c44ec38087 --- /dev/null +++ b/lapack-netlib/SRC/dtrsyl3.f @@ -0,0 +1,1241 @@ +*> \brief \b DTRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> DTRSYL3 solves the real Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**T, and A and B are both upper quasi- +*> triangular. A is M-by-M and B is N-by-N; the right hand side C and +*> the solution X are M-by-N; and scale is an output scale factor, set +*> <= 1 to avoid overflow in X. 
+*> +*> A and B must be in Schur canonical form (as returned by DHSEQR), that +*> is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; +*> each 2-by-2 diagonal block has its diagonal elements equal and its +*> off-diagonal elements of opposite sign. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'T': op(A) = A**T (Transpose) +*> = 'C': op(A) = A**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'T': op(B) = B**T (Transpose) +*> = 'C': op(B) = B**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,M) +*> The upper quasi-triangular matrix A, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB,N) +*> The upper quasi-triangular matrix B, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). 
+*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> IWORK is INTEGER +*> The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) +*> + ((N + NB - 1) / NB + 1), where NB is the optimal block size. +*> +*> If LIWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimension of the IWORK array, +*> returns this value as the first entry of the IWORK array, and +*> no error message related to LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), +*> MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. +*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. 
+*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE DTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, IWORK, LIWORK, SWORK, LDSWORK, + $ INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, M, N, + $ LIWORK, LDSWORK + DOUBLE PRECISION SCALE +* .. +* .. Array Arguments .. + INTEGER IWORK( * ) + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY, SKIP + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB, PC + DOUBLE PRECISION ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM +* .. +* .. Local Arrays .. + DOUBLE PRECISION WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLANGE, DLAMCH, DLARMM + EXTERNAL DLANGE, DLAMCH, DLARMM, ILAENV, LSAME +* .. +* .. 
External Subroutines .. + EXTERNAL DGEMM, DLASCL, DSCAL, DTRSYL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, EXPONENT, MAX, MIN +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX(8, ILAENV( 1, 'DTRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LIWORK.EQ.-1 .OR. LDSWORK.EQ.-1 ) + IWORK( 1 ) = NBA + NBB + 2 + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK( 1, 1 ) = MAX( NBA, NBB ) + SWORK( 2, 1 ) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT.LSAME( TRANA, 'T' ) .AND. .NOT. + $ LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT.LSAME( TRANB, 'T' ) .AND. .NOT. + $ LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DTRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + SCALE = ONE + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems or if insufficient +* workspaces are provided +* + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) .OR. + $ LIWORK.LT.IWORK(1) ) THEN + CALL DTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = DLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Partition A such that 2-by-2 blocks on the diagonal are not split +* + SKIP = .FALSE. 
+ DO I = 1, NBA + IWORK( I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( NBA + 1 ) = M + 1 + DO K = 1, NBA + L1 = IWORK( K ) + L2 = IWORK( K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.M ) THEN +* A( M, M ) is a 1-by-1 block + CYCLE + END IF + IF( A( L, L+1 ).NE.ZERO .AND. A( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( K + 1 ) ) THEN + IWORK( K + 1 ) = IWORK( K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( NBA + 1 ) = M + 1 + IF( IWORK( NBA ).GE.IWORK( NBA + 1 ) ) THEN + IWORK( NBA ) = IWORK( NBA + 1 ) + NBA = NBA - 1 + END IF +* +* Partition B such that 2-by-2 blocks on the diagonal are not split +* + PC = NBA + 1 + SKIP = .FALSE. + DO I = 1, NBB + IWORK( PC + I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( PC + NBB + 1 ) = N + 1 + DO K = 1, NBB + L1 = IWORK( PC + K ) + L2 = IWORK( PC + K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.N ) THEN +* B( N, N ) is a 1-by-1 block + CYCLE + END IF + IF( B( L, L+1 ).NE.ZERO .AND. B( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( PC + K + 1 ) ) THEN + IWORK( PC + K + 1 ) = IWORK( PC + K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( PC + NBB + 1 ) = N + 1 + IF( IWORK( PC + NBB ).GE.IWORK( PC + NBB + 1 ) ) THEN + IWORK( PC + NBB ) = IWORK( PC + NBB + 1 ) + NBB = NBB - 1 + END IF +* +* Set local scaling factors - must never attain zero. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. 
+* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = K, NBA + L1 = IWORK( L ) + L2 = IWORK( L + 1 ) + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = DLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = DLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = IWORK( PC + K ) + K2 = IWORK( PC + K + 1 ) + DO L = K, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = DLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = DLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = DBLE( ISGN ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF ( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. 
ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO JJ = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. 
NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. 
+ SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. 
.NOT.NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + SWORK( K, L ) = SCALOC * SWORK( K, L ) + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. 
+ SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. 
+* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. 
+* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. 
+* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. 
+ BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO +* + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to +* zero and give up. +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. 
+* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF + + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? +* + SCAL = C( 1, 1 ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( C( K, L ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. +* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL DLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of DTRSYL3 +* + END diff --git a/lapack-netlib/SRC/dtrti2.c b/lapack-netlib/SRC/dtrti2.c index 0f756a6220..47eb0d889c 100644 --- a/lapack-netlib/SRC/dtrti2.c +++ b/lapack-netlib/SRC/dtrti2.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dtrti2_(char *uplo, char *diag, integer *n, doublereal * +/* Subroutine */ void dtrti2_(char *uplo, char *diag, integer *n, doublereal * a, integer *lda, integer *info) { /* System generated locals */ @@ -631,12 +631,13 @@ f"> */ /* Local variables */ integer j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int dtrmv_(char *, char *, char *, integer *, - 
doublereal *, integer *, doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void dtrmv_(char *, char *, char *, integer *, + doublereal *, integer *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical nounit; doublereal ajj; @@ -673,7 +674,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTRTI2", &i__1, (ftnlen)6); - return 0; + return; } if (upper) { @@ -723,7 +724,7 @@ f"> */ } } - return 0; + return; /* End of DTRTI2 */ diff --git a/lapack-netlib/SRC/dtrtri.c b/lapack-netlib/SRC/dtrtri.c index e7b353fde2..1f7191472b 100644 --- a/lapack-netlib/SRC/dtrtri.c +++ b/lapack-netlib/SRC/dtrtri.c @@ -626,7 +626,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dtrtri_(char *uplo, char *diag, integer *n, doublereal * +/* Subroutine */ void dtrtri_(char *uplo, char *diag, integer *n, doublereal * a, integer *lda, integer *info) { /* System generated locals */ @@ -637,13 +637,13 @@ f"> */ /* Local variables */ integer j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int dtrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrmm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *), dtrsm_( char *, char *, char *, char *, integer *, integer *, doublereal * , doublereal *, integer *, doublereal *, integer *); logical upper; - extern /* Subroutine */ int dtrti2_(char *, char *, integer *, doublereal + extern /* Subroutine */ void dtrti2_(char *, char *, integer *, doublereal *, integer *, integer *); integer jb, nb, nn; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -684,13 +684,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTRTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check for singularity if 
non-unit. */ @@ -699,7 +699,7 @@ f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (a[*info + *info * a_dim1] == 0.) { - return 0; + return; } /* L10: */ } @@ -781,7 +781,7 @@ f"> */ } } - return 0; + return; /* End of DTRTRI */ diff --git a/lapack-netlib/SRC/dtrtrs.c b/lapack-netlib/SRC/dtrtrs.c index 41e67499bb..804ebe54e9 100644 --- a/lapack-netlib/SRC/dtrtrs.c +++ b/lapack-netlib/SRC/dtrtrs.c @@ -652,7 +652,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dtrtrs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void dtrtrs_(char *uplo, char *trans, char *diag, integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal *b, integer * ldb, integer *info) { @@ -661,9 +661,10 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void dtrsm_(char *, char *, char *, char *, integer *, integer *, doublereal *, doublereal *, integer *, - doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical nounit; @@ -708,13 +709,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTRTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check for singularity. */ @@ -723,7 +724,7 @@ f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (a[*info + *info * a_dim1] == 0.) 
{ - return 0; + return; } /* L10: */ } @@ -735,7 +736,7 @@ f"> */ dtrsm_("Left", uplo, trans, diag, n, nrhs, &c_b12, &a[a_offset], lda, &b[ b_offset], ldb); - return 0; + return; /* End of DTRTRS */ diff --git a/lapack-netlib/SRC/dtrttf.c b/lapack-netlib/SRC/dtrttf.c index 79348e9b78..199a8731c3 100644 --- a/lapack-netlib/SRC/dtrttf.c +++ b/lapack-netlib/SRC/dtrttf.c @@ -704,7 +704,7 @@ f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int dtrttf_(char *transr, char *uplo, integer *n, doublereal +/* Subroutine */ void dtrttf_(char *transr, char *uplo, integer *n, doublereal *a, integer *lda, doublereal *arf, integer *info) { /* System generated locals */ @@ -753,7 +753,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTRTTF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -762,7 +762,7 @@ f"> */ if (*n == 1) { arf[0] = a[0]; } - return 0; + return; } /* Size of array ARF(0:nt-1) */ @@ -1032,7 +1032,7 @@ f"> */ } - return 0; + return; /* End of DTRTTF */ diff --git a/lapack-netlib/SRC/dtrttp.c b/lapack-netlib/SRC/dtrttp.c index 6f88cb4f81..fb96db271f 100644 --- a/lapack-netlib/SRC/dtrttp.c +++ b/lapack-netlib/SRC/dtrttp.c @@ -614,7 +614,7 @@ f"> */ /* > \ingroup doubleOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int dtrttp_(char *uplo, integer *n, doublereal *a, integer * +/* Subroutine */ void dtrttp_(char *uplo, integer *n, doublereal *a, integer * lda, doublereal *ap, integer *info) { /* System generated locals */ @@ -657,7 +657,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTRTTP", &i__1, (ftnlen)6); - return 0; + return; } if (lower) { @@ -683,7 +683,7 @@ f"> */ } - return 0; + return; /* End of DTRTTP */ diff --git a/lapack-netlib/SRC/dtzrzf.c b/lapack-netlib/SRC/dtzrzf.c index c6f5e625c7..5836f65fce 100644 --- a/lapack-netlib/SRC/dtzrzf.c +++ b/lapack-netlib/SRC/dtzrzf.c 
@@ -667,7 +667,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dtzrzf_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dtzrzf_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *tau, doublereal *work, integer *lwork, integer *info) { /* System generated locals */ @@ -675,16 +675,17 @@ f"> */ /* Local variables */ integer i__, nbmin, m1, ib, nb, ki, kk, mu, nx; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dlarzb_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dlarzb_( char *, char *, char *, char *, integer *, integer *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int dlarzt_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dlarzt_(char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer lwkmin, ldwork; - extern /* Subroutine */ int dlatrz_(integer *, integer *, integer *, + extern /* Subroutine */ void dlatrz_(integer *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer lwkopt; logical lquery; @@ -743,22 +744,22 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("DTZRZF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0) { - return 0; + return; } else if (*m == *n) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { tau[i__] = 0.; /* L10: */ } - return 0; + return; } nbmin = 2; @@ -856,7 +857,7 @@ f"> */ work[1] = (doublereal) lwkopt; - return 0; + return; /* End of DTZRZF */ diff --git a/lapack-netlib/SRC/ieeeck.f b/lapack-netlib/SRC/ieeeck.f index 74065c3b4e..f9f6332ecf 100644 --- 
a/lapack-netlib/SRC/ieeeck.f +++ b/lapack-netlib/SRC/ieeeck.f @@ -41,7 +41,7 @@ *> \param[in] ISPEC *> \verbatim *> ISPEC is INTEGER -*> Specifies whether to test just for inifinity arithmetic +*> Specifies whether to test just for infinity arithmetic *> or whether to test for infinity and NaN arithmetic. *> = 0: Verify infinity arithmetic only. *> = 1: Verify infinity and NaN arithmetic. diff --git a/lapack-netlib/SRC/ilaenv.f b/lapack-netlib/SRC/ilaenv.f index af28503986..a639e0375a 100644 --- a/lapack-netlib/SRC/ilaenv.f +++ b/lapack-netlib/SRC/ilaenv.f @@ -469,6 +469,15 @@ INTEGER FUNCTION ILAENV( ISPEC, NAME, OPTS, N1, N2, N3, N4 ) ELSE NB = 64 END IF + ELSE IF( C3.EQ.'SYL' ) THEN +* The upper bound is to prevent overly aggressive scaling. + IF( SNAME ) THEN + NB = MIN( MAX( 48, INT( ( MIN( N1, N2 ) * 16 ) / 100) ), + $ 240 ) + ELSE + NB = MIN( MAX( 24, INT( ( MIN( N1, N2 ) * 8 ) / 100) ), + $ 80 ) + END IF END IF ELSE IF( C2.EQ.'LA' ) THEN IF( C3.EQ.'UUM' ) THEN @@ -477,6 +486,12 @@ INTEGER FUNCTION ILAENV( ISPEC, NAME, OPTS, N1, N2, N3, N4 ) ELSE NB = 64 END IF + ELSE IF( C3.EQ.'TRS' ) THEN + IF( SNAME ) THEN + NB = 32 + ELSE + NB = 32 + END IF END IF ELSE IF( SNAME .AND. C2.EQ.'ST' ) THEN IF( C3.EQ.'EBZ' ) THEN diff --git a/lapack-netlib/SRC/iparam2stage.F b/lapack-netlib/SRC/iparam2stage.F index c153eef22b..c701c2be08 100644 --- a/lapack-netlib/SRC/iparam2stage.F +++ b/lapack-netlib/SRC/iparam2stage.F @@ -178,7 +178,8 @@ INTEGER FUNCTION IPARAM2STAGE( ISPEC, NAME, OPTS, * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + LOGICAL LSAME + EXTERNAL ILAENV, LSAME * .. * .. Executable Statements .. 
* @@ -310,7 +311,7 @@ INTEGER FUNCTION IPARAM2STAGE( ISPEC, NAME, OPTS, * * Will add the VECT OPTION HERE next release VECT = OPTS(1:1) - IF( VECT.EQ.'N' ) THEN + IF( LSAME( VECT, 'N' ) ) THEN LHOUS = MAX( 1, 4*NI ) ELSE * This is not correct, it need to call the ALGO and the stage2 diff --git a/lapack-netlib/SRC/iparam2stage.c b/lapack-netlib/SRC/iparam2stage.c index 1b6c4c8b84..7ba938dcd7 100644 --- a/lapack-netlib/SRC/iparam2stage.c +++ b/lapack-netlib/SRC/iparam2stage.c @@ -717,11 +717,12 @@ integer iparam2stage_(integer *ispec, char *name__, char *opts, integer *ni, ret_val = -1; // s_copy(subnam, name__, (ftnlen)12, name_len); -strncpy(subnam,name__,13); -subnam[13]='\0'; -for (int i=0;i<13;i++) subnam[i]=toupper(subnam[i]); - //fprintf(stderr,"iparam2stage, name__ gelesen #%s#\n",name__); -//fprintf(stderr,"iparam2stage, subnam gelesen #%s#\n",subnam); + strncpy(subnam,name__,13); + subnam[13]='\0'; + { + int i; + for (i=0;i<13;i++) subnam[i]=toupper(subnam[i]); + } #if 0 diff --git a/lapack-netlib/SRC/sbbcsd.c b/lapack-netlib/SRC/sbbcsd.c index 017bcb0db4..198baa59c9 100644 --- a/lapack-netlib/SRC/sbbcsd.c +++ b/lapack-netlib/SRC/sbbcsd.c @@ -844,7 +844,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sbbcsd_(char *jobu1, char *jobu2, char *jobv1t, char * +/* Subroutine */ void sbbcsd_(char *jobu1, char *jobu2, char *jobv1t, char * jobv2t, char *trans, integer *m, integer *p, integer *q, real *theta, real *phi, real *u1, integer *ldu1, real *u2, integer *ldu2, real * v1t, integer *ldv1t, real *v2t, integer *ldv2t, real *b11d, real * @@ -864,17 +864,17 @@ f"> */ real thetamin, thetamax; logical restart11, restart12, restart21, restart22; integer lworkmin, iu1cs, iu2cs; - extern /* Subroutine */ int slas2_(real *, real *, real *, real *, real *) + extern /* Subroutine */ void slas2_(real *, real *, real *, real *, real *) ; integer iu1sn, iu2sn, lworkopt, i__, 
j; real r__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer maxit; - extern /* Subroutine */ int slasr_(char *, char *, char *, integer *, + extern /* Subroutine */ void slasr_(char *, char *, char *, integer *, integer *, real *, real *, real *, integer *); real dummy; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *); real x1, x2, y1, y2; integer iv1tcs, iv2tcs; @@ -884,12 +884,12 @@ f"> */ extern real slamch_(char *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real thresh, tolmul; - extern /* Subroutine */ int mecago_(); + extern /* Subroutine */ void mecago_(); logical lquery; real b11bulge; logical wantv1t, wantv2t; real b12bulge, b21bulge, b22bulge, eps, tol; - extern /* Subroutine */ int slartgp_(real *, real *, real *, real *, real + extern /* Subroutine */ void slartgp_(real *, real *, real *, real *, real *), slartgs_(real *, real *, real *, real *, real *); @@ -962,7 +962,7 @@ f"> */ if (*info == 0 && *q == 0) { lworkmin = 1; work[1] = (real) lworkmin; - return 0; + return; } /* Compute workspace */ @@ -987,9 +987,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SBBCSD", &i__1,(ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Get machine constants */ @@ -1082,7 +1082,7 @@ f"> */ ++(*info); } } - return 0; + return; } iter = iter + imax - imin; @@ -1793,7 +1793,7 @@ f"> */ } - return 0; + return; /* End of SBBCSD */ diff --git a/lapack-netlib/SRC/sbdsdc.c b/lapack-netlib/SRC/sbdsdc.c index 833bfb321f..ffcf1afda0 100644 --- a/lapack-netlib/SRC/sbdsdc.c +++ b/lapack-netlib/SRC/sbdsdc.c @@ -721,7 +721,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* 
Subroutine */ int sbdsdc_(char *uplo, char *compq, integer *n, real *d__, +/* Subroutine */ void sbdsdc_(char *uplo, char *compq, integer *n, real *d__, real *e, real *u, integer *ldu, real *vt, integer *ldvt, real *q, integer *iq, real *work, integer *iwork, integer *info) { @@ -735,10 +735,10 @@ f"> */ integer z__; extern logical lsame_(char *, char *); integer poles; - extern /* Subroutine */ int slasr_(char *, char *, char *, integer *, + extern /* Subroutine */ void slasr_(char *, char *, char *, integer *, integer *, real *, real *, real *, integer *); integer iuplo, nsize, start; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ), slasd0_(integer *, integer *, real *, real *, real *, integer * , real *, integer *, integer *, integer *, real *, integer *); @@ -747,21 +747,21 @@ f"> */ integer is, iu; real sn; extern real slamch_(char *); - extern /* Subroutine */ int slasda_(integer *, integer *, integer *, + extern /* Subroutine */ void slasda_(integer *, integer *, integer *, integer *, real *, real *, real *, integer *, real *, integer *, real *, real *, real *, real *, integer *, integer *, integer *, - integer *, real *, real *, real *, real *, integer *, integer *), - xerbla_(char *, integer *, ftnlen); + integer *, real *, real *, real *, real *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); integer givcol; - extern /* Subroutine */ int slasdq_(char *, integer *, integer *, integer + extern /* Subroutine */ void slasdq_(char *, integer *, 
integer *, integer *, integer *, integer *, real *, real *, real *, integer *, real * , integer *, real *, integer *, real *, integer *); integer icompq; - extern /* Subroutine */ int slaset_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slaset_(char *, integer *, integer *, real *, real *, real *, integer *), slartg_(real *, real *, real * , real *, real *); real orgnrm; @@ -833,13 +833,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SBDSDC", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } smlsiz = ilaenv_(&c__9, "SBDSDC", " ", &c__0, &c__0, &c__0, &c__0, ( ftnlen)6, (ftnlen)1); @@ -852,7 +852,7 @@ f"> */ vt[vt_dim1 + 1] = 1.f; } d__[1] = abs(d__[1]); - return 0; + return; } nm1 = *n - 1; @@ -931,7 +931,7 @@ f"> */ orgnrm = slanst_("M", n, &d__[1], &e[1]); if (orgnrm == 0.f) { - return 0; + return; } slascl_("G", &c__0, &c__0, &orgnrm, &c_b15, n, &c__1, &d__[1], n, &ierr); slascl_("G", &c__0, &c__0, &orgnrm, &c_b15, &nm1, &c__1, &e[1], &nm1, & @@ -1021,7 +1021,7 @@ f"> */ iwork[1], info); } if (*info != 0) { - return 0; + return; } start = i__ + 1; } @@ -1081,7 +1081,7 @@ f"> */ slasr_("L", "V", "B", n, n, &work[1], &work[*n], &u[u_offset], ldu); } - return 0; + return; /* End of SBDSDC */ diff --git a/lapack-netlib/SRC/sbdsqr.c b/lapack-netlib/SRC/sbdsqr.c index 495b33a4d1..832c43bca0 100644 --- a/lapack-netlib/SRC/sbdsqr.c +++ b/lapack-netlib/SRC/sbdsqr.c @@ -755,7 +755,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sbdsqr_(char *uplo, integer *n, integer *ncvt, integer * +/* Subroutine */ void sbdsqr_(char *uplo, integer *n, integer *ncvt, integer * nru, integer *ncc, real *d__, real *e, real *vt, integer *ldvt, real * u, integer *ldu, real *c__, integer *ldc, real *work, integer *info) { @@ -773,27 +773,27 @@ f"> */ real cosl; integer isub, iter; real unfl, sinl, 
cosr, smin, smax, sinr; - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); integer iterdivn; - extern /* Subroutine */ int slas2_(real *, real *, real *, real *, real *) + extern /* Subroutine */ void slas2_(real *, real *, real *, real *, real *) ; real f, g, h__; integer i__, j, m; real r__; extern logical lsame_(char *, char *); real oldcs; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer oldll; real shift, sigmn, oldsn, sminl; - extern /* Subroutine */ int slasr_(char *, char *, char *, integer *, + extern /* Subroutine */ void slasr_(char *, char *, char *, integer *, integer *, real *, real *, real *, integer *); real sigmx; logical lower; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *); integer maxitdivn; - extern /* Subroutine */ int slasq1_(integer *, real *, real *, real *, + extern /* Subroutine */ void slasq1_(integer *, real *, real *, real *, integer *), slasv2_(real *, real *, real *, real *, real *, real * , real *, real *, real *); real cs; @@ -802,7 +802,7 @@ f"> */ extern real slamch_(char *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real sminoa; - extern /* Subroutine */ int slartg_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slartg_(real *, real *, real *, real *, real * ); real thresh; logical rotate; @@ -860,10 +860,10 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SBDSQR", &i__1, (ftnlen)6); - return 0; + return; } if (*n == 0) { - return 0; + return; } if (*n == 1) { goto L160; @@ -881,7 +881,7 @@ f"> */ /* If INFO equals 2, dqds didn't finish, try to finish */ if (*info != 2) { - return 0; + return; } *info = 0; } @@ -1512,7 +1512,7 @@ f"> 
*/ /* L210: */ } L220: - return 0; + return; /* End of SBDSQR */ diff --git a/lapack-netlib/SRC/sbdsvdx.c b/lapack-netlib/SRC/sbdsvdx.c index 8c1e07a329..bf1b9aa1b5 100644 --- a/lapack-netlib/SRC/sbdsvdx.c +++ b/lapack-netlib/SRC/sbdsvdx.c @@ -742,7 +742,7 @@ static integer c__2 = 2; /* > \ingroup realOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int sbdsvdx_(char *uplo, char *jobz, char *range, integer *n, +/* Subroutine */ void sbdsvdx_(char *uplo, char *jobz, char *range, integer *n, real *d__, real *e, real *vl, real *vu, integer *il, integer *iu, integer *ns, real *s, real *z__, integer *ldz, real *work, integer * iwork, integer *info) @@ -765,7 +765,7 @@ static integer c__2 = 2; integer idend, isbeg; extern logical lsame_(char *, char *); integer idtgk, ietgk; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer iltgk, itemp, icolz; logical allsv; integer idptr; @@ -777,13 +777,13 @@ static integer c__2 = 2; logical split, valsv; integer isplt; real ortol, vutgk; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ); logical wantz; char rngvx[1]; integer irowu, irowv; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + extern /* Subroutine */ void saxpy_(integer *, real *, real *, integer *, real *, integer *); integer irowz, iifail; real mu; @@ -791,11 +791,11 @@ static integer c__2 = 2; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer isamax_(integer *, real *, integer *); real abstol; - extern /* Subroutine */ int slaset_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slaset_(char *, integer *, integer *, real *, real *, real *, integer *); real 
thresh; integer iiwork; - extern /* Subroutine */ int mecago_(), sstevx_(char *, char *, + extern /* Subroutine */ void mecago_(), sstevx_(char *, char *, integer *, real *, real *, real *, real *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *, integer *); @@ -866,14 +866,14 @@ static integer c__2 = 2; if (*info != 0) { i__1 = -(*info); xerbla_("SBDSVDX", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible (N.LE.1) */ *ns = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -890,7 +890,7 @@ static integer c__2 = 2; z__[z_dim1 + 1] = r_sign(&c_b10, &d__[1]); z__[z_dim1 + 2] = 1.f; } - return 0; + return; } abstol = slamch_("Safe Minimum") * 2; @@ -1006,7 +1006,7 @@ static integer c__2 = 2; iltgk, &iltgk, &abstol, ns, &s[1], &z__[z_offset], ldz, &work[ itemp], &iwork[iiwork], &iwork[iifail], info); if (*ns == 0) { - return 0; + return; } else { if (wantz) { i__1 = *n << 1; @@ -1207,7 +1207,7 @@ static integer c__2 = 2; , &iwork[iifail], info); if (*info != 0) { /* Exit with the error code from SSTEVX. 
*/ - return 0; + return; } emin = (r__1 = s[isbeg], abs(r__1)); i__3 = isbeg + nsl - 1; @@ -1261,7 +1261,7 @@ static integer c__2 = 2; z_dim1], &c__2); if (nrmu == 0.f) { *info = (*n << 1) + 1; - return 0; + return; } r__1 = 1.f / nrmu; sscal_(&nru, &r__1, &z__[irowu + (icolz + i__) * @@ -1292,7 +1292,7 @@ static integer c__2 = 2; z_dim1], &c__2); if (nrmv == 0.f) { *info = (*n << 1) + 1; - return 0; + return; } r__1 = -1.f / nrmv; sscal_(&nrv, &r__1, &z__[irowv + (icolz + i__) * @@ -1464,7 +1464,7 @@ static integer c__2 = 2; } } - return 0; + return; /* End of SBDSVDX */ diff --git a/lapack-netlib/SRC/scombssq.c b/lapack-netlib/SRC/scombssq.c index 964dbe88ef..1e207420f4 100644 --- a/lapack-netlib/SRC/scombssq.c +++ b/lapack-netlib/SRC/scombssq.c @@ -568,7 +568,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int scombssq_(real *v1, real *v2) +/* Subroutine */ void scombssq_(real *v1, real *v2) { /* System generated locals */ real r__1; @@ -602,7 +602,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ v1[2] = v2[2] + r__1 * r__1 * v1[2]; v1[1] = v2[1]; } - return 0; + return; /* End of SCOMBSSQ */ diff --git a/lapack-netlib/SRC/sdisna.c b/lapack-netlib/SRC/sdisna.c index 2bcd47cc11..bf12638f16 100644 --- a/lapack-netlib/SRC/sdisna.c +++ b/lapack-netlib/SRC/sdisna.c @@ -626,7 +626,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sdisna_(char *job, integer *m, integer *n, real *d__, +/* Subroutine */ void sdisna_(char *job, integer *m, integer *n, real *d__, real *sep, integer *info) { /* System generated locals */ @@ -707,13 +707,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SDISNA", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (k 
== 0) { - return 0; + return; } /* Compute reciprocal condition numbers */ @@ -768,7 +768,7 @@ f"> */ /* L30: */ } - return 0; + return; /* End of SDISNA */ diff --git a/lapack-netlib/SRC/sgbbrd.c b/lapack-netlib/SRC/sgbbrd.c index aed2baa41e..ffa248478a 100644 --- a/lapack-netlib/SRC/sgbbrd.c +++ b/lapack-netlib/SRC/sgbbrd.c @@ -701,7 +701,7 @@ f"> */ /* > \ingroup realGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgbbrd_(char *vect, integer *m, integer *n, integer *ncc, +/* Subroutine */ void sgbbrd_(char *vect, integer *m, integer *n, integer *ncc, integer *kl, integer *ku, real *ab, integer *ldab, real *d__, real * e, real *q, integer *ldq, real *pt, integer *ldpt, real *c__, integer *ldc, real *work, integer *info) @@ -712,7 +712,7 @@ f"> */ /* Local variables */ integer inca; - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); integer i__, j, l; extern logical lsame_(char *, char *); @@ -723,13 +723,14 @@ f"> */ real ra, rb, rc; integer kk, ml, mn, nr, mu; real rs; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slaset_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slaset_( char *, integer *, integer *, real *, real *, real *, integer *), slartg_(real *, real *, real *, real *, real *); integer kb1; - extern /* Subroutine */ int slargv_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slargv_(integer *, real *, integer *, real *, integer *, real *, integer *); integer ml0; - extern /* Subroutine */ int slartv_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slartv_(integer *, real *, integer *, real *, integer *, real *, real *, integer *); logical wantpt; integer mu0, klm, kun, nrt, klu1; @@ -794,7 +795,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGBBRD", &i__1, 
(ftnlen)6); - return 0; + return; } /* Initialize Q and P**T to the unit matrix, if needed */ @@ -809,7 +810,7 @@ f"> */ /* Quick return if possible. */ if (*m == 0 || *n == 0) { - return 0; + return; } minmn = f2cmin(*m,*n); @@ -1145,7 +1146,7 @@ f"> */ /* L150: */ } } - return 0; + return; /* End of SGBBRD */ diff --git a/lapack-netlib/SRC/sgbcon.c b/lapack-netlib/SRC/sgbcon.c index bb0a508db2..b4dd563ea6 100644 --- a/lapack-netlib/SRC/sgbcon.c +++ b/lapack-netlib/SRC/sgbcon.c @@ -658,7 +658,7 @@ f"> */ /* > \ingroup realGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgbcon_(char *norm, integer *n, integer *kl, integer *ku, +/* Subroutine */ void sgbcon_(char *norm, integer *n, integer *kl, integer *ku, real *ab, integer *ldab, integer *ipiv, real *anorm, real *rcond, real *work, integer *iwork, integer *info) { @@ -674,7 +674,7 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical lnoti; - extern /* Subroutine */ int srscl_(integer *, real *, real *, integer *), + extern /* Subroutine */ void srscl_(integer *, real *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *), slacn2_(integer *, real *, real *, integer *, real *, integer *, integer *); @@ -683,7 +683,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer isamax_(integer *, real *, integer *); real ainvnm; - extern /* Subroutine */ int slatbs_(char *, char *, char *, char *, + extern /* Subroutine */ void slatbs_(char *, char *, char *, char *, integer *, integer *, real *, integer *, real *, real *, real *, integer *); logical onenrm; @@ -729,7 +729,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGBCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -737,9 +737,9 @@ f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm == 0.f) { - return 0; + return; } smlnum = 
slamch_("Safe minimum"); @@ -838,7 +838,7 @@ f"> */ } L40: - return 0; + return; /* End of SGBCON */ diff --git a/lapack-netlib/SRC/sgbequ.c b/lapack-netlib/SRC/sgbequ.c index 183ad6ceb0..41c8f41d9f 100644 --- a/lapack-netlib/SRC/sgbequ.c +++ b/lapack-netlib/SRC/sgbequ.c @@ -661,7 +661,7 @@ f"> */ /* > \ingroup realGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgbequ_(integer *m, integer *n, integer *kl, integer *ku, +/* Subroutine */ void sgbequ_(integer *m, integer *n, integer *kl, integer *ku, real *ab, integer *ldab, real *r__, real *c__, real *rowcnd, real * colcnd, real *amax, integer *info) { @@ -712,7 +712,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGBEQU", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -721,7 +721,7 @@ f"> */ *rowcnd = 1.f; *colcnd = 1.f; *amax = 0.f; - return 0; + return; } /* Get machine constants. */ @@ -781,7 +781,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { if (r__[i__] == 0.f) { *info = i__; - return 0; + return; } /* L50: */ } @@ -856,7 +856,7 @@ f"> */ for (j = 1; j <= i__1; ++j) { if (c__[j] == 0.f) { *info = *m + j; - return 0; + return; } /* L110: */ } @@ -879,7 +879,7 @@ f"> */ *colcnd = f2cmax(rcmin,smlnum) / f2cmin(rcmax,bignum); } - return 0; + return; /* End of SGBEQU */ diff --git a/lapack-netlib/SRC/sgbequb.c b/lapack-netlib/SRC/sgbequb.c index f3b59ead74..94dc75c399 100644 --- a/lapack-netlib/SRC/sgbequb.c +++ b/lapack-netlib/SRC/sgbequb.c @@ -668,7 +668,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup realGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgbequb_(integer *m, integer *n, integer *kl, integer * +/* Subroutine */ void sgbequb_(integer *m, integer *n, integer *kl, integer * ku, real *ab, integer *ldab, real *r__, real *c__, real *rowcnd, real *colcnd, real *amax, integer 
*info) { @@ -719,7 +719,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("SGBEQUB", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible. */ @@ -728,7 +728,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rowcnd = 1.f; *colcnd = 1.f; *amax = 0.f; - return 0; + return; } /* Get machine constants. Assume SMLNUM is a power of the radix. */ @@ -797,7 +797,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (i__ = 1; i__ <= i__1; ++i__) { if (r__[i__] == 0.f) { *info = i__; - return 0; + return; } /* L50: */ } @@ -875,7 +875,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (j = 1; j <= i__1; ++j) { if (c__[j] == 0.f) { *info = *m + j; - return 0; + return; } /* L110: */ } @@ -898,7 +898,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *colcnd = f2cmax(rcmin,smlnum) / f2cmin(rcmax,bignum); } - return 0; + return; /* End of SGBEQUB */ diff --git a/lapack-netlib/SRC/sgbrfs.c b/lapack-netlib/SRC/sgbrfs.c index a9f5a7fe04..4d970959d2 100644 --- a/lapack-netlib/SRC/sgbrfs.c +++ b/lapack-netlib/SRC/sgbrfs.c @@ -718,7 +718,7 @@ f"> */ /* > \ingroup realGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgbrfs_(char *trans, integer *n, integer *kl, integer * +/* Subroutine */ void sgbrfs_(char *trans, integer *n, integer *kl, integer * ku, integer *nrhs, real *ab, integer *ldab, real *afb, integer *ldafb, integer *ipiv, real *b, integer *ldb, real *x, integer *ldx, real * ferr, real *berr, real *work, integer *iwork, integer *info) @@ -735,11 +735,11 @@ f"> */ real s; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int sgbmv_(char *, integer *, integer *, integer * + extern /* Subroutine */ void sgbmv_(char *, integer *, integer *, 
integer * , integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer count; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *), slacn2_(integer *, real *, real *, integer *, real *, integer *, integer *); @@ -750,7 +750,7 @@ f"> */ real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical notran; - extern /* Subroutine */ int sgbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void sgbtrs_(char *, integer *, integer *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); char transt[1]; @@ -813,7 +813,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGBRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -825,7 +825,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } if (notran) { @@ -1034,7 +1034,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of SGBRFS */ diff --git a/lapack-netlib/SRC/sgbrfsx.c b/lapack-netlib/SRC/sgbrfsx.c index e3df146f1b..a957a440d2 100644 --- a/lapack-netlib/SRC/sgbrfsx.c +++ b/lapack-netlib/SRC/sgbrfsx.c @@ -848,7 +848,7 @@ static integer c__1 = 1; /* > \ingroup realGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgbrfsx_(char *trans, char *equed, integer *n, integer * +/* Subroutine */ void sgbrfsx_(char *trans, char *equed, integer *n, integer * kl, integer *ku, integer *nrhs, real *ab, integer *ldab, real *afb, integer *ldafb, integer *ipiv, real *r__, real *c__, real *b, integer *ldb, real *x, integer *ldx, real *rcond, real *berr, integer * @@ -863,7 +863,7 @@ static integer c__1 = 1; /* Local variables */ real illrcond_thresh__; - extern /* Subroutine */ int sla_gbrfsx_extended_(integer *, integer *, + extern /* Subroutine */ 
void sla_gbrfsx_extended_(integer *, integer *, integer *, integer *, integer *, integer *, real *, integer *, real *, integer *, integer *, logical *, real *, real *, integer * , real *, integer *, real *, integer *, real *, real *, real *, @@ -880,9 +880,10 @@ static integer c__1 = 1; integer prec_type__; extern real slangb_(char *, integer *, integer *, integer *, real *, integer *, real *), slamch_(char *); - extern /* Subroutine */ int sgbcon_(char *, integer *, integer *, integer + extern /* Subroutine */ void sgbcon_(char *, integer *, integer *, integer *, real *, integer *, integer *, real *, real *, real *, integer * - , integer *), xerbla_(char *, integer *, ftnlen); + , integer *); + extern int xerbla_(char *, integer *, ftnlen); logical colequ, notran, rowequ; integer trans_type__; extern integer ilaprec_(char *); @@ -1007,7 +1008,7 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("SGBRFSX", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible. */ @@ -1030,7 +1031,7 @@ static integer c__1 = 1; err_bnds_comp__[j + err_bnds_comp_dim1 * 3] = 1.f; } } - return 0; + return; } /* Default to failure. 
*/ @@ -1191,7 +1192,7 @@ static integer c__1 = 1; } } - return 0; + return; /* End of SGBRFSX */ diff --git a/lapack-netlib/SRC/sgbsv.c b/lapack-netlib/SRC/sgbsv.c index 4aaac8022b..1764eb427c 100644 --- a/lapack-netlib/SRC/sgbsv.c +++ b/lapack-netlib/SRC/sgbsv.c @@ -672,7 +672,7 @@ e driver) */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgbsv_(integer *n, integer *kl, integer *ku, integer * +/* Subroutine */ void sgbsv_(integer *n, integer *kl, integer *ku, integer * nrhs, real *ab, integer *ldab, integer *ipiv, real *b, integer *ldb, integer *info) { @@ -680,7 +680,8 @@ e driver) */ integer ab_dim1, ab_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), sgbtrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void sgbtrf_( integer *, integer *, integer *, integer *, real *, integer *, integer *, integer *), sgbtrs_(char *, integer *, integer *, integer *, integer *, real *, integer *, integer *, real *, @@ -725,7 +726,7 @@ e driver) */ if (*info != 0) { i__1 = -(*info); xerbla_("SGBSV ", &i__1, (ftnlen)5); - return 0; + return; } /* Compute the LU factorization of the band matrix A. 
*/ @@ -738,7 +739,7 @@ e driver) */ sgbtrs_("No transpose", n, kl, ku, nrhs, &ab[ab_offset], ldab, &ipiv[ 1], &b[b_offset], ldb, info); } - return 0; + return; /* End of SGBSV */ diff --git a/lapack-netlib/SRC/sgbsvx.c b/lapack-netlib/SRC/sgbsvx.c index 44e6a8f3e1..80da1a7f4b 100644 --- a/lapack-netlib/SRC/sgbsvx.c +++ b/lapack-netlib/SRC/sgbsvx.c @@ -879,7 +879,7 @@ f"> */ /* > \ingroup realGBsolve */ /* ===================================================================== */ -/* Subroutine */ int sgbsvx_(char *fact, char *trans, integer *n, integer *kl, +/* Subroutine */ void sgbsvx_(char *fact, char *trans, integer *n, integer *kl, integer *ku, integer *nrhs, real *ab, integer *ldab, real *afb, integer *ldafb, integer *ipiv, char *equed, real *r__, real *c__, real *b, integer *ldb, real *x, integer *ldx, real *rcond, real *ferr, @@ -898,27 +898,28 @@ f"> */ real rcmin, rcmax, anorm; logical equil; integer j1, j2; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); real colcnd; extern real slangb_(char *, integer *, integer *, integer *, real *, integer *, real *), slamch_(char *); - extern /* Subroutine */ int slaqgb_(integer *, integer *, integer *, + extern /* Subroutine */ void slaqgb_(integer *, integer *, integer *, integer *, real *, integer *, real *, real *, real *, real *, real *, char *); logical nofact; - extern /* Subroutine */ int sgbcon_(char *, integer *, integer *, integer + extern /* Subroutine */ void sgbcon_(char *, integer *, integer *, integer *, real *, integer *, integer *, real *, real *, real *, integer * - , integer *), xerbla_(char *, integer *, ftnlen); + , integer *); + extern int xerbla_(char *, integer *, ftnlen); real bignum; extern real slantb_(char *, char *, char *, integer *, integer *, real *, integer *, real *); - extern /* Subroutine */ int sgbequ_(integer *, integer *, integer *, + extern /* Subroutine */ void 
sgbequ_(integer *, integer *, integer *, integer *, real *, integer *, real *, real *, real *, real *, real *, integer *); integer infequ; logical colequ; - extern /* Subroutine */ int sgbrfs_(char *, integer *, integer *, integer + extern /* Subroutine */ void sgbrfs_(char *, integer *, integer *, integer *, integer *, real *, integer *, real *, integer *, integer *, real *, integer *, real *, integer *, real *, real *, real *, integer *, integer *), sgbtrf_(integer *, integer *, @@ -927,7 +928,7 @@ f"> */ integer *); real rowcnd; logical notran; - extern /* Subroutine */ int sgbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void sgbtrs_(char *, integer *, integer *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); real smlnum; @@ -1063,7 +1064,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGBSVX", &i__1, (ftnlen)6); - return 0; + return; } if (equil) { @@ -1169,7 +1170,7 @@ f"> */ } work[1] = rpvgrw; *rcond = 0.f; - return 0; + return; } } @@ -1253,7 +1254,7 @@ f"> */ } work[1] = rpvgrw; - return 0; + return; /* End of SGBSVX */ diff --git a/lapack-netlib/SRC/sgbsvxx.c b/lapack-netlib/SRC/sgbsvxx.c index 3c373c4316..74f458c06f 100644 --- a/lapack-netlib/SRC/sgbsvxx.c +++ b/lapack-netlib/SRC/sgbsvxx.c @@ -1068,7 +1068,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup realGBsolve */ /* ===================================================================== */ -/* Subroutine */ int sgbsvxx_(char *fact, char *trans, integer *n, integer * +/* Subroutine */ void sgbsvxx_(char *fact, char *trans, integer *n, integer * kl, integer *ku, integer *nrhs, real *ab, integer *ldab, real *afb, integer *ldafb, integer *ipiv, char *equed, real *r__, real *c__, real *b, integer *ldb, real *x, integer *ldx, real *rcond, real * @@ -1092,7 +1092,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ logical equil; real colcnd; extern real 
slamch_(char *); - extern /* Subroutine */ int slaqgb_(integer *, integer *, integer *, + extern /* Subroutine */ void slaqgb_(integer *, integer *, integer *, integer *, real *, integer *, real *, real *, real *, real *, real *, char *); logical nofact; @@ -1100,17 +1100,17 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ real bignum; integer infequ; logical colequ; - extern /* Subroutine */ int sgbtrf_(integer *, integer *, integer *, + extern /* Subroutine */ void sgbtrf_(integer *, integer *, integer *, integer *, real *, integer *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); real rowcnd; logical notran; - extern /* Subroutine */ int sgbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void sgbtrs_(char *, integer *, integer *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); real smlnum; logical rowequ; - extern /* Subroutine */ int slascl2_(integer *, integer *, real *, real *, + extern /* Subroutine */ void slascl2_(integer *, integer *, real *, real *, integer *), sgbequb_(integer *, integer *, integer *, integer *, real *, integer *, real *, real *, real *, real *, real *, integer *), sgbrfsx_(char *, char *, integer *, integer *, @@ -1257,7 +1257,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("SGBSVXX", &i__1, (ftnlen)7); - return 0; + return; } if (equil) { @@ -1331,7 +1331,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rpvgrw = sla_gbrpvgrw_(n, kl, ku, info, &ab[ab_offset], ldab, & afb[afb_offset], ldafb); - return 0; + return; } } @@ -1364,7 +1364,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ slascl2_(n, nrhs, &r__[1], &x[x_offset], ldx); } - return 0; + return; /* End of SGBSVXX */ diff --git a/lapack-netlib/SRC/sgbtf2.c b/lapack-netlib/SRC/sgbtf2.c 
index 61b95d383a..9324568d3f 100644 --- a/lapack-netlib/SRC/sgbtf2.c +++ b/lapack-netlib/SRC/sgbtf2.c @@ -660,7 +660,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgbtf2_(integer *m, integer *n, integer *kl, integer *ku, +/* Subroutine */ void sgbtf2_(integer *m, integer *n, integer *kl, integer *ku, real *ab, integer *ldab, integer *ipiv, integer *info) { /* System generated locals */ @@ -668,10 +668,10 @@ f"> */ real r__1; /* Local variables */ - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); integer i__, j; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer *); integer km, jp, ju, kv; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -716,13 +716,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGBTF2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Gaussian elimination with partial pivoting */ @@ -812,7 +812,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of SGBTF2 */ diff --git a/lapack-netlib/SRC/sgbtrf.c b/lapack-netlib/SRC/sgbtrf.c index 03256e0fb5..3f752287c2 100644 --- a/lapack-netlib/SRC/sgbtrf.c +++ b/lapack-netlib/SRC/sgbtrf.c @@ -660,7 +660,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgbtrf_(integer *m, integer *n, integer *kl, integer *ku, +/* Subroutine */ void sgbtrf_(integer *m, integer *n, integer *kl, integer *ku, real *ab, integer *ldab, integer *ipiv, integer *info) { /* System generated locals */ @@ -668,17 +668,17 @@ f"> */ real r__1; /* Local 
variables */ - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); real temp; integer i__, j; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); real work13[4160] /* was [65][64] */, work31[4160] /* was [65][ 64] */; integer i2, i3, j2, j3, k2; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ), strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer *), sgbtf2_(integer *, integer *, integer *, integer @@ -730,13 +730,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGBTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Determine the block size for this environment */ @@ -1131,7 +1131,7 @@ f"> */ } } - return 0; + return; /* End of SGBTRF */ diff --git a/lapack-netlib/SRC/sgbtrs.c b/lapack-netlib/SRC/sgbtrs.c index 729e8a1c00..2139fe2314 100644 --- a/lapack-netlib/SRC/sgbtrs.c +++ b/lapack-netlib/SRC/sgbtrs.c @@ -652,7 +652,7 @@ f"> */ /* > \ingroup realGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgbtrs_(char *trans, integer *n, integer *kl, integer * +/* Subroutine */ void sgbtrs_(char *trans, integer *n, integer *kl, integer * ku, integer *nrhs, real *ab, integer *ldab, integer *ipiv, real *b, integer *ldb, integer *info) { @@ -660,14 +660,14 @@ f"> */ integer ab_dim1, ab_offset, b_dim1, b_offset, i__1, i__2, i__3; /* 
Local variables */ - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); integer i__, j, l; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); logical lnoti; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *), stbsv_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *); integer kd, lm; @@ -717,13 +717,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGBTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } kd = *ku + *kl + 1; @@ -800,7 +800,7 @@ f"> */ } } } - return 0; + return; /* End of SGBTRS */ diff --git a/lapack-netlib/SRC/sgebak.c b/lapack-netlib/SRC/sgebak.c index d6e30870ff..ad320d5d47 100644 --- a/lapack-netlib/SRC/sgebak.c +++ b/lapack-netlib/SRC/sgebak.c @@ -638,7 +638,7 @@ f"> */ /* > \ingroup realGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgebak_(char *job, char *side, integer *n, integer *ilo, +/* Subroutine */ void sgebak_(char *job, char *side, integer *n, integer *ilo, integer *ihi, real *scale, integer *m, real *v, integer *ldv, integer *info) { @@ -649,9 +649,9 @@ f"> */ integer i__, k; real s; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); logical leftv; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void 
sswap_(integer *, real *, integer *, real *, integer *); integer ii; extern /* Subroutine */ int xerbla_(char *, integer *,ftnlen); @@ -699,19 +699,19 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGEBAK", &i__1,(ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*m == 0) { - return 0; + return; } if (lsame_(job, "N")) { - return 0; + return; } if (*ilo == *ihi) { @@ -790,7 +790,7 @@ f"> */ } } - return 0; + return; /* End of SGEBAK */ diff --git a/lapack-netlib/SRC/sgebak.f b/lapack-netlib/SRC/sgebak.f index b51b611a95..abb7809a3b 100644 --- a/lapack-netlib/SRC/sgebak.f +++ b/lapack-netlib/SRC/sgebak.f @@ -236,7 +236,7 @@ SUBROUTINE SGEBAK( JOB, SIDE, N, ILO, IHI, SCALE, M, V, LDV, $ GO TO 40 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -250,7 +250,7 @@ SUBROUTINE SGEBAK( JOB, SIDE, N, ILO, IHI, SCALE, M, V, LDV, $ GO TO 50 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 50 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/sgebal.c b/lapack-netlib/SRC/sgebal.c index c2bc727fbe..07d5a6bf88 100644 --- a/lapack-netlib/SRC/sgebal.c +++ b/lapack-netlib/SRC/sgebal.c @@ -673,7 +673,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgebal_(char *job, integer *n, real *a, integer *lda, +/* Subroutine */ void sgebal_(char *job, integer *n, real *a, integer *lda, integer *ilo, integer *ihi, real *scale, integer *info) { /* System generated locals */ @@ -687,7 +687,7 @@ f"> */ integer i__, j, k, l, m; real r__, s; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sswap_(integer *, real *, integer *, real *, 
integer *); real sfmin1, sfmin2, sfmax1, sfmax2, ca, ra; extern real slamch_(char *); @@ -728,7 +728,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGEBAL", &i__1,(ftnlen)6); - return 0; + return; } k = 1; @@ -908,7 +908,7 @@ f"> */ *info = -3; i__2 = -(*info); xerbla_("SGEBAL", &i__2, (ftnlen)6); - return 0; + return; } f /= 2.f; c__ /= 2.f; @@ -954,7 +954,7 @@ f"> */ *ilo = k; *ihi = l; - return 0; + return; /* End of SGEBAL */ diff --git a/lapack-netlib/SRC/sgebd2.c b/lapack-netlib/SRC/sgebd2.c index f491189ba9..82fc210e5a 100644 --- a/lapack-netlib/SRC/sgebd2.c +++ b/lapack-netlib/SRC/sgebd2.c @@ -702,7 +702,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgebd2_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgebd2_(integer *m, integer *n, real *a, integer *lda, real *d__, real *e, real *tauq, real *taup, real *work, integer *info) { /* System generated locals */ @@ -710,9 +710,10 @@ f"> */ /* Local variables */ integer i__; - extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, - integer *, real *, real *, integer *, real *), xerbla_( - char *, integer *, ftnlen), slarfg_(integer *, real *, real *, + extern /* Subroutine */ void slarf_(char *, integer *, integer *, real *, + integer *, real *, real *, integer *, real *); + extern int xerbla_(char *, integer *, ftnlen); + extern void slarfg_(integer *, real *, real *, integer *, real *); @@ -749,7 +750,7 @@ f"> */ if (*info < 0) { i__1 = -(*info); xerbla_("SGEBD2", &i__1, (ftnlen)6); - return 0; + return; } if (*m >= *n) { @@ -860,7 +861,7 @@ f"> */ /* L20: */ } } - return 0; + return; /* End of SGEBD2 */ diff --git a/lapack-netlib/SRC/sgebrd.c b/lapack-netlib/SRC/sgebrd.c index f85f3148d1..9f1981b797 100644 --- a/lapack-netlib/SRC/sgebrd.c +++ b/lapack-netlib/SRC/sgebrd.c @@ -722,7 +722,7 @@ f"> */ /* > \endverbatim */ /* > */ /* 
===================================================================== */ -/* Subroutine */ int sgebrd_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgebrd_(integer *m, integer *n, real *a, integer *lda, real *d__, real *e, real *tauq, real *taup, real *work, integer * lwork, integer *info) { @@ -731,14 +731,14 @@ f"> */ /* Local variables */ integer i__, j, nbmin, iinfo; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer minmn; - extern /* Subroutine */ int sgebd2_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgebd2_(integer *, integer *, real *, integer *, real *, real *, real *, real *, real *, integer *); integer nb, nx; - extern /* Subroutine */ int slabrd_(integer *, integer *, integer *, real + extern /* Subroutine */ void slabrd_(integer *, integer *, integer *, real *, integer *, real *, real *, real *, real *, real *, integer *, real *, integer *); integer ws; @@ -795,9 +795,9 @@ f"> */ if (*info < 0) { i__1 = -(*info); xerbla_("SGEBRD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -805,7 +805,7 @@ f"> */ minmn = f2cmin(*m,*n); if (minmn == 0) { work[1] = 1.f; - return 0; + return; } ws = f2cmax(*m,*n); @@ -899,7 +899,7 @@ f"> */ sgebd2_(&i__2, &i__1, &a[i__ + i__ * a_dim1], lda, &d__[i__], &e[i__], & tauq[i__], &taup[i__], &work[1], &iinfo); work[1] = (real) ws; - return 0; + return; /* End of SGEBRD */ diff --git a/lapack-netlib/SRC/sgecon.c b/lapack-netlib/SRC/sgecon.c index ce2e078232..71224eb019 100644 --- a/lapack-netlib/SRC/sgecon.c +++ b/lapack-netlib/SRC/sgecon.c @@ -636,7 +636,7 @@ f"> */ /* > \ingroup realGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgecon_(char 
*norm, integer *n, real *a, integer *lda, +/* Subroutine */ void sgecon_(char *norm, integer *n, real *a, integer *lda, real *anorm, real *rcond, real *work, integer *iwork, integer *info) { /* System generated locals */ @@ -648,7 +648,7 @@ f"> */ real scale; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int srscl_(integer *, real *, real *, integer *), + extern /* Subroutine */ void srscl_(integer *, real *, real *, integer *), slacn2_(integer *, real *, real *, integer *, real *, integer *, integer *); real sl; @@ -660,7 +660,7 @@ f"> */ real ainvnm; logical onenrm; char normin[1]; - extern /* Subroutine */ int slatrs_(char *, char *, char *, char *, + extern /* Subroutine */ void slatrs_(char *, char *, char *, char *, integer *, real *, integer *, real *, real *, real *, integer *); real smlnum; @@ -698,7 +698,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGECON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -706,9 +706,9 @@ f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm == 0.f) { - return 0; + return; } smlnum = slamch_("Safe minimum"); @@ -772,7 +772,7 @@ f"> */ } L20: - return 0; + return; /* End of SGECON */ diff --git a/lapack-netlib/SRC/sgecon.f b/lapack-netlib/SRC/sgecon.f index a284b094be..86aeea73bb 100644 --- a/lapack-netlib/SRC/sgecon.f +++ b/lapack-netlib/SRC/sgecon.f @@ -106,6 +106,7 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value +*> =-5: if ANORM is NAN or negative. *> \endverbatim * * Authors: @@ -152,10 +153,10 @@ SUBROUTINE SGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, IWORK, INTEGER ISAVE( 3 ) * .. * .. External Functions .. - LOGICAL LSAME + LOGICAL LSAME, SISNAN INTEGER ISAMAX REAL SLAMCH - EXTERNAL LSAME, ISAMAX, SLAMCH + EXTERNAL LSAME, ISAMAX, SLAMCH, SISNAN * .. * .. External Subroutines .. 
EXTERNAL SLACN2, SLATRS, SRSCL, XERBLA @@ -175,7 +176,7 @@ SUBROUTINE SGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, IWORK, INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( ANORM.LT.ZERO ) THEN + ELSE IF( ANORM.LT.ZERO .OR. SISNAN( ANORM ) ) THEN INFO = -5 END IF IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/sgeequ.c b/lapack-netlib/SRC/sgeequ.c index 040949a7c1..d890f787d4 100644 --- a/lapack-netlib/SRC/sgeequ.c +++ b/lapack-netlib/SRC/sgeequ.c @@ -647,7 +647,7 @@ f"> */ /* > \ingroup realGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgeequ_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgeequ_(integer *m, integer *n, real *a, integer *lda, real *r__, real *c__, real *rowcnd, real *colcnd, real *amax, integer *info) { @@ -693,7 +693,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGEEQU", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -702,7 +702,7 @@ f"> */ *rowcnd = 1.f; *colcnd = 1.f; *amax = 0.f; - return 0; + return; } /* Get machine constants. 
*/ @@ -756,7 +756,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { if (r__[i__] == 0.f) { *info = i__; - return 0; + return; } /* L50: */ } @@ -826,7 +826,7 @@ f"> */ for (j = 1; j <= i__1; ++j) { if (c__[j] == 0.f) { *info = *m + j; - return 0; + return; } /* L110: */ } @@ -849,7 +849,7 @@ f"> */ *colcnd = f2cmax(rcmin,smlnum) / f2cmin(rcmax,bignum); } - return 0; + return; /* End of SGEEQU */ diff --git a/lapack-netlib/SRC/sgeequb.c b/lapack-netlib/SRC/sgeequb.c index a90ad2f1ff..9515a5df8a 100644 --- a/lapack-netlib/SRC/sgeequb.c +++ b/lapack-netlib/SRC/sgeequb.c @@ -654,7 +654,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup realGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgeequb_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgeequb_(integer *m, integer *n, real *a, integer *lda, real *r__, real *c__, real *rowcnd, real *colcnd, real *amax, integer *info) { @@ -700,7 +700,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("SGEEQUB", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible. */ @@ -709,7 +709,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rowcnd = 1.f; *colcnd = 1.f; *amax = 0.f; - return 0; + return; } /* Get machine constants. Assume SMLNUM is a power of the radix. 
*/ @@ -772,7 +772,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (i__ = 1; i__ <= i__1; ++i__) { if (r__[i__] == 0.f) { *info = i__; - return 0; + return; } /* L50: */ } @@ -846,7 +846,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (j = 1; j <= i__1; ++j) { if (c__[j] == 0.f) { *info = *m + j; - return 0; + return; } /* L110: */ } @@ -869,7 +869,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *colcnd = f2cmax(rcmin,smlnum) / f2cmin(rcmax,bignum); } - return 0; + return; /* End of SGEEQUB */ diff --git a/lapack-netlib/SRC/sgees.c b/lapack-netlib/SRC/sgees.c index 54ede98d60..ecd148f124 100644 --- a/lapack-netlib/SRC/sgees.c +++ b/lapack-netlib/SRC/sgees.c @@ -729,7 +729,7 @@ or GE matrices */ /* > \ingroup realGEeigen */ /* ===================================================================== */ -/* Subroutine */ int sgees_(char *jobvs, char *sort, L_fp select, integer *n, +/* Subroutine */ void sgees_(char *jobvs, char *sort, L_fp select, integer *n, real *a, integer *lda, integer *sdim, real *wr, real *wi, real *vs, integer *ldvs, real *work, integer *lwork, logical *bwork, integer * info) @@ -746,37 +746,37 @@ or GE matrices */ extern logical lsame_(char *, char *); logical cursl; integer i1, i2; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ); logical lst2sl; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); logical scalea; integer ip; real cscale; - extern /* Subroutine */ int sgebak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgebak_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, integer *), sgebal_(char *, integer *, real *, integer *, integer 
*, integer *, real *, integer *); extern real slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); - extern /* Subroutine */ int sgehrd_(integer *, integer *, integer *, real - *, integer *, real *, real *, integer *, integer *), xerbla_(char - *, integer *, ftnlen); + extern /* Subroutine */ void sgehrd_(integer *, integer *, integer *, real + *, integer *, real *, real *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); logical lastsl; - extern /* Subroutine */ int sorghr_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorghr_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *), shseqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, real *, integer *, real *, integer *, integer *); integer minwrk, maxwrk; real smlnum; integer hswork; - extern /* Subroutine */ int strsen_(char *, char *, logical *, integer *, + extern /* Subroutine */ void strsen_(char *, char *, logical *, integer *, real *, integer *, real *, integer *, real *, real *, integer *, real *, real *, real *, integer *, integer *, integer *, integer * ); @@ -873,16 +873,16 @@ or GE matrices */ if (*info != 0) { i__1 = -(*info); xerbla_("SGEES ", &i__1, (ftnlen)5); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *sdim = 0; - return 0; + return; } /* Get machine constants */ @@ -1117,7 +1117,7 @@ or GE matrices */ } work[1] = (real) maxwrk; - return 0; + return; /* End of SGEES */ diff 
--git a/lapack-netlib/SRC/sgees.f b/lapack-netlib/SRC/sgees.f index d40503f899..6febd549cf 100644 --- a/lapack-netlib/SRC/sgees.f +++ b/lapack-netlib/SRC/sgees.f @@ -302,7 +302,7 @@ SUBROUTINE SGEES( JOBVS, SORT, SELECT, N, A, LDA, SDIM, WR, WI, * CALL SHSEQR( 'S', JOBVS, N, 1, N, A, LDA, WR, WI, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = WORK( 1 ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, N + HSWORK ) diff --git a/lapack-netlib/SRC/sgeesx.c b/lapack-netlib/SRC/sgeesx.c index 0c9f1832ad..6f6a4fb539 100644 --- a/lapack-netlib/SRC/sgeesx.c +++ b/lapack-netlib/SRC/sgeesx.c @@ -793,7 +793,7 @@ f"> */ /* > \ingroup realGEeigen */ /* ===================================================================== */ -/* Subroutine */ int sgeesx_(char *jobvs, char *sort, L_fp select, char * +/* Subroutine */ void sgeesx_(char *jobvs, char *sort, L_fp select, char * sense, integer *n, real *a, integer *lda, integer *sdim, real *wr, real *wi, real *vs, integer *ldvs, real *rconde, real *rcondv, real * work, integer *lwork, integer *iwork, integer *liwork, logical *bwork, @@ -809,31 +809,31 @@ f"> */ extern logical lsame_(char *, char *); logical cursl; integer liwrk, i1, i2; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ); logical lst2sl; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); logical scalea; integer ip; real cscale; - extern /* Subroutine */ int sgebak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgebak_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, integer *), sgebal_(char *, integer *, real *, integer *, integer *, integer *, real *, integer *); extern real slamch_(char *); - extern /* Subroutine */ int sgehrd_(integer *, integer *, integer *, 
real - *, integer *, real *, real *, integer *, integer *), xerbla_(char - *, integer *, ftnlen); + extern /* Subroutine */ void sgehrd_(integer *, integer *, integer *, real + *, integer *, real *, real *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); extern real slange_(char *, integer *, integer *, real *, integer *, real *); real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); logical wantsb, wantse, lastsl; - extern /* Subroutine */ int sorghr_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorghr_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *), shseqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, real *, integer *, real *, integer *, integer *); @@ -841,7 +841,7 @@ f"> */ logical wantsn; real smlnum; integer hswork; - extern /* Subroutine */ int strsen_(char *, char *, logical *, integer *, + extern /* Subroutine */ void strsen_(char *, char *, logical *, integer *, real *, integer *, real *, integer *, real *, real *, integer *, real *, real *, real *, integer *, integer *, integer *, integer * ); @@ -964,16 +964,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGEESX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *sdim = 0; - return 0; + return; } /* Get machine constants */ @@ -1236,7 +1236,7 @@ f"> */ iwork[1] = 1; } - return 0; + return; /* End of SGEESX */ diff --git a/lapack-netlib/SRC/sgeesx.f b/lapack-netlib/SRC/sgeesx.f index 27c4338d40..6810fe7c80 100644 --- 
a/lapack-netlib/SRC/sgeesx.f +++ b/lapack-netlib/SRC/sgeesx.f @@ -382,7 +382,7 @@ SUBROUTINE SGEESX( JOBVS, SORT, SELECT, SENSE, N, A, LDA, SDIM, * CALL SHSEQR( 'S', JOBVS, N, 1, N, A, LDA, WR, WI, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = WORK( 1 ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, N + HSWORK ) diff --git a/lapack-netlib/SRC/sgeev.c b/lapack-netlib/SRC/sgeev.c index 608f274cf5..891eb89725 100644 --- a/lapack-netlib/SRC/sgeev.c +++ b/lapack-netlib/SRC/sgeev.c @@ -706,7 +706,7 @@ ices */ /* > \ingroup realGEeigen */ /* ===================================================================== */ -/* Subroutine */ int sgeev_(char *jobvl, char *jobvr, integer *n, real *a, +/* Subroutine */ void sgeev_(char *jobvl, char *jobvr, integer *n, real *a, integer *lda, real *wr, real *wi, real *vl, integer *ldvl, real *vr, integer *ldvr, real *work, integer *lwork, integer *info) { @@ -720,35 +720,35 @@ ices */ char side[1]; real anrm; integer ierr, itau, iwrk, nout; - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); extern real snrm2_(integer *, real *, integer *); integer i__, k; real r__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); extern real slapy2_(real *, real *); real cs; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); logical scalea; real cscale; - extern /* Subroutine */ int sgebak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgebak_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, integer *), sgebal_(char *, integer *, real *, integer *, integer *, integer *, real *, integer *); real sn; extern real slamch_(char *), slange_(char *, 
integer *, integer *, real *, integer *, real *); - extern /* Subroutine */ int sgehrd_(integer *, integer *, integer *, real - *, integer *, real *, real *, integer *, integer *), xerbla_(char - *, integer *, ftnlen); + extern /* Subroutine */ void sgehrd_(integer *, integer *, integer *, real + *, integer *, real *, real *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); logical select[1]; real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); extern integer isamax_(integer *, real *, integer *); - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slartg_(real *, real *, real *, real *, real *), sorghr_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *), shseqr_( @@ -759,7 +759,7 @@ ices */ real smlnum; integer hswork; logical lquery, wantvr; - extern /* Subroutine */ int strevc3_(char *, char *, logical *, integer *, + extern /* Subroutine */ void strevc3_(char *, char *, logical *, integer *, real *, integer *, real *, integer *, real *, integer *, integer *, integer *, real *, integer *, integer *); integer ihi; @@ -900,15 +900,15 @@ ices */ if (*info != 0) { i__1 = -(*info); xerbla_("SGEEV ", &i__1, (ftnlen)5); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1145,7 +1145,7 @@ ices */ } work[1] = (real) maxwrk; - return 0; + return; /* End of SGEEV */ diff --git a/lapack-netlib/SRC/sgeevx.c b/lapack-netlib/SRC/sgeevx.c index 8965356a51..2409d7323d 100644 --- 
a/lapack-netlib/SRC/sgeevx.c +++ b/lapack-netlib/SRC/sgeevx.c @@ -819,7 +819,7 @@ f"> */ /* > \ingroup realGEeigen */ /* ===================================================================== */ -/* Subroutine */ int sgeevx_(char *balanc, char *jobvl, char *jobvr, char * +/* Subroutine */ void sgeevx_(char *balanc, char *jobvl, char *jobvr, char * sense, integer *n, real *a, integer *lda, real *wr, real *wi, real * vl, integer *ldvl, real *vr, integer *ldvr, integer *ilo, integer * ihi, real *scale, real *abnrm, real *rconde, real *rcondv, real *work, @@ -834,43 +834,43 @@ f"> */ char side[1]; real anrm; integer ierr, itau, iwrk, nout; - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); extern real snrm2_(integer *, real *, integer *); integer i__, k; real r__; integer icond; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); extern real slapy2_(real *, real *); real cs; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); logical scalea; real cscale; - extern /* Subroutine */ int sgebak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgebak_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, integer *), sgebal_(char *, integer *, real *, integer *, integer *, integer *, real *, integer *); real sn; extern real slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); - extern /* Subroutine */ int sgehrd_(integer *, integer *, integer *, real - *, integer *, real *, real *, integer *, integer *), xerbla_(char - *, integer *, ftnlen); + extern /* Subroutine */ void sgehrd_(integer *, integer *, integer *, real + *, integer *, real *, real *, integer *, integer *); + 
extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); logical select[1]; real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); extern integer isamax_(integer *, real *, integer *); - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slartg_(real *, real *, real *, real *, real *), sorghr_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *), shseqr_( char *, char *, integer *, integer *, integer *, real *, integer * , real *, real *, real *, integer *, real *, integer *, integer *); integer minwrk, maxwrk; - extern /* Subroutine */ int strsna_(char *, char *, logical *, integer *, + extern /* Subroutine */ void strsna_(char *, char *, logical *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *, integer *); @@ -879,7 +879,7 @@ f"> */ logical wntsne; real smlnum; logical lquery, wantvr, wntsnn, wntsnv; - extern /* Subroutine */ int strevc3_(char *, char *, logical *, integer *, + extern /* Subroutine */ void strevc3_(char *, char *, logical *, integer *, real *, integer *, real *, integer *, real *, integer *, integer *, integer *, real *, integer *, integer *); char job[1]; @@ -1042,15 +1042,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGEEVX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1313,7 +1313,7 @@ f"> */ } work[1] = (real) maxwrk; - return 0; + return; /* End of SGEEVX */ diff --git 
a/lapack-netlib/SRC/sgehd2.c b/lapack-netlib/SRC/sgehd2.c index 465e06ac6c..76973e11c0 100644 --- a/lapack-netlib/SRC/sgehd2.c +++ b/lapack-netlib/SRC/sgehd2.c @@ -663,7 +663,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgehd2_(integer *n, integer *ilo, integer *ihi, real *a, +/* Subroutine */ void sgehd2_(integer *n, integer *ilo, integer *ihi, real *a, integer *lda, real *tau, real *work, integer *info) { /* System generated locals */ @@ -671,10 +671,10 @@ f"> */ /* Local variables */ integer i__; - extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, - integer *, real *, real *, integer *, real *), xerbla_( - char *, integer *, ftnlen), slarfg_(integer *, real *, real *, - integer *, real *); + extern /* Subroutine */ void slarf_(char *, integer *, integer *, real *, + integer *, real *, real *, integer *, real *); + extern int xerbla_(char *, integer *, ftnlen); + extern void slarfg_(integer *, real *, real *, integer *, real *); real aii; @@ -710,7 +710,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGEHD2", &i__1, (ftnlen)6); - return 0; + return; } i__1 = *ihi - 1; @@ -743,7 +743,7 @@ f"> */ /* L10: */ } - return 0; + return; /* End of SGEHD2 */ diff --git a/lapack-netlib/SRC/sgehrd.c b/lapack-netlib/SRC/sgehrd.c index 2c990165f6..2b0172e321 100644 --- a/lapack-netlib/SRC/sgehrd.c +++ b/lapack-netlib/SRC/sgehrd.c @@ -686,7 +686,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgehrd_(integer *n, integer *ilo, integer *ihi, real *a, +/* Subroutine */ void sgehrd_(integer *n, integer *ilo, integer *ihi, real *a, integer *lda, real *tau, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -694,7 +694,7 @@ f"> */ /* Local variables */ integer i__, j, nbmin, iinfo; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, 
integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), strmm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, @@ -706,9 +706,10 @@ f"> */ integer ib; real ei; integer nb, nh, nx; - extern /* Subroutine */ int slarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void slarfb_(char *, char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, - integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *,ftnlen); + integer *, real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *,ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; @@ -764,9 +765,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGEHRD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Set elements 1:ILO-1 and IHI:N-1 of TAU to zero */ @@ -787,7 +788,7 @@ f"> */ nh = *ihi - *ilo + 1; if (nh <= 1) { work[1] = 1.f; - return 0; + return; } /* Determine the block size */ @@ -898,7 +899,7 @@ f"> */ sgehd2_(n, &i__, ihi, &a[a_offset], lda, &tau[1], &work[1], &iinfo); work[1] = (real) lwkopt; - return 0; + return; /* End of SGEHRD */ diff --git a/lapack-netlib/SRC/sgejsv.c b/lapack-netlib/SRC/sgejsv.c index c985f160bb..0820b0dfba 100644 --- a/lapack-netlib/SRC/sgejsv.c +++ b/lapack-netlib/SRC/sgejsv.c @@ -991,7 +991,7 @@ f"> */ /* > drmac@math.hr. Thank you. 
*/ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgejsv_(char *joba, char *jobu, char *jobv, char *jobr, +/* Subroutine */ void sgejsv_(char *joba, char *jobu, char *jobv, char *jobr, char *jobt, char *jobp, integer *m, integer *n, real *a, integer *lda, real *sva, real *u, integer *ldu, real *v, integer *ldv, real *work, integer *lwork, integer *iwork, integer *info) @@ -1011,23 +1011,23 @@ f"> */ integer p, q; logical jracc; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); real small, entra, sfmin; logical lsvec; real epsln; logical rsvec; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); integer n1; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *); logical l2aber; - extern /* Subroutine */ int strsm_(char *, char *, char *, char *, + extern /* Subroutine */ void strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ); real condr1, condr2, uscal1, uscal2; logical l2kill, l2rank, l2tran; - extern /* Subroutine */ int sgeqp3_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgeqp3_(integer *, integer *, real *, integer *, integer *, real *, real *, integer *, integer *); logical l2pert; integer nr; @@ -1038,10 +1038,10 @@ f"> */ real aatmax; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical noscal; - extern /* Subroutine */ int sgelqf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgelqf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *); extern integer isamax_(integer *, real *, integer *); - extern 
/* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *), sgeqrf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slaset_(char *, @@ -1049,16 +1049,17 @@ f"> */ real entrat; logical almort; real maxprj; - extern /* Subroutine */ int spocon_(char *, integer *, real *, integer *, + extern /* Subroutine */ void spocon_(char *, integer *, real *, integer *, real *, real *, real *, integer *, integer *); logical errest; - extern /* Subroutine */ int sgesvj_(char *, char *, char *, integer *, + extern /* Subroutine */ void sgesvj_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *, real *, integer * , real *, integer *, integer *), slassq_( integer *, real *, integer *, real *, real *); logical transp; extern /* Subroutine */ int slaswp_(integer *, real *, integer *, integer - *, integer *, integer *, integer *), sorgqr_(integer *, integer *, + *, integer *, integer *, integer *); + extern void sorgqr_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *), sormlq_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, @@ -1168,7 +1169,7 @@ f"> */ /* #:( */ i__1 = -(*info); xerbla_("SGEJSV", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return for void matrix (Y3K safe) */ @@ -1184,7 +1185,7 @@ f"> */ work[5] = 0.f; work[6] = 0.f; work[7] = 0.f; - return 0; + return; } /* Determine whether the matrix U should be M x N or M x M */ @@ -1224,7 +1225,7 @@ f"> */ *info = -9; i__2 = -(*info); xerbla_("SGEJSV", &i__2, (ftnlen)6); - return 0; + return; } aaqq = sqrt(aaqq); if (aapp < big / aaqq && noscal) { @@ -1286,7 +1287,7 @@ f"> */ iwork[1] = 0; iwork[2] = 0; iwork[3] = 0; - return 0; + return; 
} /* Issue warning if denormalized column norms detected. Override the */ @@ -1351,7 +1352,7 @@ f"> */ work[6] = 0.f; work[7] = 0.f; } - return 0; + return; } @@ -2797,6 +2798,6 @@ f"> */ iwork[2] = numrank; iwork[3] = warning; - return 0; + return; } /* sgejsv_ */ diff --git a/lapack-netlib/SRC/sgejsv.f b/lapack-netlib/SRC/sgejsv.f index 82ac6b94b5..923573bdb4 100644 --- a/lapack-netlib/SRC/sgejsv.f +++ b/lapack-netlib/SRC/sgejsv.f @@ -224,7 +224,7 @@ *> *> \param[out] U *> \verbatim -*> U is REAL array, dimension ( LDU, N ) +*> U is REAL array, dimension ( LDU, N ) or ( LDU, M ) *> If JOBU = 'U', then U contains on exit the M-by-N matrix of *> the left singular vectors. *> If JOBU = 'F', then U contains on exit the M-by-M matrix of @@ -271,7 +271,7 @@ *> *> \param[out] WORK *> \verbatim -*> WORK is REAL array, dimension (LWORK) +*> WORK is REAL array, dimension (MAX(7,LWORK)) *> On exit, *> WORK(1) = SCALE = WORK(2) / WORK(1) is the scaling factor such *> that SCALE*SVA(1:N) are the computed singular values @@ -318,36 +318,36 @@ *> LWORK >= max(2*M+N,4*N+1,7). This is the minimal requirement. *> ->> For optimal performance (blocked code) the optimal value *> is LWORK >= max(2*M+N,3*N+(N+1)*NB,7). Here NB is the optimal -*> block size for DGEQP3 and DGEQRF. +*> block size for SGEQP3 and SGEQRF. *> In general, optimal LWORK is computed as -*> LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DGEQRF), 7). +*> LWORK >= max(2*M+N,N+LWORK(SGEQP3),N+LWORK(SGEQRF), 7). *> -> .. an estimate of the scaled condition number of A is *> required (JOBA='E', 'G'). In this case, LWORK is the maximum *> of the above and N*N+4*N, i.e. LWORK >= max(2*M+N,N*N+4*N,7). *> ->> For optimal performance (blocked code) the optimal value *> is LWORK >= max(2*M+N,3*N+(N+1)*NB, N*N+4*N, 7). *> In general, the optimal length LWORK is computed as -*> LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DGEQRF), -*> N+N*N+LWORK(DPOCON),7). 
+*> LWORK >= max(2*M+N,N+LWORK(SGEQP3),N+LWORK(SGEQRF), +*> N+N*N+LWORK(SPOCON),7). *> *> If SIGMA and the right singular vectors are needed (JOBV = 'V'), *> -> the minimal requirement is LWORK >= max(2*M+N,4*N+1,7). *> -> For optimal performance, LWORK >= max(2*M+N,3*N+(N+1)*NB,7), -*> where NB is the optimal block size for DGEQP3, DGEQRF, DGELQ, -*> DORMLQ. In general, the optimal length LWORK is computed as -*> LWORK >= max(2*M+N,N+LWORK(DGEQP3), N+LWORK(DPOCON), -*> N+LWORK(DGELQ), 2*N+LWORK(DGEQRF), N+LWORK(DORMLQ)). +*> where NB is the optimal block size for SGEQP3, SGEQRF, SGELQF, +*> SORMLQ. In general, the optimal length LWORK is computed as +*> LWORK >= max(2*M+N,N+LWORK(SGEQP3), N+LWORK(SPOCON), +*> N+LWORK(SGELQF), 2*N+LWORK(SGEQRF), N+LWORK(SORMLQ)). *> *> If SIGMA and the left singular vectors are needed *> -> the minimal requirement is LWORK >= max(2*M+N,4*N+1,7). *> -> For optimal performance: *> if JOBU = 'U' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,7), *> if JOBU = 'F' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,N+M*NB,7), -*> where NB is the optimal block size for DGEQP3, DGEQRF, DORMQR. +*> where NB is the optimal block size for SGEQP3, SGEQRF, SORMQR. *> In general, the optimal length LWORK is computed as -*> LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DPOCON), -*> 2*N+LWORK(DGEQRF), N+LWORK(DORMQR)). -*> Here LWORK(DORMQR) equals N*NB (for JOBU = 'U') or +*> LWORK >= max(2*M+N,N+LWORK(SGEQP3),N+LWORK(SPOCON), +*> 2*N+LWORK(SGEQRF), N+LWORK(SORMQR)). +*> Here LWORK(SORMQR) equals N*NB (for JOBU = 'U') or *> M*NB (for JOBU = 'F'). *> *> If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and @@ -357,12 +357,12 @@ *> LWORK >= max(2*M+N, 4*N+N*N,2*N+N*N+6). *> -> For optimal performance, LWORK should be additionally *> larger than N+M*NB, where NB is the optimal block size -*> for DORMQR. +*> for SORMQR. *> \endverbatim *> *> \param[out] IWORK *> \verbatim -*> IWORK is INTEGER array, dimension (M+3*N). +*> IWORK is INTEGER array, dimension (MAX(3,M+3*N)). 
*> On exit, *> IWORK(1) = the numerical rank determined after the initial *> QR factorization with pivoting. See the descriptions diff --git a/lapack-netlib/SRC/sgelq.c b/lapack-netlib/SRC/sgelq.c index 282695fd2b..55d7079b60 100644 --- a/lapack-netlib/SRC/sgelq.c +++ b/lapack-netlib/SRC/sgelq.c @@ -681,7 +681,7 @@ static integer c__2 = 2; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgelq_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgelq_(integer *m, integer *n, real *a, integer *lda, real *t, integer *tsize, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -693,11 +693,11 @@ static integer c__2 = 2; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int sgelqt_(integer *, integer *, integer *, real + extern /* Subroutine */ void sgelqt_(integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *); logical lminws, lquery; integer mintsz; - extern /* Subroutine */ int slaswlq_(integer *, integer *, integer *, + extern /* Subroutine */ void slaswlq_(integer *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer * , integer *); @@ -837,15 +837,15 @@ static integer c__2 = 2; if (*info != 0) { i__1 = -(*info); xerbla_("SGELQ", &i__1, (ftnlen)5); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* The LQ Decomposition */ @@ -858,7 +858,7 @@ static integer c__2 = 2; } work[1] = (real) lwreq; - return 0; + return; /* End of SGELQ */ diff --git a/lapack-netlib/SRC/sgelq2.c b/lapack-netlib/SRC/sgelq2.c index 3a65795fbc..0baa8a4456 100644 --- a/lapack-netlib/SRC/sgelq2.c +++ b/lapack-netlib/SRC/sgelq2.c @@ -639,7 +639,7 @@ f"> */ /* 
> \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgelq2_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgelq2_(integer *m, integer *n, real *a, integer *lda, real *tau, real *work, integer *info) { /* System generated locals */ @@ -647,10 +647,10 @@ f"> */ /* Local variables */ integer i__, k; - extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, - integer *, real *, real *, integer *, real *), xerbla_( - char *, integer *, ftnlen), slarfg_(integer *, real *, real *, - integer *, real *); + extern /* Subroutine */ void slarf_(char *, integer *, integer *, real *, + integer *, real *, real *, integer *, real *); + extern int xerbla_(char *, integer *, ftnlen); + extern void slarfg_(integer *, real *, real *, integer *, real *); real aii; @@ -684,7 +684,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGELQ2", &i__1, (ftnlen)6); - return 0; + return; } k = f2cmin(*m,*n); @@ -713,7 +713,7 @@ f"> */ } /* L10: */ } - return 0; + return; /* End of SGELQ2 */ diff --git a/lapack-netlib/SRC/sgelqf.c b/lapack-netlib/SRC/sgelqf.c index c5dfa93261..f013d4f645 100644 --- a/lapack-netlib/SRC/sgelqf.c +++ b/lapack-netlib/SRC/sgelqf.c @@ -659,7 +659,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgelqf_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgelqf_(integer *m, integer *n, real *a, integer *lda, real *tau, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -667,15 +667,16 @@ f"> */ /* Local variables */ integer i__, k, nbmin, iinfo; - extern /* Subroutine */ int sgelq2_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgelq2_(integer *, integer *, real *, integer *, real *, real *, integer *); integer ib, nb, nx; - extern /* Subroutine */ int slarfb_(char *, char *, char *, char *, + 
extern /* Subroutine */ void slarfb_(char *, char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, - integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void slarft_(char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *); integer ldwork, lwkopt; logical lquery; @@ -719,9 +720,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGELQF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -729,7 +730,7 @@ f"> */ k = f2cmin(*m,*n); if (k == 0) { work[1] = 1.f; - return 0; + return; } nbmin = 2; @@ -814,7 +815,7 @@ f"> */ } work[1] = (real) iws; - return 0; + return; /* End of SGELQF */ diff --git a/lapack-netlib/SRC/sgelqt.c b/lapack-netlib/SRC/sgelqt.c index d28d4d2a21..5c5efdb6f5 100644 --- a/lapack-netlib/SRC/sgelqt.c +++ b/lapack-netlib/SRC/sgelqt.c @@ -630,7 +630,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgelqt_(integer *m, integer *n, integer *mb, real *a, +/* Subroutine */ void sgelqt_(integer *m, integer *n, integer *mb, real *a, integer *lda, real *t, integer *ldt, real *work, integer *info) { /* System generated locals */ @@ -638,9 +638,11 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ integer i__, k, iinfo, ib; - extern /* Subroutine */ int slarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void slarfb_(char *, char *, char *, char *, integer *, 
integer *, integer *, real *, integer *, real *, - integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *, ftnlen), sgelqt3_( + integer *, real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void sgelqt3_( integer *, integer *, real *, integer *, real *, integer *, integer *); @@ -681,14 +683,14 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("SGELQT", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ k = f2cmin(*m,*n); if (k == 0) { - return 0; + return; } /* Blocked loop of length K */ @@ -717,7 +719,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ i__ * a_dim1], lda, &work[1], &i__5); } } - return 0; + return; /* End of SGELQT */ diff --git a/lapack-netlib/SRC/sgelqt3.c b/lapack-netlib/SRC/sgelqt3.c index 3ea1f3b9a2..fdb918647b 100644 --- a/lapack-netlib/SRC/sgelqt3.c +++ b/lapack-netlib/SRC/sgelqt3.c @@ -627,7 +627,7 @@ static real c_b19 = -1.f; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgelqt3_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgelqt3_(integer *m, integer *n, real *a, integer *lda, real *t, integer *ldt, integer *info) { /* System generated locals */ @@ -635,13 +635,15 @@ static real c_b19 = -1.f; /* Local variables */ integer i__, j, iinfo; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer i1, j1, m1, m2; - extern /* Subroutine */ int strmm_(char *, char *, char *, char *, + extern /* Subroutine */ void strmm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * - ), xerbla_(char *, integer *, ftnlen), 
slarfg_(integer *, real *, real *, integer *, real *); + ); + extern int xerbla_(char *, integer *, ftnlen); + extern void slarfg_(integer *, real *, real *, integer *, real *); /* -- LAPACK computational routine (version 3.8.0) -- */ @@ -675,7 +677,7 @@ static real c_b19 = -1.f; if (*info != 0) { i__1 = -(*info); xerbla_("SGELQT3", &i__1, (ftnlen)7); - return 0; + return; } if (*m == 1) { @@ -773,7 +775,7 @@ static real c_b19 = -1.f; } - return 0; + return; /* End of SGELQT3 */ diff --git a/lapack-netlib/SRC/sgels.c b/lapack-netlib/SRC/sgels.c index a6cdf047a0..7817f09a8c 100644 --- a/lapack-netlib/SRC/sgels.c +++ b/lapack-netlib/SRC/sgels.c @@ -698,7 +698,7 @@ static integer c__0 = 0; /* > \ingroup realGEsolve */ /* ===================================================================== */ -/* Subroutine */ int sgels_(char *trans, integer *m, integer *n, integer * +/* Subroutine */ void sgels_(char *trans, integer *m, integer *n, integer * nrhs, real *a, integer *lda, real *b, integer *ldb, real *work, integer *lwork, integer *info) { @@ -714,7 +714,7 @@ static integer c__0 = 0; integer wsize; real rwork[1]; integer nb; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); integer mn; extern real slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); @@ -723,20 +723,21 @@ static integer c__0 = 0; integer *, integer *, ftnlen, ftnlen); integer scllen; real bignum; - extern /* Subroutine */ int sgelqf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgelqf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *), slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *), sgeqrf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *); real smlnum; - extern /* Subroutine */ int sormlq_(char *, char 
*, integer *, integer *, + extern /* Subroutine */ void sormlq_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); logical lquery; - extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, - integer *, integer *), strtrs_(char *, char *, + integer *, integer *); + extern int strtrs_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer * , integer *); @@ -838,9 +839,9 @@ static integer c__0 = 0; if (*info != 0) { i__1 = -(*info); xerbla_("SGELS ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -850,7 +851,7 @@ static integer c__0 = 0; if (f2cmin(i__1,*nrhs) == 0) { i__1 = f2cmax(*m,*n); slaset_("Full", &i__1, nrhs, &c_b33, &c_b33, &b[b_offset], ldb); - return 0; + return; } /* Get machine parameters */ @@ -936,7 +937,7 @@ static integer c__0 = 0; , lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } scllen = *n; @@ -951,7 +952,7 @@ static integer c__0 = 0; lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } /* B(N+1:M,1:NRHS) = ZERO */ @@ -998,7 +999,7 @@ static integer c__0 = 0; , lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } /* B(M+1:N,1:NRHS) = 0 */ @@ -1041,7 +1042,7 @@ static integer c__0 = 0; lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } scllen = *m; @@ -1070,7 +1071,7 @@ static integer c__0 = 0; L50: work[1] = (real) wsize; - return 0; + return; /* End of SGELS */ diff --git a/lapack-netlib/SRC/sgelsd.c b/lapack-netlib/SRC/sgelsd.c index 7a195bec7c..60e9a6b629 100644 --- a/lapack-netlib/SRC/sgelsd.c +++ b/lapack-netlib/SRC/sgelsd.c @@ -728,7 +728,7 @@ f"> */ /* > Osni Marques, LBNL/NERSC, USA \n */ /* 
===================================================================== */ -/* Subroutine */ int sgelsd_(integer *m, integer *n, integer *nrhs, real *a, +/* Subroutine */ void sgelsd_(integer *m, integer *n, integer *nrhs, real *a, integer *lda, real *b, integer *ldb, real *s, real *rcond, integer * rank, real *work, integer *lwork, integer *iwork, integer *info) { @@ -740,9 +740,9 @@ f"> */ integer itau, nlvl, iascl, ibscl; real sfmin; integer minmn, maxmn, itaup, itauq, mnthr, nwork, ie, il; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); integer mm; - extern /* Subroutine */ int sgebrd_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgebrd_(integer *, integer *, real *, integer *, real *, real *, real *, real *, real *, integer *, integer *); extern real slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); @@ -750,30 +750,30 @@ f"> */ extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); real bignum; - extern /* Subroutine */ int sgelqf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgelqf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *), slalsd_(char *, integer *, integer *, integer *, real *, real *, real *, integer *, real * , integer *, real *, integer *, integer *), slascl_(char * , integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); integer wlalsd; - extern /* Subroutine */ int sgeqrf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgeqrf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *); integer ldwork; - extern /* Subroutine */ int sormbr_(char *, char *, char *, integer *, + extern /* Subroutine 
*/ void sormbr_(char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer * , real *, integer *, integer *); integer liwork, minwrk, maxwrk; real smlnum; - extern /* Subroutine */ int sormlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormlq_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); logical lquery; integer smlsiz; - extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); real eps; @@ -975,9 +975,9 @@ fprintf(stdout,"start of SGELSD\n"); if (*info != 0) { i__1 = -(*info); xerbla_("SGELSD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible. */ @@ -985,7 +985,7 @@ fprintf(stdout,"start of SGELSD\n"); if (*m == 0 || *n == 0) { fprintf(stdout,"SGELSD quickreturn rank=0\n"); *rank = 0; - return 0; + return; } /* Get machine parameters. 
*/ @@ -1293,7 +1293,7 @@ fprintf(stdout,"other underdetermined, path 2"); work[1] = (real) maxwrk; iwork[1] = liwork; fprintf(stdout, "end of SGELSD\n"); - return 0; + return; /* End of SGELSD */ diff --git a/lapack-netlib/SRC/sgelss.c b/lapack-netlib/SRC/sgelss.c index 9e6bec2d19..bf92f6d0c5 100644 --- a/lapack-netlib/SRC/sgelss.c +++ b/lapack-netlib/SRC/sgelss.c @@ -689,7 +689,7 @@ f"> */ /* > \ingroup realGEsolve */ /* ===================================================================== */ -/* Subroutine */ int sgelss_(integer *m, integer *n, integer *nrhs, real *a, +/* Subroutine */ void sgelss_(integer *m, integer *n, integer *nrhs, real *a, integer *lda, real *b, integer *ldb, real *s, real *rcond, integer * rank, real *work, integer *lwork, integer *info) { @@ -702,22 +702,22 @@ f"> */ integer itau, lwork_sgebrd__, lwork_sgeqrf__, i__, lwork_sorgbr__, lwork_sormbr__, lwork_sormlq__, iascl, ibscl, lwork_sormqr__, chunk; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); real sfmin; integer minmn, maxmn; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer itaup, itauq; - extern /* Subroutine */ int srscl_(integer *, real *, real *, integer *); + extern /* Subroutine */ void srscl_(integer *, real *, real *, integer *); integer mnthr, iwork; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); integer bl, ie, il; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); integer mm, bdspac; - extern /* Subroutine */ int sgebrd_(integer *, integer *, 
real *, integer + extern /* Subroutine */ void sgebrd_(integer *, integer *, real *, integer *, real *, real *, real *, real *, real *, integer *, integer *); extern real slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); @@ -725,7 +725,7 @@ f"> */ extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); real bignum; - extern /* Subroutine */ int sgelqf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgelqf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *), slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *), sgeqrf_(integer *, integer *, real @@ -737,16 +737,16 @@ f"> */ char *, integer *, integer *, integer *, real *, integer *, real * , real *, integer *, integer *); integer ldwork; - extern /* Subroutine */ int sormbr_(char *, char *, char *, integer *, + extern /* Subroutine */ void sormbr_(char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer * , real *, integer *, integer *); integer minwrk, maxwrk; real smlnum; - extern /* Subroutine */ int sormlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormlq_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); logical lquery; - extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); real dum[1], eps, thr; @@ -968,16 +968,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGELSS", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { *rank = 0; - return 0; + return; } /* Get machine parameters */ @@ -1424,7 +1424,7 
@@ f"> */ L70: work[1] = (real) maxwrk; - return 0; + return; /* End of SGELSS */ diff --git a/lapack-netlib/SRC/sgelss.f b/lapack-netlib/SRC/sgelss.f index be9e2ea116..9aed4329f7 100644 --- a/lapack-netlib/SRC/sgelss.f +++ b/lapack-netlib/SRC/sgelss.f @@ -253,11 +253,11 @@ SUBROUTINE SGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, * * Compute space needed for SGEQRF CALL SGEQRF( M, N, A, LDA, DUM(1), DUM(1), -1, INFO ) - LWORK_SGEQRF=DUM(1) + LWORK_SGEQRF = INT( DUM(1) ) * Compute space needed for SORMQR CALL SORMQR( 'L', 'T', M, NRHS, N, A, LDA, DUM(1), B, $ LDB, DUM(1), -1, INFO ) - LWORK_SORMQR=DUM(1) + LWORK_SORMQR = INT( DUM(1) ) MM = N MAXWRK = MAX( MAXWRK, N + LWORK_SGEQRF ) MAXWRK = MAX( MAXWRK, N + LWORK_SORMQR ) @@ -272,15 +272,15 @@ SUBROUTINE SGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, * Compute space needed for SGEBRD CALL SGEBRD( MM, N, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_SGEBRD=DUM(1) + LWORK_SGEBRD = INT( DUM(1) ) * Compute space needed for SORMBR CALL SORMBR( 'Q', 'L', 'T', MM, NRHS, N, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_SORMBR=DUM(1) + LWORK_SORMBR = INT( DUM(1) ) * Compute space needed for SORGBR CALL SORGBR( 'P', N, N, N, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_SORGBR=DUM(1) + LWORK_SORGBR = INT( DUM(1) ) * Compute total workspace needed MAXWRK = MAX( MAXWRK, 3*N + LWORK_SGEBRD ) MAXWRK = MAX( MAXWRK, 3*N + LWORK_SORMBR ) @@ -304,19 +304,19 @@ SUBROUTINE SGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, * Compute space needed for SGEBRD CALL SGEBRD( M, M, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_SGEBRD=DUM(1) + LWORK_SGEBRD = INT( DUM(1) ) * Compute space needed for SORMBR CALL SORMBR( 'Q', 'L', 'T', M, NRHS, N, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_SORMBR=DUM(1) + LWORK_SORMBR = INT( DUM(1) ) * Compute space needed for SORGBR CALL SORGBR( 'P', M, M, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_SORGBR=DUM(1) + LWORK_SORGBR = INT( DUM(1) ) * 
Compute space needed for SORMLQ CALL SORMLQ( 'L', 'T', N, NRHS, M, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_SORMLQ=DUM(1) + LWORK_SORMLQ = INT( DUM(1) ) * Compute total workspace needed MAXWRK = M + M*ILAENV( 1, 'SGELQF', ' ', M, N, -1, $ -1 ) @@ -337,15 +337,15 @@ SUBROUTINE SGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, * Compute space needed for SGEBRD CALL SGEBRD( M, N, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_SGEBRD=DUM(1) + LWORK_SGEBRD = INT( DUM(1) ) * Compute space needed for SORMBR CALL SORMBR( 'Q', 'L', 'T', M, NRHS, M, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_SORMBR=DUM(1) + LWORK_SORMBR = INT( DUM(1) ) * Compute space needed for SORGBR CALL SORGBR( 'P', M, N, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_SORGBR=DUM(1) + LWORK_SORGBR = INT( DUM(1) ) MAXWRK = 3*M + LWORK_SGEBRD MAXWRK = MAX( MAXWRK, 3*M + LWORK_SORMBR ) MAXWRK = MAX( MAXWRK, 3*M + LWORK_SORGBR ) diff --git a/lapack-netlib/SRC/sgelst.c b/lapack-netlib/SRC/sgelst.c new file mode 100644 index 0000000000..7e17c542c3 --- /dev/null +++ b/lapack-netlib/SRC/sgelst.c @@ -0,0 +1,1100 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex 
Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint 
*inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) 
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) 
{exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], 
pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief SGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factori +zation with compact WY representation of Q. 
*/ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download SGELST + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE SGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* CHARACTER TRANS */ +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* REAL A( LDA, * ), B( LDB, * ), WORK( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > SGELST solves overdetermined or underdetermined real linear systems */ +/* > involving an M-by-N matrix A, or its transpose, using a QR or LQ */ +/* > factorization of A with compact WY representation of Q. */ +/* > It is assumed that A has full rank. */ +/* > */ +/* > The following options are provided: */ +/* > */ +/* > 1. If TRANS = 'N' and m >= n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A*X ||. */ +/* > */ +/* > 2. If TRANS = 'N' and m < n: find the minimum norm solution of */ +/* > an underdetermined system A * X = B. */ +/* > */ +/* > 3. If TRANS = 'T' and m >= n: find the minimum norm solution of */ +/* > an underdetermined system A**T * X = B. */ +/* > */ +/* > 4. If TRANS = 'T' and m < n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A**T * X ||. */ +/* > */ +/* > Several right hand side vectors b and solution vectors x can be */ +/* > handled in a single call; they are stored as the columns of the */ +/* > M-by-NRHS right hand side matrix B and the N-by-NRHS solution */ +/* > matrix X. 
*/ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > = 'N': the linear system involves A; */ +/* > = 'T': the linear system involves A**T. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of right hand sides, i.e., the number of */ +/* > columns of the matrices B and X. NRHS >=0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] A */ +/* > \verbatim */ +/* > A is REAL array, dimension (LDA,N) */ +/* > On entry, the M-by-N matrix A. */ +/* > On exit, */ +/* > if M >= N, A is overwritten by details of its QR */ +/* > factorization as returned by SGEQRT; */ +/* > if M < N, A is overwritten by details of its LQ */ +/* > factorization as returned by SGELQT. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is REAL array, dimension (LDB,NRHS) */ +/* > On entry, the matrix B of right hand side vectors, stored */ +/* > columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS */ +/* > if TRANS = 'T'. 
*/ +/* > On exit, if INFO = 0, B is overwritten by the solution */ +/* > vectors, stored columnwise: */ +/* > if TRANS = 'N' and m >= n, rows 1 to n of B contain the least */ +/* > squares solution vectors; the residual sum of squares for the */ +/* > solution in each column is given by the sum of squares of */ +/* > elements N+1 to M in that column; */ +/* > if TRANS = 'N' and m < n, rows 1 to N of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'T' and m >= n, rows 1 to M of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'T' and m < n, rows 1 to M of B contain the */ +/* > least squares solution vectors; the residual sum of squares */ +/* > for the solution in each column is given by the sum of */ +/* > squares of elements M+1 to N in that column. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= MAX(1,M,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is REAL array, dimension (MAX(1,LWORK)) */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The dimension of the array WORK. */ +/* > LWORK >= f2cmax( 1, MN + f2cmax( MN, NRHS ) ). */ +/* > For optimal performance, */ +/* > LWORK >= f2cmax( 1, (MN + f2cmax( MN, NRHS ))*NB ). */ +/* > where MN = f2cmin(M,N) and NB is the optimum block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal size of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > > 0: if INFO = i, the i-th diagonal element of the */ +/* > triangular factor of A is zero, so that A does not have */ +/* > full rank; the least squares solution could not be */ +/* > computed. */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup realGEsolve */ + +/* > \par Contributors: */ +/* ================== */ +/* > */ +/* > \verbatim */ +/* > */ +/* > November 2022, Igor Kozachenko, */ +/* > Computer Science Division, */ +/* > University of California, Berkeley */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* Subroutine */ void sgelst_(char *trans, integer *m, integer *n, integer * + nrhs, real *a, integer *lda, real *b, integer *ldb, real *work, + integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; + + /* Local variables */ + real anrm, bnrm; + integer brow; + logical tpsd; + integer i__, j, iascl, ibscl; + extern logical lsame_(char *, char *); + integer nbmin; + real rwork[1]; + integer lwopt, nb; + extern /* Subroutine */ void slabad_(real *, real *); + integer mn; + extern real slamch_(char *), slange_(char *, integer *, integer *, + real *, integer *, real *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + integer scllen; + real bignum; + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, real *, integer *, integer *), slaset_(char *, integer *, integer *, 
real *, real *, + real *, integer *), sgelqt_(integer *, integer *, integer + *, real *, integer *, real *, integer *, real *, integer *); + integer mnnrhs; + extern /* Subroutine */ void sgeqrt_(integer *, integer *, integer *, real + *, integer *, real *, integer *, real *, integer *); + real smlnum; + logical lquery; + extern /* Subroutine */ int strtrs_(char *, char *, char *, integer *, + integer *, real *, integer *, real *, integer *, integer *); + extern void sgemlqt_(char *, char *, integer *, + integer *, integer *, integer *, real *, integer *, real *, + integer *, real *, integer *, real *, integer *), + sgemqrt_(char *, char *, integer *, integer *, integer *, integer + *, real *, integer *, real *, integer *, real *, integer *, real * + , integer *); + + +/* -- LAPACK driver routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + mn = f2cmin(*m,*n); + lquery = *lwork == -1; + if (! (lsame_(trans, "N") || lsame_(trans, "T"))) { + *info = -1; + } else if (*m < 0) { + *info = -2; + } else if (*n < 0) { + *info = -3; + } else if (*nrhs < 0) { + *info = -4; + } else if (*lda < f2cmax(1,*m)) { + *info = -6; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = f2cmax(1,*m); + if (*ldb < f2cmax(i__1,*n)) { + *info = -8; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = 1, i__2 = mn + f2cmax(mn,*nrhs); + if (*lwork < f2cmax(i__1,i__2) && ! 
lquery) { + *info = -10; + } + } + } + +/* Figure out optimal block size and optimal workspace size */ + + if (*info == 0 || *info == -10) { + + tpsd = TRUE_; + if (lsame_(trans, "N")) { + tpsd = FALSE_; + } + + nb = ilaenv_(&c__1, "SGELST", " ", m, n, &c_n1, &c_n1, (ftnlen)6, ( + ftnlen)1); + + mnnrhs = f2cmax(mn,*nrhs); +/* Computing MAX */ + i__1 = 1, i__2 = (mn + mnnrhs) * nb; + lwopt = f2cmax(i__1,i__2); + work[1] = (real) lwopt; + + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("SGELST ", &i__1, 6); + return; + } else if (lquery) { + return; + } + +/* Quick return if possible */ + +/* Computing MIN */ + i__1 = f2cmin(*m,*n); + if (f2cmin(i__1,*nrhs) == 0) { + i__1 = f2cmax(*m,*n); + slaset_("Full", &i__1, nrhs, &c_b12, &c_b12, &b[b_offset], ldb); + work[1] = (real) lwopt; + return; + } + +/* *GEQRT and *GELQT routines cannot accept NB larger than f2cmin(M,N) */ + + if (nb > mn) { + nb = mn; + } + +/* Determine the block size from the supplied LWORK */ +/* ( at this stage we know that LWORK >= (minimum required workspace, */ +/* but it may be less than optimal) */ + +/* Computing MIN */ + i__1 = nb, i__2 = *lwork / (mn + mnnrhs); + nb = f2cmin(i__1,i__2); + +/* The minimum value of NB, when blocked code is used */ + +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "SGELST", " ", m, n, &c_n1, &c_n1, ( + ftnlen)6, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + if (nb < nbmin) { + nb = 1; + } + +/* Get machine parameters */ + + smlnum = slamch_("S") / slamch_("P"); + bignum = 1.f / smlnum; + slabad_(&smlnum, &bignum); + +/* Scale A, B if f2cmax element outside range [SMLNUM,BIGNUM] */ + + anrm = slange_("M", m, n, &a[a_offset], lda, rwork); + iascl = 0; + if (anrm > 0.f && anrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + slascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, + info); + iascl = 1; + } else if (anrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + slascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, 
&a[a_offset], lda, + info); + iascl = 2; + } else if (anrm == 0.f) { + +/* Matrix all zero. Return zero solution. */ + + i__1 = f2cmax(*m,*n); + slaset_("Full", &i__1, nrhs, &c_b12, &c_b12, &b[b_offset], ldb); + work[1] = (real) lwopt; + return; + } + + brow = *m; + if (tpsd) { + brow = *n; + } + bnrm = slange_("M", &brow, nrhs, &b[b_offset], ldb, rwork); + ibscl = 0; + if (bnrm > 0.f && bnrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + slascl_("G", &c__0, &c__0, &bnrm, &smlnum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 1; + } else if (bnrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + slascl_("G", &c__0, &c__0, &bnrm, &bignum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 2; + } + + if (*m >= *n) { + +/* M > N: */ +/* Compute the blocked QR factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least N, optimally N*NB. */ + + sgeqrt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M > N, A is not transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A * X - B ||. */ + +/* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + sgemqrt_("Left", "Transpose", m, nrhs, n, &nb, &a[a_offset], lda, + &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) */ + + strtrs_("Upper", "No transpose", "Non-unit", n, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return; + } + + scllen = *n; + + } else { + +/* M > N, A is transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A**T * X = B. */ + +/* Compute B := inv(R**T) * B in two row blocks of B. 
*/ + +/* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) */ + + strtrs_("Upper", "Transpose", "Non-unit", n, nrhs, &a[a_offset], + lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return; + } + +/* Block 2: Zero out all rows below the N-th row in B: */ +/* B(N+1:M,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = *n + 1; i__ <= i__2; ++i__) { + b[i__ + j * b_dim1] = 0.f; + } + } + +/* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + sgemqrt_("Left", "No transpose", m, nrhs, n, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *m; + + } + + } else { + +/* M < N: */ +/* Compute the blocked LQ factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least M, optimally M*NB. */ + + sgelqt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M < N, A is not transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A * X = B. */ + +/* Compute B := inv(L) * B in two row blocks of B. */ + +/* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) */ + + strtrs_("Lower", "No transpose", "Non-unit", m, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return; + } + +/* Block 2: Zero out all rows below the M-th row in B: */ +/* B(M+1:N,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *n; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + b[i__ + j * b_dim1] = 0.f; + } + } + +/* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. 
*/ + + sgemlqt_("Left", "Transpose", n, nrhs, m, &nb, &a[a_offset], lda, + &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *n; + + } else { + +/* M < N, A is transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A**T * X - B ||. */ + +/* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + sgemlqt_("Left", "No transpose", n, nrhs, m, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) */ + + strtrs_("Lower", "Transpose", "Non-unit", m, nrhs, &a[a_offset], + lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return; + } + + scllen = *m; + + } + + } + +/* Undo scaling */ + + if (iascl == 1) { + slascl_("G", &c__0, &c__0, &anrm, &smlnum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (iascl == 2) { + slascl_("G", &c__0, &c__0, &anrm, &bignum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + if (ibscl == 1) { + slascl_("G", &c__0, &c__0, &smlnum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (ibscl == 2) { + slascl_("G", &c__0, &c__0, &bignum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + + work[1] = (real) lwopt; + + return; + +/* End of SGELST */ + +} /* sgelst_ */ + diff --git a/lapack-netlib/SRC/sgelst.f b/lapack-netlib/SRC/sgelst.f new file mode 100644 index 0000000000..5377bc720a --- /dev/null +++ b/lapack-netlib/SRC/sgelst.f @@ -0,0 +1,531 @@ +*> \brief SGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factorization with compact WY representation of Q. 
+* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SGELST + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, +* INFO ) +* +* .. Scalar Arguments .. +* CHARACTER TRANS +* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SGELST solves overdetermined or underdetermined real linear systems +*> involving an M-by-N matrix A, or its transpose, using a QR or LQ +*> factorization of A with compact WY representation of Q. +*> It is assumed that A has full rank. +*> +*> The following options are provided: +*> +*> 1. If TRANS = 'N' and m >= n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A*X ||. +*> +*> 2. If TRANS = 'N' and m < n: find the minimum norm solution of +*> an underdetermined system A * X = B. +*> +*> 3. If TRANS = 'T' and m >= n: find the minimum norm solution of +*> an underdetermined system A**T * X = B. +*> +*> 4. If TRANS = 'T' and m < n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A**T * X ||. +*> +*> Several right hand side vectors b and solution vectors x can be +*> handled in a single call; they are stored as the columns of the +*> M-by-NRHS right hand side matrix B and the N-by-NRHS solution +*> matrix X. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': the linear system involves A; +*> = 'T': the linear system involves A**T. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. 
+*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrices B and X. NRHS >=0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. +*> On exit, +*> if M >= N, A is overwritten by details of its QR +*> factorization as returned by SGEQRT; +*> if M < N, A is overwritten by details of its LQ +*> factorization as returned by SGELQT. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is REAL array, dimension (LDB,NRHS) +*> On entry, the matrix B of right hand side vectors, stored +*> columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS +*> if TRANS = 'T'. +*> On exit, if INFO = 0, B is overwritten by the solution +*> vectors, stored columnwise: +*> if TRANS = 'N' and m >= n, rows 1 to n of B contain the least +*> squares solution vectors; the residual sum of squares for the +*> solution in each column is given by the sum of squares of +*> elements N+1 to M in that column; +*> if TRANS = 'N' and m < n, rows 1 to N of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'T' and m >= n, rows 1 to M of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'T' and m < n, rows 1 to M of B contain the +*> least squares solution vectors; the residual sum of squares +*> for the solution in each column is given by the sum of +*> squares of elements M+1 to N in that column. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= MAX(1,M,N). 
+*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= max( 1, MN + max( MN, NRHS ) ). +*> For optimal performance, +*> LWORK >= max( 1, (MN + max( MN, NRHS ))*NB ). +*> where MN = min(M,N) and NB is the optimum block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the i-th diagonal element of the +*> triangular factor of A is zero, so that A does not have +*> full rank; the least squares solution could not be +*> computed. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup realGEsolve +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2022, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, + $ INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. + REAL A( LDA, * ), B( LDB, * ), WORK( * ) +* .. 
+* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, TPSD + INTEGER BROW, I, IASCL, IBSCL, J, LWOPT, MN, MNNRHS, + $ NB, NBMIN, SCLLEN + REAL ANRM, BIGNUM, BNRM, SMLNUM +* .. +* .. Local Arrays .. + REAL RWORK( 1 ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, SLANGE + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE +* .. +* .. External Subroutines .. + EXTERNAL SGELQT, SGEQRT, SGEMLQT, SGEMQRT, SLABAD, + $ SLASCL, SLASET, STRTRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments. +* + INFO = 0 + MN = MIN( M, N ) + LQUERY = ( LWORK.EQ.-1 ) + IF( .NOT.( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) ) ) THEN + INFO = -1 + ELSE IF( M.LT.0 ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDB.LT.MAX( 1, M, N ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.MAX( 1, MN+MAX( MN, NRHS ) ) .AND. .NOT.LQUERY ) + $ THEN + INFO = -10 + END IF +* +* Figure out optimal block size and optimal workspace size +* + IF( INFO.EQ.0 .OR. INFO.EQ.-10 ) THEN +* + TPSD = .TRUE. + IF( LSAME( TRANS, 'N' ) ) + $ TPSD = .FALSE. 
+* + NB = ILAENV( 1, 'SGELST', ' ', M, N, -1, -1 ) +* + MNNRHS = MAX( MN, NRHS ) + LWOPT = MAX( 1, (MN+MNNRHS)*NB ) + WORK( 1 ) = REAL( LWOPT ) +* + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGELST ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + CALL SLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) + WORK( 1 ) = REAL( LWOPT ) + RETURN + END IF +* +* *GEQRT and *GELQT routines cannot accept NB larger than min(M,N) +* + IF( NB.GT.MN ) NB = MN +* +* Determine the block size from the supplied LWORK +* ( at this stage we know that LWORK >= (minimum required workspace, +* but it may be less than optimal) +* + NB = MIN( NB, LWORK/( MN + MNNRHS ) ) +* +* The minimum value of NB, when blocked code is used +* + NBMIN = MAX( 2, ILAENV( 2, 'SGELST', ' ', M, N, -1, -1 ) ) +* + IF( NB.LT.NBMIN ) THEN + NB = 1 + END IF +* +* Get machine parameters +* + SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) + BIGNUM = ONE / SMLNUM + CALL SLABAD( SMLNUM, BIGNUM ) +* +* Scale A, B if max element outside range [SMLNUM,BIGNUM] +* + ANRM = SLANGE( 'M', M, N, A, LDA, RWORK ) + IASCL = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL SLASCL( 'G', 0, 0, ANRM, SMLNUM, M, N, A, LDA, INFO ) + IASCL = 1 + ELSE IF( ANRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL SLASCL( 'G', 0, 0, ANRM, BIGNUM, M, N, A, LDA, INFO ) + IASCL = 2 + ELSE IF( ANRM.EQ.ZERO ) THEN +* +* Matrix all zero. Return zero solution. +* + CALL SLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) + WORK( 1 ) = REAL( LWOPT ) + RETURN + END IF +* + BROW = M + IF( TPSD ) + $ BROW = N + BNRM = SLANGE( 'M', BROW, NRHS, B, LDB, RWORK ) + IBSCL = 0 + IF( BNRM.GT.ZERO .AND. 
BNRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL SLASCL( 'G', 0, 0, BNRM, SMLNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 1 + ELSE IF( BNRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL SLASCL( 'G', 0, 0, BNRM, BIGNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 2 + END IF +* + IF( M.GE.N ) THEN +* +* M > N: +* Compute the blocked QR factorization of A, +* using the compact WY representation of Q, +* workspace at least N, optimally N*NB. +* + CALL SGEQRT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M > N, A is not transposed: +* Overdetermined system of equations, +* least-squares problem, min || A * X - B ||. +* +* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL SGEMQRT( 'Left', 'Transpose', M, NRHS, N, NB, A, LDA, + $ WORK( 1 ), NB, B, LDB, WORK( MN*NB+1 ), + $ INFO ) +* +* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) +* + CALL STRTRS( 'Upper', 'No transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = N +* + ELSE +* +* M > N, A is transposed: +* Underdetermined system of equations, +* minimum norm solution of A**T * X = B. +* +* Compute B := inv(R**T) * B in two row blocks of B. +* +* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) +* + CALL STRTRS( 'Upper', 'Transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the N-th row in B: +* B(N+1:M,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = N + 1, M + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. 
+* + CALL SGEMQRT( 'Left', 'No transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = M +* + END IF +* + ELSE +* +* M < N: +* Compute the blocked LQ factorization of A, +* using the compact WY representation of Q, +* workspace at least M, optimally M*NB. +* + CALL SGELQT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M < N, A is not transposed: +* Underdetermined system of equations, +* minimum norm solution of A * X = B. +* +* Compute B := inv(L) * B in two row blocks of B. +* +* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) +* + CALL STRTRS( 'Lower', 'No transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the M-th row in B: +* B(M+1:N,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = M + 1, N + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL SGEMLQT( 'Left', 'Transpose', N, NRHS, M, NB, A, LDA, + $ WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = N +* + ELSE +* +* M < N, A is transposed: +* Overdetermined system of equations, +* least-squares problem, min || A**T * X - B ||. +* +* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. 
+* + CALL SGEMLQT( 'Left', 'No transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1), INFO ) +* +* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) +* + CALL STRTRS( 'Lower', 'Transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = M +* + END IF +* + END IF +* +* Undo scaling +* + IF( IASCL.EQ.1 ) THEN + CALL SLASCL( 'G', 0, 0, ANRM, SMLNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IASCL.EQ.2 ) THEN + CALL SLASCL( 'G', 0, 0, ANRM, BIGNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF + IF( IBSCL.EQ.1 ) THEN + CALL SLASCL( 'G', 0, 0, SMLNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IBSCL.EQ.2 ) THEN + CALL SLASCL( 'G', 0, 0, BIGNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF +* + WORK( 1 ) = REAL( LWOPT ) +* + RETURN +* +* End of SGELST +* + END diff --git a/lapack-netlib/SRC/sgelsy.c b/lapack-netlib/SRC/sgelsy.c index 81f76c6600..25052cb7c4 100644 --- a/lapack-netlib/SRC/sgelsy.c +++ b/lapack-netlib/SRC/sgelsy.c @@ -721,7 +721,7 @@ f"> */ /* > G. Quintana-Orti, Depto. 
de Informatica, Universidad Jaime I, Spain \n */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgelsy_(integer *m, integer *n, integer *nrhs, real *a, +/* Subroutine */ void sgelsy_(integer *m, integer *n, integer *nrhs, real *a, integer *lda, real *b, integer *ldb, integer *jpvt, real *rcond, integer *rank, real *work, integer *lwork, integer *info) { @@ -733,17 +733,17 @@ f"> */ real anrm, bnrm, smin, smax; integer i__, j, iascl, ibscl, ismin, ismax; real c1, c2; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); real wsize, s1, s2; - extern /* Subroutine */ int strsm_(char *, char *, char *, char *, + extern /* Subroutine */ void strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ), slaic1_(integer *, integer *, real *, real *, real *, real *, real *, real *, real *), sgeqp3_( integer *, integer *, real *, integer *, integer *, real *, real * , integer *, integer *); integer nb; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); integer mn; extern real slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); @@ -751,14 +751,14 @@ f"> */ extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *); integer lwkmin, nb1, nb2, nb3, nb4; real sminpr, smaxpr, smlnum; integer lwkopt; logical lquery; - extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormqr_(char 
*, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *), sormrz_(char *, char *, integer *, integer *, integer *, integer *, real *, integer *, @@ -847,16 +847,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGELSY", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (mn == 0 || *nrhs == 0) { *rank = 0; - return 0; + return; } /* Get machine parameters */ @@ -1055,7 +1055,7 @@ f"> */ L70: work[1] = (real) lwkopt; - return 0; + return; /* End of SGELSY */ diff --git a/lapack-netlib/SRC/sgemlq.c b/lapack-netlib/SRC/sgemlq.c index eabe265348..0d17bbfb0e 100644 --- a/lapack-netlib/SRC/sgemlq.c +++ b/lapack-netlib/SRC/sgemlq.c @@ -673,7 +673,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgemlq_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void sgemlq_(char *side, char *trans, integer *m, integer *n, integer *k, real *a, integer *lda, real *t, integer *tsize, real *c__, integer *ldc, real *work, integer *lwork, integer *info) { @@ -682,7 +682,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ logical left, tran; - extern /* Subroutine */ int slamswlq_(char *, char *, integer *, integer * + extern /* Subroutine */ void slamswlq_(char *, char *, integer *, integer * , integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *, integer *); extern logical lsame_(char *, char *); @@ -690,7 +690,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ integer mb, nb, mn, lw, nblcks; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical notran, lquery; - extern /* Subroutine */ int sgemlqt_(char *, char 
*, integer *, integer *, + extern /* Subroutine */ void sgemlqt_(char *, char *, integer *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *); @@ -771,9 +771,9 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("SGEMLQ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -781,7 +781,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Computing MIN */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*k) == 0) { - return 0; + return; } /* Computing MAX */ @@ -797,7 +797,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ work[1] = (real) lw; - return 0; + return; /* End of SGEMLQ */ diff --git a/lapack-netlib/SRC/sgemlqt.c b/lapack-netlib/SRC/sgemlqt.c index aee0ead05c..f5c9609f12 100644 --- a/lapack-netlib/SRC/sgemlqt.c +++ b/lapack-netlib/SRC/sgemlqt.c @@ -658,7 +658,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doubleGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgemlqt_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void sgemlqt_(char *side, char *trans, integer *m, integer *n, integer *k, integer *mb, real *v, integer *ldv, real *t, integer * ldt, real *c__, integer *ldc, real *work, integer *info) { @@ -672,9 +672,10 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern logical lsame_(char *, char *); logical right; integer ib, kf; - extern /* Subroutine */ int slarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void slarfb_(char *, char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, - integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *, ftnlen); + 
integer *, real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical notran; integer ldwork; @@ -736,12 +737,12 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("SGEMLQT", &i__1, (ftnlen)7); - return 0; + return; } if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && notran) { @@ -802,7 +803,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of SGEMLQT */ diff --git a/lapack-netlib/SRC/sgemqr.c b/lapack-netlib/SRC/sgemqr.c index 8cb3e28420..3788ab32e2 100644 --- a/lapack-netlib/SRC/sgemqr.c +++ b/lapack-netlib/SRC/sgemqr.c @@ -675,7 +675,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgemqr_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void sgemqr_(char *side, char *trans, integer *m, integer *n, integer *k, real *a, integer *lda, real *t, integer *tsize, real *c__, integer *ldc, real *work, integer *lwork, integer *info) { @@ -684,7 +684,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ logical left, tran; - extern /* Subroutine */ int slamtsqr_(char *, char *, integer *, integer * + extern /* Subroutine */ void slamtsqr_(char *, char *, integer *, integer * , integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *, integer *); extern logical lsame_(char *, char *); @@ -692,7 +692,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ integer mb, nb, mn, lw, nblcks; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical notran, lquery; - extern /* Subroutine */ int sgemqrt_(char *, char *, integer *, integer *, + extern /* 
Subroutine */ void sgemqrt_(char *, char *, integer *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *); @@ -773,9 +773,9 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("SGEMQR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -783,7 +783,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Computing MIN */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*k) == 0) { - return 0; + return; } /* Computing MAX */ @@ -799,7 +799,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ work[1] = (real) lw; - return 0; + return; /* End of SGEMQR */ diff --git a/lapack-netlib/SRC/sgemqrt.c b/lapack-netlib/SRC/sgemqrt.c index f7adc3fd8d..b078373ca2 100644 --- a/lapack-netlib/SRC/sgemqrt.c +++ b/lapack-netlib/SRC/sgemqrt.c @@ -676,7 +676,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup realGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgemqrt_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void sgemqrt_(char *side, char *trans, integer *m, integer *n, integer *k, integer *nb, real *v, integer *ldv, real *t, integer * ldt, real *c__, integer *ldc, real *work, integer *info) { @@ -690,9 +690,10 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern logical lsame_(char *, char *); logical right; integer ib, kf; - extern /* Subroutine */ int slarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void slarfb_(char *, char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, - integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, real *, integer *, real *, integer 
*); + extern int xerbla_(char *, integer *, ftnlen); logical notran; integer ldwork; @@ -756,12 +757,12 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("SGEMQRT", &i__1, (ftnlen)7); - return 0; + return; } if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && tran) { @@ -822,7 +823,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of SGEMQRT */ diff --git a/lapack-netlib/SRC/sgeql2.c b/lapack-netlib/SRC/sgeql2.c index c094058a83..9cc40ab858 100644 --- a/lapack-netlib/SRC/sgeql2.c +++ b/lapack-netlib/SRC/sgeql2.c @@ -637,7 +637,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgeql2_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgeql2_(integer *m, integer *n, real *a, integer *lda, real *tau, real *work, integer *info) { /* System generated locals */ @@ -645,10 +645,10 @@ f"> */ /* Local variables */ integer i__, k; - extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, - integer *, real *, real *, integer *, real *), xerbla_( - char *, integer *, ftnlen), slarfg_(integer *, real *, real *, - integer *, real *); + extern /* Subroutine */ void slarf_(char *, integer *, integer *, real *, + integer *, real *, real *, integer *, real *); + extern int xerbla_(char *, integer *, ftnlen); + extern void slarfg_(integer *, real *, real *, integer *, real *); real aii; @@ -682,7 +682,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGEQL2", &i__1, (ftnlen)6); - return 0; + return; } k = f2cmin(*m,*n); @@ -707,7 +707,7 @@ f"> */ a[*m - k + i__ + (*n - k + i__) * a_dim1] = aii; /* L10: */ } - return 0; + return; /* End of SGEQL2 */ diff --git a/lapack-netlib/SRC/sgeqlf.c b/lapack-netlib/SRC/sgeqlf.c index eccd5a7e0c..9a20d6c1bd 100644 --- a/lapack-netlib/SRC/sgeqlf.c +++ 
b/lapack-netlib/SRC/sgeqlf.c @@ -654,7 +654,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgeqlf_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgeqlf_(integer *m, integer *n, real *a, integer *lda, real *tau, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -662,15 +662,16 @@ f"> */ /* Local variables */ integer i__, k, nbmin, iinfo; - extern /* Subroutine */ int sgeql2_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgeql2_(integer *, integer *, real *, integer *, real *, real *, integer *); integer ib, nb, ki, kk, mu, nu, nx; - extern /* Subroutine */ int slarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void slarfb_(char *, char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, - integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void slarft_(char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *); integer ldwork, lwkopt; logical lquery; @@ -725,15 +726,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGEQLF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (k == 0) { - return 0; + return; } nbmin = 2; @@ -825,7 +826,7 @@ f"> */ } work[1] = (real) iws; - return 0; + return; /* End of SGEQLF */ diff --git a/lapack-netlib/SRC/sgeqp3.c b/lapack-netlib/SRC/sgeqp3.c index 0c1af70343..49c3591711 100644 --- a/lapack-netlib/SRC/sgeqp3.c +++ b/lapack-netlib/SRC/sgeqp3.c @@ -667,7 +667,7 @@ f"> */ /* > X. 
Sun, Computer Science Dept., Duke University, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgeqp3_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgeqp3_(integer *m, integer *n, real *a, integer *lda, integer *jpvt, real *tau, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -677,22 +677,22 @@ f"> */ integer nfxd; extern real snrm2_(integer *, real *, integer *); integer j, nbmin, minmn, minws; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *), slaqp2_(integer *, integer *, integer *, real *, integer *, integer *, real *, real *, real *, real *); integer jb, na, nb, sm, sn, nx; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int sgeqrf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgeqrf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *); integer topbmn, sminmn; - extern /* Subroutine */ int slaqps_(integer *, integer *, integer *, + extern /* Subroutine */ void slaqps_(integer *, integer *, integer *, integer *, integer *, real *, integer *, integer *, real *, real * , real *, real *, real *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); integer fjb, iws; @@ -749,9 +749,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGEQP3", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Move initial columns up front. 
*/ @@ -905,7 +905,7 @@ f"> */ } work[1] = (real) iws; - return 0; + return; /* End of SGEQP3 */ diff --git a/lapack-netlib/SRC/sgeqr.c b/lapack-netlib/SRC/sgeqr.c index 14b4641354..6cf01efa4e 100644 --- a/lapack-netlib/SRC/sgeqr.c +++ b/lapack-netlib/SRC/sgeqr.c @@ -683,7 +683,7 @@ static integer c__2 = 2; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgeqr_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgeqr_(integer *m, integer *n, real *a, integer *lda, real *t, integer *tsize, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -695,11 +695,11 @@ static integer c__2 = 2; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int sgeqrt_(integer *, integer *, integer *, real + extern /* Subroutine */ void sgeqrt_(integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *); logical lminws, lquery; integer mintsz; - extern /* Subroutine */ int slatsqr_(integer *, integer *, integer *, + extern /* Subroutine */ void slatsqr_(integer *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer * , integer *); @@ -825,15 +825,15 @@ static integer c__2 = 2; if (*info != 0) { i__1 = -(*info); xerbla_("SGEQR", &i__1, (ftnlen)5); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* The QR Decomposition */ @@ -849,7 +849,7 @@ static integer c__2 = 2; i__1 = 1, i__2 = nb * *n; work[1] = (real) f2cmax(i__1,i__2); - return 0; + return; /* End of SGEQR */ diff --git a/lapack-netlib/SRC/sgeqr2.c b/lapack-netlib/SRC/sgeqr2.c index 43408f514a..92b20913bf 100644 --- a/lapack-netlib/SRC/sgeqr2.c +++ b/lapack-netlib/SRC/sgeqr2.c @@ -644,7 +644,7 @@ f"> */ 
/* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgeqr2_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgeqr2_(integer *m, integer *n, real *a, integer *lda, real *tau, real *work, integer *info) { /* System generated locals */ @@ -652,10 +652,10 @@ f"> */ /* Local variables */ integer i__, k; - extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, - integer *, real *, real *, integer *, real *), xerbla_( - char *, integer *, ftnlen), slarfg_(integer *, real *, real *, - integer *, real *); + extern /* Subroutine */ void slarf_(char *, integer *, integer *, real *, + integer *, real *, real *, integer *, real *); + extern int xerbla_(char *, integer *, ftnlen); + extern void slarfg_(integer *, real *, real *, integer *, real *); real aii; @@ -689,7 +689,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGEQR2", &i__1, (ftnlen)6); - return 0; + return; } k = f2cmin(*m,*n); @@ -718,7 +718,7 @@ f"> */ } /* L10: */ } - return 0; + return; /* End of SGEQR2 */ diff --git a/lapack-netlib/SRC/sgeqr2p.c b/lapack-netlib/SRC/sgeqr2p.c index 58accdd845..8f39af9d98 100644 --- a/lapack-netlib/SRC/sgeqr2p.c +++ b/lapack-netlib/SRC/sgeqr2p.c @@ -648,7 +648,7 @@ l elements using an unblocked algorithm. */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgeqr2p_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgeqr2p_(integer *m, integer *n, real *a, integer *lda, real *tau, real *work, integer *info) { /* System generated locals */ @@ -656,11 +656,11 @@ l elements using an unblocked algorithm. 
*/ /* Local variables */ integer i__, k; - extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, - integer *, real *, real *, integer *, real *), xerbla_( - char *, integer *, ftnlen); + extern /* Subroutine */ void slarf_(char *, integer *, integer *, real *, + integer *, real *, real *, integer *, real *); + extern int xerbla_(char *, integer *, ftnlen); real aii; - extern /* Subroutine */ int slarfgp_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slarfgp_(integer *, real *, real *, integer *, real *); @@ -694,7 +694,7 @@ l elements using an unblocked algorithm. */ if (*info != 0) { i__1 = -(*info); xerbla_("SGEQR2P", &i__1, (ftnlen)7); - return 0; + return; } k = f2cmin(*m,*n); @@ -723,7 +723,7 @@ l elements using an unblocked algorithm. */ } /* L10: */ } - return 0; + return; /* End of SGEQR2P */ diff --git a/lapack-netlib/SRC/sgeqrf.c b/lapack-netlib/SRC/sgeqrf.c index 5de1edaed9..61ed86dd8d 100644 --- a/lapack-netlib/SRC/sgeqrf.c +++ b/lapack-netlib/SRC/sgeqrf.c @@ -661,7 +661,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgeqrf_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgeqrf_(integer *m, integer *n, real *a, integer *lda, real *tau, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -669,15 +669,16 @@ f"> */ /* Local variables */ integer i__, k, nbmin, iinfo; - extern /* Subroutine */ int sgeqr2_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgeqr2_(integer *, integer *, real *, integer *, real *, real *, integer *); integer ib, nb, nx; - extern /* Subroutine */ int slarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void slarfb_(char *, char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, - integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, real *, 
integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void slarft_(char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *); integer ldwork, lwkopt; logical lquery; @@ -721,9 +722,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGEQRF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -731,7 +732,7 @@ f"> */ k = f2cmin(*m,*n); if (k == 0) { work[1] = 1.f; - return 0; + return; } nbmin = 2; @@ -816,7 +817,7 @@ f"> */ } work[1] = (real) iws; - return 0; + return; /* End of SGEQRF */ diff --git a/lapack-netlib/SRC/sgeqrf.f b/lapack-netlib/SRC/sgeqrf.f index f47d8bf322..b24615f7a1 100644 --- a/lapack-netlib/SRC/sgeqrf.f +++ b/lapack-netlib/SRC/sgeqrf.f @@ -204,7 +204,7 @@ SUBROUTINE SGEQRF( M, N, A, LDA, TAU, WORK, LWORK, INFO ) END IF * * Quick return if possible -* +* IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN diff --git a/lapack-netlib/SRC/sgeqrfp.c b/lapack-netlib/SRC/sgeqrfp.c index 48f352e4e1..e63aa12b5a 100644 --- a/lapack-netlib/SRC/sgeqrfp.c +++ b/lapack-netlib/SRC/sgeqrfp.c @@ -665,7 +665,7 @@ static integer c__2 = 2; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgeqrfp_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgeqrfp_(integer *m, integer *n, real *a, integer *lda, real *tau, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -673,16 +673,17 @@ static integer c__2 = 2; /* Local variables */ integer i__, k, nbmin, iinfo, ib, nb, nx; - extern /* Subroutine */ int slarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void slarfb_(char *, char *, char *, char *, integer *, integer 
*, integer *, real *, integer *, real *, - integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void slarft_(char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *); integer ldwork, lwkopt; logical lquery; - extern /* Subroutine */ int sgeqr2p_(integer *, integer *, real *, + extern /* Subroutine */ void sgeqr2p_(integer *, integer *, real *, integer *, real *, real *, integer *); integer iws; @@ -724,9 +725,9 @@ static integer c__2 = 2; if (*info != 0) { i__1 = -(*info); xerbla_("SGEQRFP", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -734,7 +735,7 @@ static integer c__2 = 2; k = f2cmin(*m,*n); if (k == 0) { work[1] = 1.f; - return 0; + return; } nbmin = 2; @@ -819,7 +820,7 @@ static integer c__2 = 2; } work[1] = (real) iws; - return 0; + return; /* End of SGEQRFP */ diff --git a/lapack-netlib/SRC/sgeqrt.c b/lapack-netlib/SRC/sgeqrt.c index ba72c05589..44b22b8bb8 100644 --- a/lapack-netlib/SRC/sgeqrt.c +++ b/lapack-netlib/SRC/sgeqrt.c @@ -650,7 +650,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgeqrt_(integer *m, integer *n, integer *nb, real *a, +/* Subroutine */ void sgeqrt_(integer *m, integer *n, integer *nb, real *a, integer *lda, real *t, integer *ldt, real *work, integer *info) { /* System generated locals */ @@ -658,9 +658,11 @@ f"> */ /* Local variables */ integer i__, k, iinfo, ib; - extern /* Subroutine */ int slarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void slarfb_(char *, char *, char *, char *, 
integer *, integer *, integer *, real *, integer *, real *, - integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *, ftnlen), sgeqrt2_( + integer *, real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void sgeqrt2_( integer *, integer *, real *, integer *, real *, integer *, integer *), sgeqrt3_(integer *, integer *, real *, integer *, real *, integer *, integer *); @@ -702,14 +704,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGEQRT", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ k = f2cmin(*m,*n); if (k == 0) { - return 0; + return; } /* Blocked loop of length K */ @@ -744,7 +746,7 @@ f"> */ ib) * a_dim1], lda, &work[1], &i__5); } } - return 0; + return; /* End of SGEQRT */ diff --git a/lapack-netlib/SRC/sgeqrt2.c b/lapack-netlib/SRC/sgeqrt2.c index c229ac8d58..0276162039 100644 --- a/lapack-netlib/SRC/sgeqrt2.c +++ b/lapack-netlib/SRC/sgeqrt2.c @@ -643,21 +643,22 @@ presentation of Q. 
*/ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgeqrt2_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgeqrt2_(integer *m, integer *n, real *a, integer *lda, real *t, integer *ldt, integer *info) { /* System generated locals */ integer a_dim1, a_offset, t_dim1, t_offset, i__1, i__2, i__3; /* Local variables */ - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); integer i__, k; real alpha; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), strmv_(char *, char *, char *, integer *, real *, - integer *, real *, integer *), xerbla_( - char *, integer *, ftnlen), slarfg_(integer *, real *, real *, + integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void slarfg_(integer *, real *, real *, integer *, real *); real aii; @@ -695,7 +696,7 @@ presentation of Q. */ if (*info != 0) { i__1 = -(*info); xerbla_("SGEQRT2", &i__1, (ftnlen)7); - return 0; + return; } k = f2cmin(*m,*n); @@ -764,6 +765,6 @@ presentation of Q. */ /* End of SGEQRT2 */ - return 0; + return; } /* sgeqrt2_ */ diff --git a/lapack-netlib/SRC/sgeqrt3.c b/lapack-netlib/SRC/sgeqrt3.c index 0dbbe4337f..0de1ac3aba 100644 --- a/lapack-netlib/SRC/sgeqrt3.c +++ b/lapack-netlib/SRC/sgeqrt3.c @@ -648,7 +648,7 @@ ompact WY representation of Q. 
*/ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgeqrt3_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgeqrt3_(integer *m, integer *n, real *a, integer *lda, real *t, integer *ldt, integer *info) { /* System generated locals */ @@ -656,13 +656,15 @@ ompact WY representation of Q. */ /* Local variables */ integer i__, j, iinfo; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer i1, j1, n1, n2; - extern /* Subroutine */ int strmm_(char *, char *, char *, char *, + extern /* Subroutine */ void strmm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * - ), xerbla_(char *, integer *, ftnlen), slarfg_(integer *, real *, real *, integer *, real *); + ); + extern int xerbla_(char *, integer *, ftnlen); + extern void slarfg_(integer *, real *, real *, integer *, real *); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -696,7 +698,7 @@ ompact WY representation of Q. */ if (*info != 0) { i__1 = -(*info); xerbla_("SGEQRT3", &i__1, (ftnlen)7); - return 0; + return; } if (*n == 1) { @@ -794,7 +796,7 @@ ompact WY representation of Q. 
*/ } - return 0; + return; /* End of SGEQRT3 */ diff --git a/lapack-netlib/SRC/sgerfs.c b/lapack-netlib/SRC/sgerfs.c index 7ae85ba960..fd0aa04311 100644 --- a/lapack-netlib/SRC/sgerfs.c +++ b/lapack-netlib/SRC/sgerfs.c @@ -699,7 +699,7 @@ f"> */ /* > \ingroup realGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgerfs_(char *trans, integer *n, integer *nrhs, real *a, +/* Subroutine */ void sgerfs_(char *trans, integer *n, integer *nrhs, real *a, integer *lda, real *af, integer *ldaf, integer *ipiv, real *b, integer *ldb, real *x, integer *ldx, real *ferr, real *berr, real * work, integer *iwork, integer *info) @@ -716,10 +716,10 @@ f"> */ real s; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer count; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *), slacn2_(integer *, real *, real *, integer *, real *, integer *, integer *); @@ -787,7 +787,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGERFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -799,7 +799,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } if (notran) { @@ -995,7 +995,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of SGERFS */ diff --git a/lapack-netlib/SRC/sgerfsx.c b/lapack-netlib/SRC/sgerfsx.c index 6474ba2631..3cd1abbc49 100644 --- a/lapack-netlib/SRC/sgerfsx.c +++ b/lapack-netlib/SRC/sgerfsx.c @@ -926,7 +926,7 @@ static integer c__1 = 1; /* > \ingroup realGEcomputational */ /* ===================================================================== */ -/* Subroutine */ 
int sgerfsx_(char *trans, char *equed, integer *n, integer * +/* Subroutine */ void sgerfsx_(char *trans, char *equed, integer *n, integer * nrhs, real *a, integer *lda, real *af, integer *ldaf, integer *ipiv, real *r__, real *c__, real *b, integer *ldb, real *x, integer *ldx, real *rcond, real *berr, integer *n_err_bnds__, real *err_bnds_norm__, @@ -941,7 +941,7 @@ static integer c__1 = 1; /* Local variables */ real illrcond_thresh__, unstable_thresh__; - extern /* Subroutine */ int sla_gerfsx_extended_(integer *, integer *, + extern /* Subroutine */ void sla_gerfsx_extended_(integer *, integer *, integer *, integer *, real *, integer *, real *, integer *, integer *, logical *, real *, real *, integer *, real *, integer * , real *, integer *, real *, real *, real *, real *, real *, real @@ -957,7 +957,8 @@ static integer c__1 = 1; integer prec_type__; extern real slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), sgecon_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void sgecon_( char *, integer *, real *, integer *, real *, real *, real *, integer *, integer *); logical colequ, notran, rowequ; @@ -1080,7 +1081,7 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("SGERFSX", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible. */ @@ -1103,7 +1104,7 @@ static integer c__1 = 1; err_bnds_comp__[j + err_bnds_comp_dim1 * 3] = 1.f; } } - return 0; + return; } /* Default to failure. 
*/ @@ -1262,7 +1263,7 @@ static integer c__1 = 1; } } - return 0; + return; /* End of SGERFSX */ diff --git a/lapack-netlib/SRC/sgerq2.c b/lapack-netlib/SRC/sgerq2.c index e19297748b..e3943b54f7 100644 --- a/lapack-netlib/SRC/sgerq2.c +++ b/lapack-netlib/SRC/sgerq2.c @@ -633,7 +633,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgerq2_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgerq2_(integer *m, integer *n, real *a, integer *lda, real *tau, real *work, integer *info) { /* System generated locals */ @@ -641,9 +641,10 @@ f"> */ /* Local variables */ integer i__, k; - extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, - integer *, real *, real *, integer *, real *), xerbla_( - char *, integer *, ftnlen), slarfg_(integer *, real *, real *, + extern /* Subroutine */ void slarf_(char *, integer *, integer *, real *, + integer *, real *, real *, integer *, real *); + extern int xerbla_(char *, integer *, ftnlen); + extern void slarfg_(integer *, real *, real *, integer *, real *); real aii; @@ -678,7 +679,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGERQ2", &i__1, (ftnlen)6); - return 0; + return; } k = f2cmin(*m,*n); @@ -703,7 +704,7 @@ f"> */ a[*m - k + i__ + (*n - k + i__) * a_dim1] = aii; /* L10: */ } - return 0; + return; /* End of SGERQ2 */ diff --git a/lapack-netlib/SRC/sgerqf.c b/lapack-netlib/SRC/sgerqf.c index fbc5b12cef..0058662691 100644 --- a/lapack-netlib/SRC/sgerqf.c +++ b/lapack-netlib/SRC/sgerqf.c @@ -654,7 +654,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgerqf_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgerqf_(integer *m, integer *n, real *a, integer *lda, real *tau, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -662,15 +662,16 @@ f"> */ /* Local 
variables */ integer i__, k, nbmin, iinfo; - extern /* Subroutine */ int sgerq2_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgerq2_(integer *, integer *, real *, integer *, real *, real *, integer *); integer ib, nb, ki, kk, mu, nu, nx; - extern /* Subroutine */ int slarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void slarfb_(char *, char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, - integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void slarft_(char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *); integer ldwork, lwkopt; logical lquery; @@ -728,15 +729,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGERQF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (k == 0) { - return 0; + return; } nbmin = 2; @@ -827,7 +828,7 @@ f"> */ } work[1] = (real) iws; - return 0; + return; /* End of SGERQF */ diff --git a/lapack-netlib/SRC/sgesc2.c b/lapack-netlib/SRC/sgesc2.c index 3d5327986d..62936e05eb 100644 --- a/lapack-netlib/SRC/sgesc2.c +++ b/lapack-netlib/SRC/sgesc2.c @@ -629,7 +629,7 @@ f"> */ /* > Umea University, S-901 87 Umea, Sweden. 
*/ /* ===================================================================== */ -/* Subroutine */ int sgesc2_(integer *n, real *a, integer *lda, real *rhs, +/* Subroutine */ void sgesc2_(integer *n, real *a, integer *lda, real *rhs, integer *ipiv, integer *jpiv, real *scale) { /* System generated locals */ @@ -639,7 +639,7 @@ f"> */ /* Local variables */ real temp; integer i__, j; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), slabad_(real *, real *); extern real slamch_(char *); real bignum; @@ -720,7 +720,7 @@ f"> */ i__1 = *n - 1; slaswp_(&c__1, &rhs[1], lda, &c__1, &i__1, &jpiv[1], &c_n1); - return 0; + return; /* End of SGESC2 */ diff --git a/lapack-netlib/SRC/sgesdd.c b/lapack-netlib/SRC/sgesdd.c index c92d177c5c..9951ca4bde 100644 --- a/lapack-netlib/SRC/sgesdd.c +++ b/lapack-netlib/SRC/sgesdd.c @@ -734,7 +734,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgesdd_(char *jobz, integer *m, integer *n, real *a, +/* Subroutine */ void sgesdd_(char *jobz, integer *m, integer *n, real *a, integer *lda, real *s, real *u, integer *ldu, real *vt, integer *ldvt, real *work, integer *lwork, integer *iwork, integer *info) { @@ -752,7 +752,7 @@ f"> */ lwork_sormbr_prt_mn__, lwork_sormbr_prt_nn__, i__; extern logical lsame_(char *, char *); integer chunk; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer minmn, wrkbl, itaup, itauq, mnthr; @@ -760,17 +760,17 @@ f"> */ integer nwork; logical wntqn, wntqo, wntqs; integer ie, il, ir, bdspac, iu, lwork_sorgbr_p_mm__; - extern /* Subroutine */ int sbdsdc_(char *, char *, integer *, real *, + extern /* Subroutine */ void 
sbdsdc_(char *, char *, integer *, real *, real *, real *, integer *, real *, integer *, real *, integer *, real *, integer *, integer *); integer lwork_sorgbr_q_nn__; - extern /* Subroutine */ int sgebrd_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgebrd_(integer *, integer *, real *, integer *, real *, real *, real *, real *, real *, integer *, integer *); extern real slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; - extern /* Subroutine */ int sgelqf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgelqf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *), slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *), sgeqrf_(integer *, integer *, real @@ -778,19 +778,19 @@ f"> */ *, integer *, integer *, real *, integer *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *); extern logical sisnan_(real *); - extern /* Subroutine */ int sorgbr_(char *, integer *, integer *, integer + extern /* Subroutine */ void sorgbr_(char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *); integer ldwrkl; - extern /* Subroutine */ int sormbr_(char *, char *, char *, integer *, + extern /* Subroutine */ void sormbr_(char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer * , real *, integer *, integer *); integer ldwrkr, minwrk, ldwrku, maxwrk; - extern /* Subroutine */ int sorglq_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorglq_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *); integer ldwkvt; real smlnum; logical wntqas; - extern /* Subroutine */ int sorgqr_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorgqr_(integer *, integer *, 
integer *, real *, integer *, real *, real *, integer *, integer *); logical lquery; integer blk; @@ -1236,15 +1236,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGESDD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1258,7 +1258,7 @@ f"> */ anrm = slange_("M", m, n, &a[a_offset], lda, dum); if (sisnan_(&anrm)) { *info = -4; - return 0; + return; } iscl = 0; if (anrm > 0.f && anrm < smlnum) { @@ -2281,7 +2281,7 @@ f"> */ work[1] = (real) maxwrk; - return 0; + return; /* End of SGESDD */ diff --git a/lapack-netlib/SRC/sgesv.c b/lapack-netlib/SRC/sgesv.c index dcb46aa2aa..ac0b02e2fc 100644 --- a/lapack-netlib/SRC/sgesv.c +++ b/lapack-netlib/SRC/sgesv.c @@ -639,7 +639,8 @@ e driver) */ integer a_dim1, a_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), sgetrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void sgetrf_( integer *, integer *, real *, integer *, integer *, integer *), sgetrs_(char *, integer *, integer *, real *, integer *, integer * , real *, integer *, integer *); diff --git a/lapack-netlib/SRC/sgesvd.c b/lapack-netlib/SRC/sgesvd.c index b4c5a0adba..eee882dde5 100644 --- a/lapack-netlib/SRC/sgesvd.c +++ b/lapack-netlib/SRC/sgesvd.c @@ -729,7 +729,7 @@ f"> */ /* > \ingroup realGEsing */ /* ===================================================================== */ -/* Subroutine */ int sgesvd_(char *jobu, char *jobvt, integer *m, integer *n, +/* Subroutine */ void sgesvd_(char *jobu, char *jobvt, integer *m, integer *n, real *a, integer *lda, real *s, real *u, integer *ldu, real *vt, integer *ldvt, real *work, integer *lwork, integer *info) { @@ -746,13 +746,13 @@ f"> */ lwork_sgeqrf__, i__; extern logical lsame_(char *, char *); integer chunk; - extern /* Subroutine */ int sgemm_(char *, char *, 
integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer minmn, wrkbl, itaup, itauq, mnthr, iwork; logical wntua, wntva, wntun, wntuo, wntvn, wntvo, wntus, wntvs; integer ie, ir, bdspac, iu; - extern /* Subroutine */ int sgebrd_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgebrd_(integer *, integer *, real *, integer *, real *, real *, real *, real *, real *, integer *, integer *); extern real slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); @@ -760,7 +760,7 @@ f"> */ extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); real bignum; - extern /* Subroutine */ int sgelqf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgelqf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *), slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *), sgeqrf_(integer *, integer *, real @@ -774,10 +774,10 @@ f"> */ char *, integer *, integer *, integer *, real *, integer *, real * , real *, integer *, real *, integer *, integer *); integer ldwrkr, minwrk, ldwrku, maxwrk; - extern /* Subroutine */ int sorglq_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorglq_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *); real smlnum; - extern /* Subroutine */ int sorgqr_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorgqr_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *); logical lquery, wntuas, wntvas; integer blk, lwork_sorgbr_p__, lwork_sorgbr_q__, lwork_sorglq_m__, @@ -1378,15 +1378,15 @@ f"> */ if (*info != 0) { i__2 = -(*info); xerbla_("SGESVD", &i__2, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + 
return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Get machine constants */ @@ -4587,7 +4587,7 @@ f"> */ work[1] = (real) maxwrk; - return 0; + return; /* End of SGESVD */ diff --git a/lapack-netlib/SRC/sgesvdq.c b/lapack-netlib/SRC/sgesvdq.c index d97ad108e2..4c6bc8beac 100644 --- a/lapack-netlib/SRC/sgesvdq.c +++ b/lapack-netlib/SRC/sgesvdq.c @@ -932,7 +932,7 @@ static logical c_false = FALSE_; /* > \ingroup realGEsing */ /* ===================================================================== */ -/* Subroutine */ int sgesvdq_(char *joba, char *jobp, char *jobr, char *jobu, +/* Subroutine */ void sgesvdq_(char *joba, char *jobp, char *jobr, char *jobu, char *jobv, integer *m, integer *n, real *a, integer *lda, real *s, real *u, integer *ldu, real *v, integer *ldv, integer *numrank, integer *iwork, integer *liwork, real *work, integer *lwork, real * @@ -954,7 +954,7 @@ static logical c_false = FALSE_; integer p, q; logical conda; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer iwoff; logical lsvec; real sfmin, epsln; @@ -964,18 +964,19 @@ static logical c_false = FALSE_; logical dntwu, dntwv, wntua; integer lworq; logical wntuf, wntva, wntur, wntus, wntvr; - extern /* Subroutine */ int sgeqp3_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgeqp3_(integer *, integer *, real *, integer *, integer *, real *, real *, integer *, integer *); integer lwsvd2, lworq2, nr; real sconda; extern real slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), sgelqf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void sgelqf_( integer *, integer *, real *, integer *, real *, real *, integer * , integer *), slascl_(char *, integer *, integer *, 
real *, real * , integer *, integer *, real *, integer *, integer *); extern integer isamax_(integer *, real *, integer *); - extern /* Subroutine */ int sgeqrf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgeqrf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *), sgesvd_(char *, char *, integer *, integer *, real *, integer *, real *, real *, integer * , real *, integer *, real *, integer *, integer *) @@ -990,12 +991,12 @@ static logical c_false = FALSE_; extern /* Subroutine */ int slaswp_(integer *, real *, integer *, integer *, integer *, integer *, integer *); real rdummy[1]; - extern /* Subroutine */ int sormlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormlq_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); logical lquery; integer lwunlq; - extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); integer optwrk; @@ -1376,7 +1377,7 @@ static logical c_false = FALSE_; if (*info != 0) { i__1 = -(*info); xerbla_("SGESVDQ", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { /* Return optimal workspace */ @@ -1385,13 +1386,13 @@ static logical c_false = FALSE_; work[1] = (real) optwrk; work[2] = (real) minwrk; rwork[1] = (real) rminwrk; - return 0; + return; } /* Quick return if the matrix is void. 
*/ if (*m == 0 || *n == 0) { - return 0; + return; } big = slamch_("O"); @@ -1410,7 +1411,7 @@ static logical c_false = FALSE_; *info = -8; i__2 = -(*info); xerbla_("SGESVDQ", &i__2, (ftnlen)7); - return 0; + return; } /* L1904: */ } @@ -1461,7 +1462,7 @@ static logical c_false = FALSE_; rwork[1] = -1.f; } rwork[2] = -1.f; - return 0; + return; } if (rwork[1] > big / sqrt((real) (*m))) { @@ -1485,7 +1486,7 @@ static logical c_false = FALSE_; *info = -8; i__1 = -(*info); xerbla_("SGESVDQ", &i__1, (ftnlen)7); - return 0; + return; } if (rtmp > big / sqrt((real) (*m))) { /* matrix by 1/sqrt(M) if too large entry detected */ @@ -2240,7 +2241,7 @@ static logical c_false = FALSE_; /* full row rank triangular (trapezoidal) factor of A. */ *numrank = nr; - return 0; + return; /* End of SGESVDQ */ diff --git a/lapack-netlib/SRC/sgesvdx.c b/lapack-netlib/SRC/sgesvdx.c index a90e8ba076..1417b1e80b 100644 --- a/lapack-netlib/SRC/sgesvdx.c +++ b/lapack-netlib/SRC/sgesvdx.c @@ -779,7 +779,7 @@ static real c_b109 = 0.f; /* > \ingroup realGEsing */ /* ===================================================================== */ -/* Subroutine */ int sgesvdx_(char *jobu, char *jobvt, char *range, integer * +/* Subroutine */ void sgesvdx_(char *jobu, char *jobvt, char *range, integer * m, integer *n, real *a, integer *lda, real *vl, real *vu, integer *il, integer *iu, integer *ns, real *s, real *u, integer *ldu, real *vt, integer *ldvt, real *work, integer *lwork, integer *iwork, integer * @@ -802,11 +802,11 @@ static real c_b109 = 0.f; integer i__, j; extern logical lsame_(char *, char *); integer iltgk, itemp, minmn, itaup, itauq, iutgk, itgkz, mnthr; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); logical wantu; integer id, ie; - extern /* Subroutine */ int sgebrd_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgebrd_(integer *, integer *, real *, 
integer *, real *, real *, real *, real *, real *, integer *, integer *); extern real slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); @@ -814,30 +814,30 @@ static real c_b109 = 0.f; extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); real bignum; - extern /* Subroutine */ int sgelqf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgelqf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *), slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); real abstol; - extern /* Subroutine */ int sgeqrf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgeqrf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); char rngtgk[1]; - extern /* Subroutine */ int slaset_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slaset_(char *, integer *, integer *, real *, real *, real *, integer *), sormbr_(char *, char *, char * , integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); integer minwrk, maxwrk; real smlnum; - extern /* Subroutine */ int sormlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormlq_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); logical lquery, wantvt; - extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); real dum[1], eps; - extern /* Subroutine */ int sbdsvdx_(char *, char *, char *, integer *, + extern /* Subroutine */ void sbdsvdx_(char *, char *, char *, integer *, real *, real *, real 
*, real *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, integer *); @@ -1065,15 +1065,15 @@ static real c_b109 = 0.f; if (*info != 0) { i__2 = -(*info); xerbla_("SGESVDX", &i__2, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Set singular values indices accord to RANGE. */ @@ -1455,7 +1455,7 @@ static real c_b109 = 0.f; work[1] = (real) maxwrk; - return 0; + return; /* End of SGESVDX */ diff --git a/lapack-netlib/SRC/sgesvj.c b/lapack-netlib/SRC/sgesvj.c index 38a14baf30..ab5fa21caf 100644 --- a/lapack-netlib/SRC/sgesvj.c +++ b/lapack-netlib/SRC/sgesvj.c @@ -839,7 +839,7 @@ f"> */ /* > drmac@math.hr. Thank you. */ /* ===================================================================== */ -/* Subroutine */ int sgesvj_(char *joba, char *jobu, char *jobv, integer *m, +/* Subroutine */ void sgesvj_(char *joba, char *jobu, char *jobv, integer *m, integer *n, real *a, integer *lda, real *sva, integer *mv, real *v, integer *ldv, real *work, integer *lwork, integer *info) { @@ -859,23 +859,23 @@ f"> */ real t, large, apoaq, aqoap; extern logical lsame_(char *, char *); real theta; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); real small, sfmin; logical lsvec; real fastr[5], epsln; logical applv, rsvec, uctol, lower, upper; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); logical rotok; integer n2; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *); integer n4; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + extern /* Subroutine */ void saxpy_(integer *, real *, real *, 
integer *, real *, integer *), srotm_(integer *, real *, integer *, real *, integer *, real *); real rootsfmin; - extern /* Subroutine */ int sgsvj0_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgsvj0_(char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, real *, real *, real *, integer *, real *, integer *, integer *), sgsvj1_(char *, integer *, integer *, integer *, real *, integer * @@ -886,15 +886,15 @@ f"> */ extern real slamch_(char *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer ijblsk, swband; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); extern integer isamax_(integer *, real *, integer *); integer blskip; real mxaapq; - extern /* Subroutine */ int slaset_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slaset_(char *, integer *, integer *, real *, real *, real *, integer *); real thsign; - extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slassq_(integer *, real *, integer *, real *, real *); real mxsinj; integer ir1, emptsw, notrot, iswrot, jbc; @@ -978,13 +978,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGESVJ", &i__1, (ftnlen)6); - return 0; + return; } /* #:) Quick return for void matrix */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Set numerical parameters */ @@ -1026,7 +1026,7 @@ f"> */ *info = -4; i__1 = -(*info); xerbla_("SGESVJ", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize the right singular vector matrix. 
*/ @@ -1064,7 +1064,7 @@ f"> */ *info = -6; i__2 = -(*info); xerbla_("SGESVJ", &i__2, (ftnlen)6); - return 0; + return; } aaqq = sqrt(aaqq); if (aapp < big / aaqq && noscale) { @@ -1094,7 +1094,7 @@ f"> */ *info = -6; i__2 = -(*info); xerbla_("SGESVJ", &i__2, (ftnlen)6); - return 0; + return; } aaqq = sqrt(aaqq); if (aapp < big / aaqq && noscale) { @@ -1124,7 +1124,7 @@ f"> */ *info = -6; i__2 = -(*info); xerbla_("SGESVJ", &i__2, (ftnlen)6); - return 0; + return; } aaqq = sqrt(aaqq); if (aapp < big / aaqq && noscale) { @@ -1180,7 +1180,7 @@ f"> */ work[4] = 0.f; work[5] = 0.f; work[6] = 0.f; - return 0; + return; } /* #:) Quick return for one-column matrix */ @@ -1200,7 +1200,7 @@ f"> */ work[4] = 0.f; work[5] = 0.f; work[6] = 0.f; - return 0; + return; } /* Protect small singular values from underflow, and try to */ @@ -2329,6 +2329,6 @@ f"> */ /* MXSINJ is the largest absolute value of the sines of Jacobi angles */ /* in the last sweep */ - return 0; + return; } /* sgesvj_ */ diff --git a/lapack-netlib/SRC/sgesvx.c b/lapack-netlib/SRC/sgesvx.c index dbeaffe7c2..d6f90ee676 100644 --- a/lapack-netlib/SRC/sgesvx.c +++ b/lapack-netlib/SRC/sgesvx.c @@ -856,7 +856,7 @@ f"> */ /* > \ingroup realGEsolve */ /* ===================================================================== */ -/* Subroutine */ int sgesvx_(char *fact, char *trans, integer *n, integer * +/* Subroutine */ void sgesvx_(char *fact, char *trans, integer *n, integer * nrhs, real *a, integer *lda, real *af, integer *ldaf, integer *ipiv, char *equed, real *r__, real *c__, real *b, integer *ldb, real *x, integer *ldx, real *rcond, real *ferr, real *berr, real *work, @@ -878,21 +878,23 @@ f"> */ extern real slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); logical nofact; - extern /* Subroutine */ int slaqge_(integer *, integer *, real *, integer - *, real *, real *, real *, real *, real *, char *), - xerbla_(char *, integer *, ftnlen), sgecon_(char *, integer *, + extern /* 
Subroutine */ void slaqge_(integer *, integer *, real *, integer + *, real *, real *, real *, real *, real *, char *); + extern int xerbla_(char *, integer *, ftnlen); + extern void sgecon_(char *, integer *, real *, integer *, real *, real *, real *, integer *, integer *); real bignum; integer infequ; logical colequ; - extern /* Subroutine */ int sgeequ_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgeequ_(integer *, integer *, real *, integer *, real *, real *, real *, real *, real *, integer *), sgerfs_( char *, integer *, integer *, real *, integer *, real *, integer * , integer *, real *, integer *, real *, integer *, real *, real *, - real *, integer *, integer *), sgetrf_(integer *, + real *, integer *, integer *); + extern int sgetrf_(integer *, integer *, real *, integer *, integer *, integer *); real rowcnd; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); logical notran; extern real slantr_(char *, char *, char *, integer *, integer *, real *, @@ -1025,7 +1027,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGESVX", &i__1, (ftnlen)6); - return 0; + return; } if (equil) { @@ -1096,7 +1098,7 @@ f"> */ } work[1] = rpvgrw; *rcond = 0.f; - return 0; + return; } } @@ -1179,7 +1181,7 @@ f"> */ } work[1] = rpvgrw; - return 0; + return; /* End of SGESVX */ diff --git a/lapack-netlib/SRC/sgesvxx.c b/lapack-netlib/SRC/sgesvxx.c index 2a1b6f21ad..424053793e 100644 --- a/lapack-netlib/SRC/sgesvxx.c +++ b/lapack-netlib/SRC/sgesvxx.c @@ -1048,7 +1048,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup realGEsolve */ /* ===================================================================== */ -/* Subroutine */ int sgesvxx_(char *fact, char *trans, integer *n, integer * +/* Subroutine */ void sgesvxx_(char *fact, char *trans, integer *n, integer * nrhs, real 
*a, integer *lda, real *af, integer *ldaf, integer *ipiv, char *equed, real *r__, real *c__, real *b, integer *ldb, real *x, integer *ldx, real *rcond, real *rpvgrw, real *berr, integer * @@ -1072,23 +1072,23 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ real colcnd; extern real slamch_(char *); logical nofact; - extern /* Subroutine */ int slaqge_(integer *, integer *, real *, integer - *, real *, real *, real *, real *, real *, char *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void slaqge_(integer *, integer *, real *, integer + *, real *, real *, real *, real *, real *, char *); + extern int xerbla_(char *, integer *, ftnlen); real bignum; integer infequ; logical colequ; - extern /* Subroutine */ int sgetrf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgetrf_(integer *, integer *, real *, integer *, integer *, integer *); real rowcnd; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); logical notran; - extern /* Subroutine */ int sgetrs_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgetrs_(char *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); real smlnum; logical rowequ; - extern /* Subroutine */ int slascl2_(integer *, integer *, real *, real *, + extern /* Subroutine */ void slascl2_(integer *, integer *, real *, real *, integer *), sgeequb_(integer *, integer *, real *, integer *, real *, real *, real *, real *, real *, integer *), sgerfsx_(char *, char *, integer *, integer *, real *, integer *, real *, @@ -1230,7 +1230,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("SGESVXX", &i__1, (ftnlen)7); - return 0; + return; } if (equil) { @@ -1296,7 +1296,7 @@ static inline void zdotu_(doublecomplex *z, 
integer *n_, doublecomplex *x, integ *rpvgrw = sla_gerpvgrw_(n, info, &a[a_offset], lda, &af[ af_offset], ldaf); - return 0; + return; } } @@ -1327,7 +1327,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ slascl2_(n, nrhs, &r__[1], &x[x_offset], ldx); } - return 0; + return; /* End of SGESVXX */ } /* sgesvxx_ */ diff --git a/lapack-netlib/SRC/sgetc2.c b/lapack-netlib/SRC/sgetc2.c index d0db97c114..1ed5eb4be4 100644 --- a/lapack-netlib/SRC/sgetc2.c +++ b/lapack-netlib/SRC/sgetc2.c @@ -625,7 +625,7 @@ f"> */ /* > Umea University, S-901 87 Umea, Sweden. */ /* ===================================================================== */ -/* Subroutine */ int sgetc2_(integer *n, real *a, integer *lda, integer *ipiv, +/* Subroutine */ void sgetc2_(integer *n, real *a, integer *lda, integer *ipiv, integer *jpiv, integer *info) { /* System generated locals */ @@ -633,11 +633,11 @@ f"> */ real r__1; /* Local variables */ - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); real smin, xmax; integer i__, j; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *), slabad_(real *, real *); integer ip, jp; extern real slamch_(char *); @@ -667,7 +667,7 @@ f"> */ /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Set constants to control overflow */ @@ -686,7 +686,7 @@ f"> */ *info = 1; a[a_dim1 + 1] = smlnum; } - return 0; + return; } /* Factorize A using complete pivoting. 
*/ @@ -761,7 +761,7 @@ f"> */ ipiv[*n] = *n; jpiv[*n] = *n; - return 0; + return; /* End of SGETC2 */ diff --git a/lapack-netlib/SRC/sgetf2.c b/lapack-netlib/SRC/sgetf2.c index 556792a775..60604637de 100644 --- a/lapack-netlib/SRC/sgetf2.c +++ b/lapack-netlib/SRC/sgetf2.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup realGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgetf2_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgetf2_(integer *m, integer *n, real *a, integer *lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -631,12 +631,12 @@ f"> */ real r__1; /* Local variables */ - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); integer i__, j; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); real sfmin; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *); integer jp; extern real slamch_(char *); @@ -673,13 +673,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGETF2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Compute machine safe minimum */ @@ -734,7 +734,7 @@ f"> */ } /* L10: */ } - return 0; + return; /* End of SGETF2 */ diff --git a/lapack-netlib/SRC/sgetrf.c b/lapack-netlib/SRC/sgetrf.c index be631190de..efe00eb33a 100644 --- a/lapack-netlib/SRC/sgetrf.c +++ b/lapack-netlib/SRC/sgetrf.c @@ -624,7 +624,7 @@ f"> */ /* > \ingroup realGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgetrf_(integer *m, integer *n, real *a, integer *lda, +/* 
Subroutine */ void sgetrf_(integer *m, integer *n, real *a, integer *lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -632,7 +632,7 @@ f"> */ /* Local variables */ integer i__, j, iinfo; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, @@ -641,7 +641,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slaswp_(integer *, real *, integer *, integer + extern /* Subroutine */ void slaswp_(integer *, real *, integer *, integer *, integer *, integer *, integer *), sgetrf2_(integer *, integer * , real *, integer *, integer *, integer *); @@ -675,13 +675,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGETRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Determine the block size for this environment. 
*/ @@ -759,7 +759,7 @@ f"> */ /* L20: */ } } - return 0; + return; /* End of SGETRF */ diff --git a/lapack-netlib/SRC/sgetrf2.c b/lapack-netlib/SRC/sgetrf2.c index a5bddfcfb3..9d06e41c5a 100644 --- a/lapack-netlib/SRC/sgetrf2.c +++ b/lapack-netlib/SRC/sgetrf2.c @@ -625,7 +625,7 @@ static real c_b16 = -1.f; /* > \ingroup realGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgetrf2_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void sgetrf2_(integer *m, integer *n, real *a, integer *lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -635,12 +635,12 @@ static real c_b16 = -1.f; /* Local variables */ real temp; integer i__, iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); real sfmin; integer n1, n2; - extern /* Subroutine */ int strsm_(char *, char *, char *, char *, + extern /* Subroutine */ void strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ); extern real slamch_(char *); @@ -679,13 +679,13 @@ static real c_b16 = -1.f; if (*info != 0) { i__1 = -(*info); xerbla_("SGETRF2", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } if (*m == 1) { @@ -797,7 +797,7 @@ static real c_b16 = -1.f; slaswp_(&n1, &a[a_dim1 + 1], lda, &i__1, &i__2, &ipiv[1], &c__1); } - return 0; + return; /* End of SGETRF2 */ diff --git a/lapack-netlib/SRC/sgetri.c b/lapack-netlib/SRC/sgetri.c index 51b62f6bd5..04e19e6eb1 100644 --- a/lapack-netlib/SRC/sgetri.c +++ b/lapack-netlib/SRC/sgetri.c @@ -631,7 +631,7 @@ f"> */ /* > \ingroup realGEcomputational */ /* ===================================================================== */ 
-/* Subroutine */ int sgetri_(integer *n, real *a, integer *lda, integer *ipiv, +/* Subroutine */ void sgetri_(integer *n, real *a, integer *lda, integer *ipiv, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -639,7 +639,7 @@ f"> */ /* Local variables */ integer i__, j, nbmin; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, @@ -693,15 +693,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGETRI", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form inv(U). If INFO > 0 from STRTRI, then U is singular, */ @@ -709,7 +709,7 @@ f"> */ strtri_("Upper", "Non-unit", n, &a[a_offset], lda, info); if (*info > 0) { - return 0; + return; } nbmin = 2; @@ -806,7 +806,7 @@ f"> */ } work[1] = (real) iws; - return 0; + return; /* End of SGETRI */ diff --git a/lapack-netlib/SRC/sgetrs.c b/lapack-netlib/SRC/sgetrs.c index 8df46a39c6..6740b33831 100644 --- a/lapack-netlib/SRC/sgetrs.c +++ b/lapack-netlib/SRC/sgetrs.c @@ -636,7 +636,7 @@ f"> */ /* > \ingroup realGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgetrs_(char *trans, integer *n, integer *nrhs, real *a, +/* Subroutine */ void sgetrs_(char *trans, integer *n, integer *nrhs, real *a, integer *lda, integer *ipiv, real *b, integer *ldb, integer *info) { /* System generated locals */ @@ -644,11 +644,12 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int strsm_(char *, char *, char *, char *, + extern /* Subroutine */ void strsm_(char *, char *, char *, char *, integer *, integer *, real *, real 
*, integer *, real *, integer * - ), xerbla_(char *, integer *, ftnlen); + ); + extern int xerbla_(char *, integer *, ftnlen); logical notran; - extern /* Subroutine */ int slaswp_(integer *, real *, integer *, integer + extern /* Subroutine */ void slaswp_(integer *, real *, integer *, integer *, integer *, integer *, integer *); @@ -690,13 +691,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGETRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (notran) { @@ -735,7 +736,7 @@ f"> */ slaswp_(nrhs, &b[b_offset], ldb, &c__1, n, &ipiv[1], &c_n1); } - return 0; + return; /* End of SGETRS */ diff --git a/lapack-netlib/SRC/sgetsls.c b/lapack-netlib/SRC/sgetsls.c index a1a2fb344c..bd1230bfab 100644 --- a/lapack-netlib/SRC/sgetsls.c +++ b/lapack-netlib/SRC/sgetsls.c @@ -674,7 +674,7 @@ static integer c__0 = 0; /* > \ingroup realGEsolve */ /* ===================================================================== */ -/* Subroutine */ int sgetsls_(char *trans, integer *m, integer *n, integer * +/* Subroutine */ void sgetsls_(char *trans, integer *m, integer *n, integer * nrhs, real *a, integer *lda, real *b, integer *ldb, real *work, integer *lwork, integer *info) { @@ -686,20 +686,20 @@ static integer c__0 = 0; logical tran; integer brow, tszm, tszo, info2, i__, j, iascl, ibscl; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sgelq_(integer *, integer *, real *, integer * + extern /* Subroutine */ void sgelq_(integer *, integer *, real *, integer * , real *, integer *, real *, integer *, integer *); integer minmn, maxmn; - extern /* Subroutine */ int sgeqr_(integer *, integer *, real *, integer * + extern /* Subroutine */ void sgeqr_(integer *, integer *, real *, integer * , real *, integer *, real *, integer *, integer *); real workq[1]; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); real tq[5]; 
extern real slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer scllen; real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *), sgemlq_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *, integer *), slaset_(char *, integer *, @@ -817,7 +817,7 @@ static integer c__0 = 0; i__1 = -(*info); xerbla_("SGETSLS", &i__1, (ftnlen)7); work[1] = (real) wsizeo; - return 0; + return; } if (lquery) { if (*lwork == -1) { @@ -826,7 +826,7 @@ static integer c__0 = 0; if (*lwork == -2) { work[1] = (real) wsizem; } - return 0; + return; } if (*lwork < wsizeo) { lw1 = tszm; @@ -843,7 +843,7 @@ static integer c__0 = 0; if (f2cmin(i__1,*nrhs) == 0) { i__1 = f2cmax(*m,*n); slaset_("FULL", &i__1, nrhs, &c_b23, &c_b23, &b[b_offset], ldb); - return 0; + return; } /* Get machine parameters */ @@ -920,7 +920,7 @@ static integer c__0 = 0; strtrs_("U", "N", "N", n, nrhs, &a[a_offset], lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } scllen = *n; } else { @@ -933,7 +933,7 @@ static integer c__0 = 0; ldb, info); if (*info > 0) { - return 0; + return; } /* B(N+1:M,1:NRHS) = ZERO */ @@ -976,7 +976,7 @@ static integer c__0 = 0; ldb, info); if (*info > 0) { - return 0; + return; } /* B(M+1:N,1:NRHS) = 0 */ @@ -1017,7 +1017,7 @@ static integer c__0 = 0; lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } scllen = *m; @@ -1045,7 +1045,7 @@ static integer c__0 = 0; L50: work[1] = (real) (tszo + lwo); - return 0; + return; /* End of SGETSLS */ diff --git a/lapack-netlib/SRC/sgetsqrhrt.c b/lapack-netlib/SRC/sgetsqrhrt.c index dee18c1344..5f0a942291 100644 --- a/lapack-netlib/SRC/sgetsqrhrt.c +++ 
b/lapack-netlib/SRC/sgetsqrhrt.c @@ -689,7 +689,7 @@ hrt.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgetsqrhrt_(integer *m, integer *n, integer *mb1, +/* Subroutine */ void sgetsqrhrt_(integer *m, integer *n, integer *mb1, integer *nb1, integer *nb2, real *a, integer *lda, real *t, integer * ldt, real *work, integer *lwork, integer *info) { @@ -699,15 +699,15 @@ hrt.f"> */ /* Local variables */ integer ldwt, lworkopt, i__, j, iinfo; - extern /* Subroutine */ int sorgtsqr_row_(integer *, integer *, integer * + extern /* Subroutine */ void sorgtsqr_row_(integer *, integer *, integer * , integer *, real *, integer *, real *, integer *, real *, integer *, integer *), scopy_(integer *, real *, integer *, real * , integer *), sorhr_col_(integer *, integer *, integer *, real *, - integer *, real *, integer *, real *, integer *), xerbla_(char *, - integer *, ftnlen); + integer *, real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical lquery; integer lw1, lw2, num_all_row_blocks__, lwt; - extern /* Subroutine */ int slatsqr_(integer *, integer *, integer *, + extern /* Subroutine */ void slatsqr_(integer *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer * , integer *); integer nb1local, nb2local; @@ -809,17 +809,17 @@ hrt.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGETSQRHRT", &i__1, (ftnlen)10); - return 0; + return; } else if (lquery) { work[1] = (real) lworkopt; - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { work[1] = (real) lworkopt; - return 0; + return; } nb2local = f2cmin(*nb2,*n); @@ -881,7 +881,7 @@ hrt.f"> */ } work[1] = (real) lworkopt; - return 0; + return; /* End of SGETSQRHRT */ diff --git a/lapack-netlib/SRC/sggbak.c b/lapack-netlib/SRC/sggbak.c index 508f5d912b..2fb93ae731 100644 --- a/lapack-netlib/SRC/sggbak.c +++ b/lapack-netlib/SRC/sggbak.c @@ 
-655,7 +655,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sggbak_(char *job, char *side, integer *n, integer *ilo, +/* Subroutine */ void sggbak_(char *job, char *side, integer *n, integer *ilo, integer *ihi, real *lscale, real *rscale, integer *m, real *v, integer *ldv, integer *info) { @@ -665,10 +665,11 @@ f"> */ /* Local variables */ integer i__, k; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); logical leftv; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, - integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, + integer *); + extern int xerbla_(char *, integer *, ftnlen); logical rightv; @@ -718,19 +719,19 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGGBAK", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*m == 0) { - return 0; + return; } if (lsame_(job, "N")) { - return 0; + return; } if (*ilo == *ihi) { @@ -836,7 +837,7 @@ f"> */ L110: - return 0; + return; /* End of SGGBAK */ diff --git a/lapack-netlib/SRC/sggbak.f b/lapack-netlib/SRC/sggbak.f index bb7f360112..8a796fdb1b 100644 --- a/lapack-netlib/SRC/sggbak.f +++ b/lapack-netlib/SRC/sggbak.f @@ -252,7 +252,7 @@ SUBROUTINE SGGBAK( JOB, SIDE, N, ILO, IHI, LSCALE, RSCALE, M, V, $ GO TO 50 * DO 40 I = ILO - 1, 1, -1 - K = RSCALE( I ) + K = INT( RSCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -262,7 +262,7 @@ SUBROUTINE SGGBAK( JOB, SIDE, N, ILO, IHI, LSCALE, RSCALE, M, V, IF( IHI.EQ.N ) $ GO TO 70 DO 60 I = IHI + 1, N - K = RSCALE( I ) + K = INT( RSCALE( I ) ) IF( K.EQ.I ) $ GO TO 60 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -276,7 +276,7 @@ SUBROUTINE 
SGGBAK( JOB, SIDE, N, ILO, IHI, LSCALE, RSCALE, M, V, IF( ILO.EQ.1 ) $ GO TO 90 DO 80 I = ILO - 1, 1, -1 - K = LSCALE( I ) + K = INT( LSCALE( I ) ) IF( K.EQ.I ) $ GO TO 80 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -286,7 +286,7 @@ SUBROUTINE SGGBAK( JOB, SIDE, N, ILO, IHI, LSCALE, RSCALE, M, V, IF( IHI.EQ.N ) $ GO TO 110 DO 100 I = IHI + 1, N - K = LSCALE( I ) + K = INT( LSCALE( I ) ) IF( K.EQ.I ) $ GO TO 100 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/sggbal.c b/lapack-netlib/SRC/sggbal.c index fff51160e2..b572dd0dfd 100644 --- a/lapack-netlib/SRC/sggbal.c +++ b/lapack-netlib/SRC/sggbal.c @@ -691,7 +691,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sggbal_(char *job, integer *n, real *a, integer *lda, +/* Subroutine */ void sggbal_(char *job, integer *n, real *a, integer *lda, real *b, integer *ldb, integer *ilo, integer *ihi, real *lscale, real *rscale, real *work, integer *info) { @@ -709,13 +709,13 @@ f"> */ integer i__, j, k, l, m; real gamma, t, alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); real sfmin, sfmax; integer iflow; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *); integer kount; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + extern /* Subroutine */ void saxpy_(integer *, real *, real *, integer *, real *, integer *); integer jc; real ta, tb, tc; @@ -768,7 +768,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGGBAL", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -776,7 +776,7 @@ f"> */ if (*n == 0) { *ilo = 1; *ihi = *n; - return 0; + return; } if (*n == 1) { @@ -784,7 +784,7 @@ f"> */ 
*ihi = *n; lscale[1] = 1.f; rscale[1] = 1.f; - return 0; + return; } if (lsame_(job, "N")) { @@ -796,7 +796,7 @@ f"> */ rscale[i__] = 1.f; /* L10: */ } - return 0; + return; } k = 1; @@ -929,11 +929,11 @@ f"> */ rscale[i__] = 1.f; /* L195: */ } - return 0; + return; } if (*ilo == *ihi) { - return 0; + return; } /* Balance the submatrix in rows ILO to IHI. */ @@ -1185,7 +1185,7 @@ f"> */ /* L380: */ } - return 0; + return; /* End of SGGBAL */ diff --git a/lapack-netlib/SRC/sggbal.f b/lapack-netlib/SRC/sggbal.f index 6cfdbcdba7..d7a8ef16cf 100644 --- a/lapack-netlib/SRC/sggbal.f +++ b/lapack-netlib/SRC/sggbal.f @@ -522,7 +522,7 @@ SUBROUTINE SGGBAL( JOB, N, A, LDA, B, LDB, ILO, IHI, LSCALE, IRAB = ISAMAX( N-ILO+1, B( I, ILO ), LDB ) RAB = MAX( RAB, ABS( B( I, IRAB+ILO-1 ) ) ) LRAB = INT( LOG10( RAB+SFMIN ) / BASL+ONE ) - IR = LSCALE( I ) + SIGN( HALF, LSCALE( I ) ) + IR = INT( LSCALE( I ) + SIGN( HALF, LSCALE( I ) ) ) IR = MIN( MAX( IR, LSFMIN ), LSFMAX, LSFMAX-LRAB ) LSCALE( I ) = SCLFAC**IR ICAB = ISAMAX( IHI, A( 1, I ), 1 ) @@ -530,7 +530,7 @@ SUBROUTINE SGGBAL( JOB, N, A, LDA, B, LDB, ILO, IHI, LSCALE, ICAB = ISAMAX( IHI, B( 1, I ), 1 ) CAB = MAX( CAB, ABS( B( ICAB, I ) ) ) LCAB = INT( LOG10( CAB+SFMIN ) / BASL+ONE ) - JC = RSCALE( I ) + SIGN( HALF, RSCALE( I ) ) + JC = INT( RSCALE( I ) + SIGN( HALF, RSCALE( I ) ) ) JC = MIN( MAX( JC, LSFMIN ), LSFMAX, LSFMAX-LCAB ) RSCALE( I ) = SCLFAC**JC 360 CONTINUE diff --git a/lapack-netlib/SRC/sgges.c b/lapack-netlib/SRC/sgges.c index 616716a7ac..e0ae7acc5f 100644 --- a/lapack-netlib/SRC/sgges.c +++ b/lapack-netlib/SRC/sgges.c @@ -798,7 +798,7 @@ or GE matrices */ /* > \ingroup realGEeigen */ /* ===================================================================== */ -/* Subroutine */ int sgges_(char *jobvsl, char *jobvsr, char *sort, L_fp +/* Subroutine */ void sgges_(char *jobvsl, char *jobvsr, char *sort, L_fp selctg, integer *n, real *a, integer *lda, real *b, integer *ldb, integer *sdim, real *alphar, real *alphai, 
real *beta, real *vsl, integer *ldvsl, real *vsr, integer *ldvsr, real *work, integer *lwork, @@ -819,9 +819,9 @@ or GE matrices */ logical cursl, ilvsl, ilvsr; integer irows; logical lst2sl; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); integer ip; - extern /* Subroutine */ int sggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sggbak_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, integer * ), sggbal_(char *, integer *, real *, integer *, real *, integer *, integer *, integer *, real *, real *, real *, @@ -830,26 +830,26 @@ or GE matrices */ extern real slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); real safmin; - extern /* Subroutine */ int sgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgghrd_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer * , real *, integer *, integer *); real safmax; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ijobvl, iright; - extern /* Subroutine */ int sgeqrf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgeqrf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *); integer ijobvr; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *); real anrmto, bnrmto; logical lastsl; - 
extern /* Subroutine */ int shgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void shgeqz_(char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real * , real *, real *, real *, integer *, real *, integer *, real *, integer *, integer *), stgsen_(integer *, @@ -859,10 +859,10 @@ or GE matrices */ integer *, integer *, integer *, integer *); integer minwrk, maxwrk; real smlnum; - extern /* Subroutine */ int sorgqr_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorgqr_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *); logical wantst, lquery; - extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); real dif[2]; @@ -985,16 +985,16 @@ or GE matrices */ if (*info != 0) { i__1 = -(*info); xerbla_("SGGES ", &i__1, (ftnlen)5); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *sdim = 0; - return 0; + return; } /* Get machine constants */ @@ -1278,7 +1278,7 @@ or GE matrices */ work[1] = (real) maxwrk; - return 0; + return; /* End of SGGES */ diff --git a/lapack-netlib/SRC/sgges3.c b/lapack-netlib/SRC/sgges3.c index 5d5a5128c0..bfc7dedcd8 100644 --- a/lapack-netlib/SRC/sgges3.c +++ b/lapack-netlib/SRC/sgges3.c @@ -796,7 +796,7 @@ f"> */ /* > \ingroup realGEeigen */ /* ===================================================================== */ -/* Subroutine */ int sgges3_(char *jobvsl, char *jobvsr, char *sort, L_fp +/* Subroutine */ void sgges3_(char *jobvsl, char *jobvsr, char *sort, L_fp selctg, integer *n, real *a, integer *lda, real *b, integer *ldb, integer *sdim, real *alphar, real *alphai, real *beta, real *vsl, integer *ldvsl, real *vsr, integer *ldvsr, real *work, integer *lwork, @@ -816,14 +816,14 @@ 
f"> */ integer ileft, icols; logical cursl, ilvsl, ilvsr; integer irows; - extern /* Subroutine */ int sgghd3_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgghd3_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer * , real *, integer *, real *, integer *, integer *) ; logical lst2sl; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); integer ip; - extern /* Subroutine */ int sggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sggbak_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, integer * ), sggbal_(char *, integer *, real *, integer *, real *, integer *, integer *, integer *, real *, real *, real *, @@ -834,18 +834,18 @@ f"> */ real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real safmax, bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); integer ijobvl, iright; - extern /* Subroutine */ int sgeqrf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgeqrf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *); integer ijobvr; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *); real anrmto, bnrmto; logical lastsl; - extern /* Subroutine */ int shgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void shgeqz_(char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real * , real *, real *, real *, integer *, real *, integer *, real *, integer *, integer *), 
stgsen_(integer *, @@ -854,11 +854,11 @@ f"> */ real *, integer *, integer *, real *, real *, real *, real *, integer *, integer *, integer *, integer *); real smlnum; - extern /* Subroutine */ int sorgqr_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorgqr_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *); logical wantst, lquery; integer lwkopt; - extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); real dif[2]; @@ -992,16 +992,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGGES3 ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *sdim = 0; - return 0; + return; } /* Get machine constants */ @@ -1279,7 +1279,7 @@ f"> */ work[1] = (real) lwkopt; - return 0; + return; /* End of SGGES3 */ diff --git a/lapack-netlib/SRC/sggesx.c b/lapack-netlib/SRC/sggesx.c index c8f85759d6..1023a1cc4d 100644 --- a/lapack-netlib/SRC/sggesx.c +++ b/lapack-netlib/SRC/sggesx.c @@ -878,7 +878,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sggesx_(char *jobvsl, char *jobvsr, char *sort, L_fp +/* Subroutine */ void sggesx_(char *jobvsl, char *jobvsr, char *sort, L_fp selctg, char *sense, integer *n, real *a, integer *lda, real *b, integer *ldb, integer *sdim, real *alphar, real *alphai, real *beta, real *vsl, integer *ldvsl, real *vsr, integer *ldvsr, real *rconde, @@ -899,10 +899,10 @@ f"> */ logical cursl, ilvsl, ilvsr; integer irows; logical lst2sl; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); integer ip; real pl; - extern /* Subroutine */ int sggbak_(char *, char *, integer *, integer *, + 
extern /* Subroutine */ void sggbak_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, integer * ), sggbal_(char *, integer *, real *, integer *, real *, integer *, integer *, integer *, real *, real *, real *, @@ -912,21 +912,21 @@ f"> */ extern real slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); real safmin; - extern /* Subroutine */ int sgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgghrd_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer * , real *, integer *, integer *); real safmax; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ijobvl, iright; - extern /* Subroutine */ int sgeqrf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgeqrf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *); integer ijobvr; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); logical wantsb, wantse, lastsl; integer liwmin; @@ -934,7 +934,7 @@ f"> */ integer minwrk, maxwrk; logical wantsn; real smlnum; - extern /* Subroutine */ int shgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void shgeqz_(char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real * , real *, real *, real *, integer *, real *, integer *, real *, integer *, integer *), slaset_(char *, @@ -946,7 +946,7 @@ f"> */ integer *, integer *, real *, real *, real *, 
real *, integer *, integer *, integer *, integer *); logical wantst, lquery, wantsv; - extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); real dif[2]; @@ -1103,16 +1103,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGGESX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *sdim = 0; - return 0; + return; } /* Get machine constants */ @@ -1424,7 +1424,7 @@ f"> */ work[1] = (real) maxwrk; iwork[1] = liwmin; - return 0; + return; /* End of SGGESX */ diff --git a/lapack-netlib/SRC/sggev.c b/lapack-netlib/SRC/sggev.c index 2303f127ba..658736c6b2 100644 --- a/lapack-netlib/SRC/sggev.c +++ b/lapack-netlib/SRC/sggev.c @@ -743,7 +743,7 @@ ices */ /* > \ingroup realGEeigen */ /* ===================================================================== */ -/* Subroutine */ int sggev_(char *jobvl, char *jobvr, integer *n, real *a, +/* Subroutine */ void sggev_(char *jobvl, char *jobvr, integer *n, real *a, integer *lda, real *b, integer *ldb, real *alphar, real *alphai, real *beta, real *vl, integer *ldvl, real *vr, integer *ldvr, real *work, integer *lwork, integer *info) @@ -761,9 +761,9 @@ ices */ integer iwrk; extern logical lsame_(char *, char *); integer ileft, icols, irows, jc; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); integer in, jr; - extern /* Subroutine */ int sggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sggbak_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, integer * ), sggbal_(char *, integer *, real *, integer *, real *, integer *, integer *, integer *, real *, real *, real *, @@ -771,38 +771,39 @@ ices */ logical ilascl, ilbscl; extern real 
slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), sgghrd_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void sgghrd_( char *, char *, integer *, integer *, integer *, real *, integer * , real *, integer *, real *, integer *, real *, integer *, integer *); logical ldumma[1]; char chtemp[1]; real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ijobvl, iright; - extern /* Subroutine */ int sgeqrf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgeqrf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *); integer ijobvr; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *), stgevc_( char *, char *, logical *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *, integer *, integer *, real *, integer *); real anrmto, bnrmto; - extern /* Subroutine */ int shgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void shgeqz_(char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real * , real *, real *, real *, integer *, real *, integer *, real *, integer *, integer *); integer minwrk, maxwrk; real smlnum; - extern /* Subroutine */ int sorgqr_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorgqr_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *); logical lquery; - 
extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); integer ihi, ilo; @@ -919,15 +920,15 @@ ices */ if (*info != 0) { i__1 = -(*info); xerbla_("SGGEV ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1215,7 +1216,7 @@ ices */ } work[1] = (real) maxwrk; - return 0; + return; /* End of SGGEV */ diff --git a/lapack-netlib/SRC/sggev3.c b/lapack-netlib/SRC/sggev3.c index f27742f335..013b22b2c3 100644 --- a/lapack-netlib/SRC/sggev3.c +++ b/lapack-netlib/SRC/sggev3.c @@ -742,7 +742,7 @@ f"> */ /* > \ingroup realGEeigen */ /* ===================================================================== */ -/* Subroutine */ int sggev3_(char *jobvl, char *jobvr, integer *n, real *a, +/* Subroutine */ void sggev3_(char *jobvl, char *jobvr, integer *n, real *a, integer *lda, real *b, integer *ldb, real *alphar, real *alphai, real *beta, real *vl, integer *ldvl, real *vr, integer *ldvr, real *work, integer *lwork, integer *info) @@ -760,14 +760,14 @@ f"> */ integer iwrk; extern logical lsame_(char *, char *); integer ileft, icols, irows; - extern /* Subroutine */ int sgghd3_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgghd3_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer * , real *, integer *, real *, integer *, integer *) ; integer jc; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); integer in, jr; - extern /* Subroutine */ int sggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sggbak_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, integer * ), 
sggbal_(char *, integer *, real *, integer *, real *, integer *, integer *, integer *, real *, real *, real *, @@ -779,29 +779,29 @@ f"> */ logical ldumma[1]; char chtemp[1]; real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); integer ijobvl, iright; - extern /* Subroutine */ int sgeqrf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgeqrf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *); integer ijobvr; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *), stgevc_( char *, char *, logical *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *, integer *, integer *, real *, integer *); real anrmto, bnrmto; - extern /* Subroutine */ int shgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void shgeqz_(char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real * , real *, real *, real *, integer *, real *, integer *, real *, integer *, integer *); real smlnum; - extern /* Subroutine */ int sorgqr_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorgqr_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); integer ihi, ilo; @@ -936,15 +936,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGGEV3 ", &i__1, (ftnlen)7); - 
return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1227,7 +1227,7 @@ f"> */ } work[1] = (real) lwkopt; - return 0; + return; /* End of SGGEV3 */ diff --git a/lapack-netlib/SRC/sggevx.c b/lapack-netlib/SRC/sggevx.c index 21ebc26bac..b87e2c5621 100644 --- a/lapack-netlib/SRC/sggevx.c +++ b/lapack-netlib/SRC/sggevx.c @@ -905,7 +905,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sggevx_(char *balanc, char *jobvl, char *jobvr, char * +/* Subroutine */ void sggevx_(char *balanc, char *jobvl, char *jobvr, char * sense, integer *n, real *a, integer *lda, real *b, integer *ldb, real *alphar, real *alphai, real *beta, real *vl, integer *ldvl, real *vr, integer *ldvr, integer *ilo, integer *ihi, real *lscale, real *rscale, @@ -928,22 +928,23 @@ f"> */ integer icols; logical noscl; integer irows, jc; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); integer in, mm, jr; - extern /* Subroutine */ int sggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sggbak_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, integer * ), sggbal_(char *, integer *, real *, integer *, real *, integer *, integer *, integer *, real *, real *, real *, integer *); logical ilascl, ilbscl; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), sgghrd_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void sgghrd_( char *, char *, integer *, integer *, integer *, real *, integer * , real *, integer *, real *, integer *, real *, integer *, integer *); logical ldumma[1]; char chtemp[1]; real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, 
integer *, real *, real *, integer *, integer *, real *, integer *, integer *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); @@ -951,18 +952,18 @@ f"> */ integer ijobvl; extern real slange_(char *, integer *, integer *, real *, integer *, real *); - extern /* Subroutine */ int sgeqrf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgeqrf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *); integer ijobvr; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); logical wantsb; - extern /* Subroutine */ int slaset_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slaset_(char *, integer *, integer *, real *, real *, real *, integer *); real anrmto; logical wantse; real bnrmto; - extern /* Subroutine */ int shgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void shgeqz_(char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real * , real *, real *, real *, integer *, real *, integer *, real *, integer *, integer *), stgevc_(char *, @@ -975,10 +976,10 @@ f"> */ integer minwrk, maxwrk; logical wantsn; real smlnum; - extern /* Subroutine */ int sorgqr_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorgqr_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *); logical lquery, wantsv; - extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); real eps; @@ -1124,15 +1125,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGGEVX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } 
/* Quick return if possible */ if (*n == 0) { - return 0; + return; } @@ -1501,7 +1502,7 @@ f"> */ } work[1] = (real) maxwrk; - return 0; + return; /* End of SGGEVX */ diff --git a/lapack-netlib/SRC/sggglm.c b/lapack-netlib/SRC/sggglm.c index 3b42e777dd..21a18ea13d 100644 --- a/lapack-netlib/SRC/sggglm.c +++ b/lapack-netlib/SRC/sggglm.c @@ -700,7 +700,7 @@ f"> */ /* > \ingroup realOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int sggglm_(integer *n, integer *m, integer *p, real *a, +/* Subroutine */ void sggglm_(integer *n, integer *m, integer *p, real *a, integer *lda, real *b, integer *ldb, real *d__, real *x, real *y, real *work, integer *lwork, integer *info) { @@ -709,23 +709,23 @@ f"> */ /* Local variables */ integer lopt, i__; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *); integer nb, np; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int sggqrf_(integer *, integer *, integer *, real + extern /* Subroutine */ void sggqrf_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, real *, integer * , integer *); integer lwkmin, nb1, nb2, nb3, nb4, lwkopt; logical lquery; - extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *), sormrq_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real * - , integer *, real *, integer *, integer *), - strtrs_(char *, char *, char *, integer *, integer *, 
real *, + , integer *, real *, integer *, integer *); + extern int strtrs_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *, integer *); @@ -799,9 +799,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGGGLM", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -815,7 +815,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { y[i__] = 0.f; } - return 0; + return; } /* Compute the GQR factorization of matrices A and B: */ @@ -854,7 +854,7 @@ f"> */ if (*info > 0) { *info = 1; - return 0; + return; } i__1 = *n - *m; @@ -883,7 +883,7 @@ f"> */ if (*info > 0) { *info = 2; - return 0; + return; } /* Copy D to X */ @@ -903,7 +903,7 @@ f"> */ i__1 = lopt, i__2 = (integer) work[*m + np + 1]; work[1] = (real) (*m + np + f2cmax(i__1,i__2)); - return 0; + return; /* End of SGGGLM */ diff --git a/lapack-netlib/SRC/sggglm.f b/lapack-netlib/SRC/sggglm.f index bbd032beb6..56b4dba526 100644 --- a/lapack-netlib/SRC/sggglm.f +++ b/lapack-netlib/SRC/sggglm.f @@ -288,7 +288,7 @@ SUBROUTINE SGGGLM( N, M, P, A, LDA, B, LDB, D, X, Y, WORK, LWORK, * CALL SGGQRF( N, M, P, A, LDA, WORK, B, LDB, WORK( M+1 ), $ WORK( M+NP+1 ), LWORK-M-NP, INFO ) - LOPT = WORK( M+NP+1 ) + LOPT = INT( WORK( M+NP+1 ) ) * * Update left-hand-side vector d = Q**T*d = ( d1 ) M * ( d2 ) N-M diff --git a/lapack-netlib/SRC/sgghd3.c b/lapack-netlib/SRC/sgghd3.c index 412c8c6eae..fe6c927916 100644 --- a/lapack-netlib/SRC/sgghd3.c +++ b/lapack-netlib/SRC/sgghd3.c @@ -748,7 +748,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgghd3_(char *compq, char *compz, integer *n, integer * +/* Subroutine */ void sgghd3_(char *compq, char *compz, integer *n, integer * ilo, integer *ihi, real *a, integer *lda, real *b, integer *ldb, real *q, integer *ldq, real *z__, integer *ldz, real *work, integer *lwork, integer *info) @@ -763,14 +763,14 @@ 
f"> */ integer cola, jcol, ierr; real temp; integer jrow, topq, ppwo; - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); real temp1, temp2, temp3, c__; integer kacc22, i__, j, k; real s; extern logical lsame_(char *, char *); integer nbmin; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, @@ -778,21 +778,21 @@ f"> */ integer nblst; logical initq; real c1, c2; - extern /* Subroutine */ int sorm22_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sorm22_(char *, char *, integer *, integer *, integer *, integer *, real *, integer *, real *, integer *, real * , integer *, integer *); logical wantq; integer j0; logical initz, wantz; real s1, s2; - extern /* Subroutine */ int strmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void strmv_(char *, char *, char *, integer *, real *, integer *, real *, integer *); char compq2[1], compz2[1]; integer nb, jj, nh, nx, pw; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int sgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgghrd_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer * , real *, integer *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *), @@ -868,9 +868,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGGHD3", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Initialize Q and 
Z if desired. */ @@ -895,7 +895,7 @@ f"> */ nh = *ihi - *ilo + 1; if (nh <= 1) { work[1] = 1.f; - return 0; + return; } /* Determine the blocksize. */ @@ -1568,7 +1568,7 @@ f"> */ } work[1] = (real) lwkopt; - return 0; + return; /* End of SGGHD3 */ diff --git a/lapack-netlib/SRC/sgghrd.c b/lapack-netlib/SRC/sgghrd.c index bc8721c4c9..b0aa20f97d 100644 --- a/lapack-netlib/SRC/sgghrd.c +++ b/lapack-netlib/SRC/sgghrd.c @@ -721,7 +721,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sgghrd_(char *compq, char *compz, integer *n, integer * +/* Subroutine */ void sgghrd_(char *compq, char *compz, integer *n, integer * ilo, integer *ihi, real *a, integer *lda, real *b, integer *ldb, real *q, integer *ldq, real *z__, integer *ldz, integer *info) { @@ -733,13 +733,13 @@ f"> */ integer jcol; real temp; integer jrow; - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); real c__, s; extern logical lsame_(char *, char *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer icompq; - extern /* Subroutine */ int slaset_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slaset_(char *, integer *, integer *, real *, real *, real *, integer *), slartg_(real *, real *, real * , real *, real *); integer icompz; @@ -825,7 +825,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGGHRD", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize Q and Z if desired. 
*/ @@ -840,7 +840,7 @@ f"> */ /* Quick return if possible */ if (*n <= 1) { - return 0; + return; } /* Zero out lower triangle of B */ @@ -900,7 +900,7 @@ f"> */ /* L40: */ } - return 0; + return; /* End of SGGHRD */ diff --git a/lapack-netlib/SRC/sgglse.c b/lapack-netlib/SRC/sgglse.c index 1602836a38..6b21f3bfaf 100644 --- a/lapack-netlib/SRC/sgglse.c +++ b/lapack-netlib/SRC/sgglse.c @@ -695,7 +695,7 @@ f"> */ /* > \ingroup realOTHERsolve */ /* ===================================================================== */ -/* Subroutine */ int sgglse_(integer *m, integer *n, integer *p, real *a, +/* Subroutine */ void sgglse_(integer *m, integer *n, integer *p, real *a, integer *lda, real *b, integer *ldb, real *c__, real *d__, real *x, real *work, integer *lwork, integer *info) { @@ -704,7 +704,7 @@ f"> */ /* Local variables */ integer lopt; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *), strmv_(char *, char *, char *, integer *, real *, integer *, real @@ -713,17 +713,17 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int sggrqf_(integer *, integer *, integer *, real + extern /* Subroutine */ void sggrqf_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, real *, integer * , integer *); integer lwkmin, nb1, nb2, nb3, nb4, lwkopt; logical lquery; - extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *), 
sormrq_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real * - , integer *, real *, integer *, integer *), - strtrs_(char *, char *, char *, integer *, integer *, real *, + , integer *, real *, integer *, integer *); + extern int strtrs_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *, integer *); @@ -797,15 +797,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGGLSE", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Compute the GRQ factorization of matrices B and A: */ @@ -841,7 +841,7 @@ f"> */ if (*info > 0) { *info = 1; - return 0; + return; } /* Put the solution in X */ @@ -865,7 +865,7 @@ f"> */ if (*info > 0) { *info = 2; - return 0; + return; } /* Put the solutions in X */ @@ -902,7 +902,7 @@ f"> */ i__1 = lopt, i__2 = (integer) work[*p + mn + 1]; work[1] = (real) (*p + mn + f2cmax(i__1,i__2)); - return 0; + return; /* End of SGGLSE */ diff --git a/lapack-netlib/SRC/sgglse.f b/lapack-netlib/SRC/sgglse.f index 7ef8782b01..59addc3f47 100644 --- a/lapack-netlib/SRC/sgglse.f +++ b/lapack-netlib/SRC/sgglse.f @@ -276,7 +276,7 @@ SUBROUTINE SGGLSE( M, N, P, A, LDA, B, LDB, C, D, X, WORK, LWORK, * CALL SGGRQF( P, M, N, B, LDB, WORK, A, LDA, WORK( P+1 ), $ WORK( P+MN+1 ), LWORK-P-MN, INFO ) - LOPT = WORK( P+MN+1 ) + LOPT = INT( WORK( P+MN+1 ) ) * * Update c = Z**T *c = ( c1 ) N-P * ( c2 ) M+P-N diff --git a/lapack-netlib/SRC/sggqrf.c b/lapack-netlib/SRC/sggqrf.c index 2c683ad0f1..34f91595e3 100644 --- a/lapack-netlib/SRC/sggqrf.c +++ b/lapack-netlib/SRC/sggqrf.c @@ -728,7 +728,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sggqrf_(integer *n, integer *m, integer *p, real *a, +/* Subroutine */ void sggqrf_(integer *n, integer *m, integer *p, real *a, integer *lda, real *taua, real *b, integer 
*ldb, real *taub, real * work, integer *lwork, integer *info) { @@ -740,13 +740,13 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int sgeqrf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgeqrf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *), sgerqf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer * ); integer nb1, nb2, nb3, lwkopt; logical lquery; - extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); @@ -808,9 +808,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGGQRF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* QR factorization of N-by-M matrix A: A = Q*R */ @@ -834,7 +834,7 @@ f"> */ i__1 = lopt, i__2 = (integer) work[1]; work[1] = (real) f2cmax(i__1,i__2); - return 0; + return; /* End of SGGQRF */ diff --git a/lapack-netlib/SRC/sggqrf.f b/lapack-netlib/SRC/sggqrf.f index c57b16a563..59b498da56 100644 --- a/lapack-netlib/SRC/sggqrf.f +++ b/lapack-netlib/SRC/sggqrf.f @@ -276,7 +276,7 @@ SUBROUTINE SGGQRF( N, M, P, A, LDA, TAUA, B, LDB, TAUB, WORK, * QR factorization of N-by-M matrix A: A = Q*R * CALL SGEQRF( N, M, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = WORK( 1 ) + LOPT = INT( WORK( 1 ) ) * * Update B := Q**T*B. 
* diff --git a/lapack-netlib/SRC/sggrqf.c b/lapack-netlib/SRC/sggrqf.c index b4a32dda23..02fd8dacc8 100644 --- a/lapack-netlib/SRC/sggrqf.c +++ b/lapack-netlib/SRC/sggrqf.c @@ -727,7 +727,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sggrqf_(integer *m, integer *p, integer *n, real *a, +/* Subroutine */ void sggrqf_(integer *m, integer *p, integer *n, real *a, integer *lda, real *taua, real *b, integer *ldb, real *taub, real * work, integer *lwork, integer *info) { @@ -739,13 +739,13 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int sgeqrf_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgeqrf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *), sgerqf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer * ); integer nb1, nb2, nb3, lwkopt; logical lquery; - extern /* Subroutine */ int sormrq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormrq_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); @@ -807,9 +807,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGGRQF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* RQ factorization of M-by-N matrix A: A = R*Q */ @@ -835,7 +835,7 @@ f"> */ i__1 = lopt, i__2 = (integer) work[1]; work[1] = (real) f2cmax(i__1,i__2); - return 0; + return; /* End of SGGRQF */ diff --git a/lapack-netlib/SRC/sggrqf.f b/lapack-netlib/SRC/sggrqf.f index c4a78c3477..8b7d4786aa 100644 --- a/lapack-netlib/SRC/sggrqf.f +++ b/lapack-netlib/SRC/sggrqf.f @@ -275,7 +275,7 @@ SUBROUTINE SGGRQF( M, P, N, A, LDA, TAUA, B, LDB, TAUB, WORK, * RQ factorization of M-by-N matrix A: A = R*Q 
* CALL SGERQF( M, N, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = WORK( 1 ) + LOPT = INT( WORK( 1 ) ) * * Update B := B*Q**T * diff --git a/lapack-netlib/SRC/sggsvd3.c b/lapack-netlib/SRC/sggsvd3.c index a8c458ef30..d17bf8e865 100644 --- a/lapack-netlib/SRC/sggsvd3.c +++ b/lapack-netlib/SRC/sggsvd3.c @@ -861,7 +861,7 @@ static integer c__1 = 1; /* > SGGSVD3 replaces the deprecated subroutine SGGSVD. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sggsvd3_(char *jobu, char *jobv, char *jobq, integer *m, +/* Subroutine */ void sggsvd3_(char *jobu, char *jobv, char *jobq, integer *m, integer *n, integer *p, integer *k, integer *l, real *a, integer *lda, real *b, integer *ldb, real *alpha, real *beta, real *u, integer * ldu, real *v, integer *ldv, real *q, integer *ldq, real *work, @@ -880,19 +880,20 @@ static integer c__1 = 1; extern logical lsame_(char *, char *); real anorm, bnorm; logical wantq; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); logical wantu, wantv; extern real slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), stgsja_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void stgsja_( char *, char *, char *, integer *, integer *, integer *, integer * , integer *, real *, integer *, real *, integer *, real *, real *, real *, real *, real *, integer *, real *, integer *, real *, integer *, real *, integer *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int sggsvp3_(char *, char *, char *, integer *, + extern /* Subroutine */ void sggsvp3_(char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real * , real *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, integer *, 
real *, real *, integer * @@ -986,10 +987,10 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("SGGSVD3", &i__1, (ftnlen)7); - return 0; + return; } if (lquery) { - return 0; + return; } /* Compute the Frobenius norm of matrices A and B */ @@ -1052,7 +1053,7 @@ static integer c__1 = 1; } work[1] = (real) lwkopt; - return 0; + return; /* End of SGGSVD3 */ diff --git a/lapack-netlib/SRC/sggsvp3.c b/lapack-netlib/SRC/sggsvp3.c index c18609af1f..b01d80f7e0 100644 --- a/lapack-netlib/SRC/sggsvp3.c +++ b/lapack-netlib/SRC/sggsvp3.c @@ -785,7 +785,7 @@ static real c_b24 = 1.f; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sggsvp3_(char *jobu, char *jobv, char *jobq, integer *m, +/* Subroutine */ void sggsvp3_(char *jobu, char *jobv, char *jobq, integer *m, integer *p, integer *n, real *a, integer *lda, real *b, integer *ldb, real *tola, real *tolb, integer *k, integer *l, real *u, integer *ldu, real *v, integer *ldv, real *q, integer *ldq, integer *iwork, real * @@ -800,7 +800,7 @@ static real c_b24 = 1.f; integer i__, j; extern logical lsame_(char *, char *); logical wantq, wantu, wantv; - extern /* Subroutine */ int sgeqp3_(integer *, integer *, real *, integer + extern /* Subroutine */ void sgeqp3_(integer *, integer *, real *, integer *, integer *, real *, real *, integer *, integer *), sgeqr2_( integer *, integer *, real *, integer *, real *, real *, integer * ), sgerq2_(integer *, integer *, real *, integer *, real *, real * @@ -809,8 +809,9 @@ static real c_b24 = 1.f; integer *, integer *, integer *, real *, integer *, real *, real * , integer *, real *, integer *), sormr2_(char *, char *, integer *, integer *, integer *, real *, integer *, real * - , real *, integer *, real *, integer *), xerbla_( - char *, integer *, ftnlen), slacpy_(char *, integer *, integer *, + , real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern 
void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *), slapmt_(logical *, integer *, integer *, real *, integer *, @@ -917,10 +918,10 @@ static real c_b24 = 1.f; if (*info != 0) { i__1 = -(*info); xerbla_("SGGSVP3", &i__1, (ftnlen)7); - return 0; + return; } if (lquery) { - return 0; + return; } /* QR with column pivoting of B: B*P = V*( S11 S12 ) */ @@ -1172,7 +1173,7 @@ static real c_b24 = 1.f; } work[1] = (real) lwkopt; - return 0; + return; /* End of SGGSVP3 */ diff --git a/lapack-netlib/SRC/sgsvj0.c b/lapack-netlib/SRC/sgsvj0.c index feae7303f3..e969997da9 100644 --- a/lapack-netlib/SRC/sgsvj0.c +++ b/lapack-netlib/SRC/sgsvj0.c @@ -732,7 +732,7 @@ f"> */ /* > drmac@math.hr. Thank you. */ /* ===================================================================== */ -/* Subroutine */ int sgsvj0_(char *jobv, integer *m, integer *n, real *a, +/* Subroutine */ void sgsvj0_(char *jobv, integer *m, integer *n, real *a, integer *lda, real *d__, real *sva, integer *mv, real *v, integer * ldv, real *eps, real *sfmin, real *tol, integer *nsweep, real *work, integer *lwork, integer *info) @@ -755,22 +755,22 @@ f"> */ extern logical lsame_(char *, char *); real theta, small, fastr[5]; logical applv, rsvec; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); logical rotok; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *), srotm_(integer *, real *, integer *, real *, integer * , real *); real rootsfmin, cs, sn; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer ijblsk, swband; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine 
*/ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); extern integer isamax_(integer *, real *, integer *); integer blskip; real mxaapq, thsign; - extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slassq_(integer *, real *, integer *, real *, real *); real mxsinj; integer ir1, emptsw, notrot, iswrot, jbc; @@ -832,7 +832,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGSVJ0", &i__1, (ftnlen)6); - return 0; + return; } if (rsvec) { @@ -1711,6 +1711,6 @@ f"> */ /* L5991: */ } - return 0; + return; } /* sgsvj0_ */ diff --git a/lapack-netlib/SRC/sgsvj1.c b/lapack-netlib/SRC/sgsvj1.c index 10bc943248..804a2ec295 100644 --- a/lapack-netlib/SRC/sgsvj1.c +++ b/lapack-netlib/SRC/sgsvj1.c @@ -751,7 +751,7 @@ f"> */ /* > Zlatko Drmac (Zagreb, Croatia) and Kresimir Veselic (Hagen, Germany) */ /* ===================================================================== */ -/* Subroutine */ int sgsvj1_(char *jobv, integer *m, integer *n, integer *n1, +/* Subroutine */ void sgsvj1_(char *jobv, integer *m, integer *n, integer *n1, real *a, integer *lda, real *d__, real *sva, integer *mv, real *v, integer *ldv, real *eps, real *sfmin, real *tol, integer *nsweep, real *work, integer *lwork, integer *info) @@ -775,22 +775,22 @@ f"> */ extern logical lsame_(char *, char *); real theta, small, fastr[5]; logical applv, rsvec; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); logical rotok; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *), srotm_(integer *, real *, integer *, real *, integer * , real *); real rootsfmin, cs, sn; extern /* Subroutine */ int xerbla_(char *, integer *, 
ftnlen); integer ijblsk, swband; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); extern integer isamax_(integer *, real *, integer *); integer blskip; real mxaapq, thsign; - extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slassq_(integer *, real *, integer *, real *, real *); real mxsinj; integer emptsw, notrot, iswrot, jbc; @@ -854,7 +854,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGSVJ1", &i__1, (ftnlen)6); - return 0; + return; } if (rsvec) { @@ -1353,6 +1353,6 @@ f"> */ /* L5991: */ } - return 0; + return; } /* sgsvj1_ */ diff --git a/lapack-netlib/SRC/sgtcon.c b/lapack-netlib/SRC/sgtcon.c index 9f4e5cbdff..93bcb4e905 100644 --- a/lapack-netlib/SRC/sgtcon.c +++ b/lapack-netlib/SRC/sgtcon.c @@ -658,7 +658,7 @@ f"> */ /* > \ingroup realGTcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgtcon_(char *norm, integer *n, real *dl, real *d__, +/* Subroutine */ void sgtcon_(char *norm, integer *n, real *dl, real *d__, real *du, real *du2, integer *ipiv, real *anorm, real *rcond, real * work, integer *iwork, integer *info) { @@ -669,11 +669,12 @@ f"> */ integer kase, kase1, i__; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int slacn2_(integer *, real *, real *, integer *, - real *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void slacn2_(integer *, real *, real *, integer *, + real *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real ainvnm; logical onenrm; - extern /* Subroutine */ int sgttrs_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgttrs_(char *, integer *, integer *, real *, real *, real *, real *, integer *, real *, integer *, integer 
*); @@ -710,7 +711,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGTCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -718,9 +719,9 @@ f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm == 0.f) { - return 0; + return; } /* Check that D(1:N) is non-zero. */ @@ -728,7 +729,7 @@ f"> */ i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { if (d__[i__] == 0.f) { - return 0; + return; } /* L10: */ } @@ -765,7 +766,7 @@ f"> */ *rcond = 1.f / ainvnm / *anorm; } - return 0; + return; /* End of SGTCON */ diff --git a/lapack-netlib/SRC/sgtrfs.c b/lapack-netlib/SRC/sgtrfs.c index 277b3f826e..0b1a60c4a1 100644 --- a/lapack-netlib/SRC/sgtrfs.c +++ b/lapack-netlib/SRC/sgtrfs.c @@ -722,7 +722,7 @@ f"> */ /* > \ingroup realGTcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgtrfs_(char *trans, integer *n, integer *nrhs, real *dl, +/* Subroutine */ void sgtrfs_(char *trans, integer *n, integer *nrhs, real *dl, real *d__, real *du, real *dlf, real *df, real *duf, real *du2, integer *ipiv, real *b, integer *ldb, real *x, integer *ldx, real * ferr, real *berr, real *work, integer *iwork, integer *info) @@ -738,20 +738,21 @@ f"> */ real s; extern logical lsame_(char *, char *); integer isave[3], count; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *), slacn2_(integer *, real *, real *, integer *, real *, integer *, integer *); extern real slamch_(char *); integer nz; real safmin; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slagtm_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slagtm_( char *, integer *, integer *, real *, real *, real *, real *, real *, integer *, real *, real *, integer *); logical notran; char 
transn[1], transt[1]; real lstres; - extern /* Subroutine */ int sgttrs_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgttrs_(char *, integer *, integer *, real *, real *, real *, real *, integer *, real *, integer *, integer *); real eps; @@ -805,7 +806,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGTRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -817,7 +818,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } if (notran) { @@ -1029,7 +1030,7 @@ f"> */ /* L110: */ } - return 0; + return; /* End of SGTRFS */ diff --git a/lapack-netlib/SRC/sgtsv.c b/lapack-netlib/SRC/sgtsv.c index 7e2f4487eb..3dfa4eead7 100644 --- a/lapack-netlib/SRC/sgtsv.c +++ b/lapack-netlib/SRC/sgtsv.c @@ -636,7 +636,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup realGTsolve */ /* ===================================================================== */ -/* Subroutine */ int sgtsv_(integer *n, integer *nrhs, real *dl, real *d__, +/* Subroutine */ void sgtsv_(integer *n, integer *nrhs, real *dl, real *d__, real *du, real *b, integer *ldb, integer *info) { /* System generated locals */ @@ -678,11 +678,11 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("SGTSV ", &i__1, (ftnlen)5); - return 0; + return; } if (*n == 0) { - return 0; + return; } if (*nrhs == 1) { @@ -698,7 +698,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ b[i__ + 1 + b_dim1] -= fact * b[i__ + b_dim1]; } else { *info = i__; - return 0; + return; } dl[i__] = 0.f; } else { @@ -727,7 +727,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ b[i__ + 1 + b_dim1] -= fact * b[i__ + b_dim1]; } else { *info = i__; - return 0; + return; } } else { fact = d__[i__] / dl[i__]; @@ -742,7 +742,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex 
*x, integ } if (d__[*n] == 0.f) { *info = *n; - return 0; + return; } } else { i__1 = *n - 2; @@ -761,7 +761,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } else { *info = i__; - return 0; + return; } dl[i__] = 0.f; } else { @@ -799,7 +799,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } else { *info = i__; - return 0; + return; } } else { fact = d__[i__] / dl[i__]; @@ -819,7 +819,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } if (d__[*n] == 0.f) { *info = *n; - return 0; + return; } } @@ -861,7 +861,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } - return 0; + return; /* End of SGTSV */ diff --git a/lapack-netlib/SRC/sgtsvx.c b/lapack-netlib/SRC/sgtsvx.c index cea42826b0..f2466262be 100644 --- a/lapack-netlib/SRC/sgtsvx.c +++ b/lapack-netlib/SRC/sgtsvx.c @@ -804,7 +804,7 @@ f"> */ /* > \ingroup realGTsolve */ /* ===================================================================== */ -/* Subroutine */ int sgtsvx_(char *fact, char *trans, integer *n, integer * +/* Subroutine */ void sgtsvx_(char *fact, char *trans, integer *n, integer * nrhs, real *dl, real *d__, real *du, real *dlf, real *df, real *duf, real *du2, integer *ipiv, real *b, integer *ldb, real *x, integer * ldx, real *rcond, real *ferr, real *berr, real *work, integer *iwork, @@ -817,18 +817,18 @@ f"> */ char norm[1]; extern logical lsame_(char *, char *); real anorm; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); extern real slamch_(char *); logical nofact; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern real slangt_(char *, integer *, real *, real *, real *); - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer 
*, real *, integer *, real *, integer *), sgtcon_(char *, integer *, real *, real *, real *, real *, integer *, real *, real *, real *, integer *, integer *); logical notran; - extern /* Subroutine */ int sgtrfs_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgtrfs_(char *, integer *, integer *, real *, real *, real *, real *, real *, real *, real *, integer *, real *, integer *, real *, integer *, real *, real *, real *, integer *, integer *), sgttrf_(integer *, real *, real *, real *, @@ -887,7 +887,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGTSVX", &i__1, (ftnlen)6); - return 0; + return; } if (nofact) { @@ -907,7 +907,7 @@ f"> */ if (*info > 0) { *rcond = 0.f; - return 0; + return; } } @@ -944,7 +944,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of SGTSVX */ diff --git a/lapack-netlib/SRC/sgttrf.c b/lapack-netlib/SRC/sgttrf.c index b8580a6460..e2b981e2b7 100644 --- a/lapack-netlib/SRC/sgttrf.c +++ b/lapack-netlib/SRC/sgttrf.c @@ -633,7 +633,7 @@ f"> */ /* > \ingroup realGTcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgttrf_(integer *n, real *dl, real *d__, real *du, real * +/* Subroutine */ void sgttrf_(integer *n, real *dl, real *d__, real *du, real * du2, integer *ipiv, integer *info) { /* System generated locals */ @@ -668,13 +668,13 @@ f"> */ *info = -1; i__1 = -(*info); xerbla_("SGTTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Initialize IPIV(i) = i and DU2(I) = 0 */ @@ -748,7 +748,7 @@ f"> */ } L50: - return 0; + return; /* End of SGTTRF */ diff --git a/lapack-netlib/SRC/sgttrs.c b/lapack-netlib/SRC/sgttrs.c index b8ca4cc57e..07d58f48a5 100644 --- a/lapack-netlib/SRC/sgttrs.c +++ b/lapack-netlib/SRC/sgttrs.c @@ -651,7 +651,7 @@ f"> */ /* > \ingroup realGTcomputational */ /* ===================================================================== */ -/* Subroutine */ 
int sgttrs_(char *trans, integer *n, integer *nrhs, real *dl, +/* Subroutine */ void sgttrs_(char *trans, integer *n, integer *nrhs, real *dl, real *d__, real *du, real *du2, integer *ipiv, real *b, integer *ldb, integer *info) { @@ -660,9 +660,9 @@ f"> */ /* Local variables */ integer j, jb, nb; - extern /* Subroutine */ int sgtts2_(integer *, integer *, integer *, real - *, real *, real *, real *, integer *, real *, integer *), xerbla_( - char *, integer *, ftnlen); + extern /* Subroutine */ void sgtts2_(integer *, integer *, integer *, real + *, real *, real *, real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer itrans; @@ -705,13 +705,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SGTTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* Decode TRANS */ @@ -751,6 +751,6 @@ f"> */ /* End of SGTTRS */ - return 0; + return; } /* sgttrs_ */ diff --git a/lapack-netlib/SRC/sgtts2.c b/lapack-netlib/SRC/sgtts2.c index d0c675d294..685e507901 100644 --- a/lapack-netlib/SRC/sgtts2.c +++ b/lapack-netlib/SRC/sgtts2.c @@ -638,7 +638,7 @@ f"> */ /* > \ingroup realGTcomputational */ /* ===================================================================== */ -/* Subroutine */ int sgtts2_(integer *itrans, integer *n, integer *nrhs, real +/* Subroutine */ void sgtts2_(integer *itrans, integer *n, integer *nrhs, real *dl, real *d__, real *du, real *du2, integer *ipiv, real *b, integer * ldb) { @@ -673,7 +673,7 @@ f"> */ /* Function Body */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (*itrans == 0) { @@ -825,6 +825,6 @@ f"> */ /* End of SGTTS2 */ - return 0; + return; } /* sgtts2_ */ diff --git a/lapack-netlib/SRC/shgeqz.c b/lapack-netlib/SRC/shgeqz.c index e1ff562547..bd21f39633 100644 --- a/lapack-netlib/SRC/shgeqz.c +++ 
b/lapack-netlib/SRC/shgeqz.c @@ -818,7 +818,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int shgeqz_(char *job, char *compq, char *compz, integer *n, +/* Subroutine */ void shgeqz_(char *job, char *compq, char *compz, integer *n, integer *ilo, integer *ihi, real *h__, integer *ldh, real *t, integer *ldt, real *alphar, real *alphai, real *beta, real *q, integer *ldq, real *z__, integer *ldz, real *work, integer *lwork, integer *info) @@ -830,7 +830,7 @@ f"> */ /* Local variables */ real ad11l, ad12l, ad21l, ad22l, ad32l, wabs, atol, btol, temp; - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *), slag2_(real *, integer *, real *, integer *, real *, real *, real *, real *, real *, real *); real temp2, s1inv, c__; @@ -847,7 +847,7 @@ f"> */ integer jc; extern real slapy3_(real *, real *, real *); real an, bn, cl; - extern /* Subroutine */ int slasv2_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slasv2_(real *, real *, real *, real *, real * , real *, real *, real *, real *); real cq, cr; integer in; @@ -856,7 +856,7 @@ f"> */ real cz, sl, w12, w21, w22, wi, sr; extern real slamch_(char *); real vs, wr, safmin; - extern /* Subroutine */ int slarfg_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slarfg_(integer *, real *, real *, integer *, real *); real safmax; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -874,7 +874,7 @@ f"> */ integer istart; logical ilpivt; real a2r, b1r, b2i, b2r; - extern /* Subroutine */ int slartg_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slartg_(real *, real *, real *, real *, real * ), slaset_(char *, integer *, integer *, real *, real *, real *, integer *); logical lquery; @@ -985,16 +985,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); 
xerbla_("SHGEQZ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { work[1] = 1.f; - return 0; + return; } /* Initialize Q and Z */ @@ -2092,7 +2092,7 @@ f"> */ L420: work[1] = (real) (*n); - return 0; + return; /* End of SHGEQZ */ diff --git a/lapack-netlib/SRC/shgeqz.f b/lapack-netlib/SRC/shgeqz.f index 79a9c60925..6543f8cb18 100644 --- a/lapack-netlib/SRC/shgeqz.f +++ b/lapack-netlib/SRC/shgeqz.f @@ -337,9 +337,9 @@ SUBROUTINE SHGEQZ( JOB, COMPQ, COMPZ, N, ILO, IHI, H, LDH, T, LDT, $ BTOL, C, C11I, C11R, C12, C21, C22I, C22R, CL, $ CQ, CR, CZ, ESHIFT, S, S1, S1INV, S2, SAFMAX, $ SAFMIN, SCALE, SL, SQI, SQR, SR, SZI, SZR, T1, - $ TAU, TEMP, TEMP2, TEMPI, TEMPR, U1, U12, U12L, - $ U2, ULP, VS, W11, W12, W21, W22, WABS, WI, WR, - $ WR2 + $ T2, T3, TAU, TEMP, TEMP2, TEMPI, TEMPR, U1, + $ U12, U12L, U2, ULP, VS, W11, W12, W21, W22, + $ WABS, WI, WR, WR2 * .. * .. Local Arrays .. REAL V( 3 ) @@ -536,9 +536,7 @@ SUBROUTINE SHGEQZ( JOB, COMPQ, COMPZ, N, ILO, IHI, H, LDH, T, LDT, END IF END IF * - IF( ABS( T( ILAST, ILAST ) ).LE.MAX( SAFMIN, ULP*( - $ ABS( T( ILAST - 1, ILAST ) ) + ABS( T( ILAST-1, ILAST-1 ) - $ ) ) ) ) THEN + IF( ABS( T( ILAST, ILAST ) ).LE.BTOL ) THEN T( ILAST, ILAST ) = ZERO GO TO 70 END IF @@ -564,10 +562,7 @@ SUBROUTINE SHGEQZ( JOB, COMPQ, COMPZ, N, ILO, IHI, H, LDH, T, LDT, * * Test 2: for T(j,j)=0 * - TEMP = ABS ( T( J, J + 1 ) ) - IF ( J .GT. 
ILO ) - $ TEMP = TEMP + ABS ( T( J - 1, J ) ) - IF( ABS( T( J, J ) ).LT.MAX( SAFMIN,ULP*TEMP ) ) THEN + IF( ABS( T( J, J ) ).LT.BTOL ) THEN T( J, J ) = ZERO * * Test 1a: Check for 2 consecutive small subdiagonals in A @@ -1132,25 +1127,27 @@ SUBROUTINE SHGEQZ( JOB, COMPQ, COMPZ, N, ILO, IHI, H, LDH, T, LDT, H( J+2, J-1 ) = ZERO END IF * + T2 = TAU * V( 2 ) + T3 = TAU * V( 3 ) DO 230 JC = J, ILASTM - TEMP = TAU*( H( J, JC )+V( 2 )*H( J+1, JC )+V( 3 )* - $ H( J+2, JC ) ) - H( J, JC ) = H( J, JC ) - TEMP - H( J+1, JC ) = H( J+1, JC ) - TEMP*V( 2 ) - H( J+2, JC ) = H( J+2, JC ) - TEMP*V( 3 ) - TEMP2 = TAU*( T( J, JC )+V( 2 )*T( J+1, JC )+V( 3 )* - $ T( J+2, JC ) ) - T( J, JC ) = T( J, JC ) - TEMP2 - T( J+1, JC ) = T( J+1, JC ) - TEMP2*V( 2 ) - T( J+2, JC ) = T( J+2, JC ) - TEMP2*V( 3 ) + TEMP = H( J, JC )+V( 2 )*H( J+1, JC )+V( 3 )* + $ H( J+2, JC ) + H( J, JC ) = H( J, JC ) - TEMP*TAU + H( J+1, JC ) = H( J+1, JC ) - TEMP*T2 + H( J+2, JC ) = H( J+2, JC ) - TEMP*T3 + TEMP2 = T( J, JC )+V( 2 )*T( J+1, JC )+V( 3 )* + $ T( J+2, JC ) + T( J, JC ) = T( J, JC ) - TEMP2*TAU + T( J+1, JC ) = T( J+1, JC ) - TEMP2*T2 + T( J+2, JC ) = T( J+2, JC ) - TEMP2*T3 230 CONTINUE IF( ILQ ) THEN DO 240 JR = 1, N - TEMP = TAU*( Q( JR, J )+V( 2 )*Q( JR, J+1 )+V( 3 )* - $ Q( JR, J+2 ) ) - Q( JR, J ) = Q( JR, J ) - TEMP - Q( JR, J+1 ) = Q( JR, J+1 ) - TEMP*V( 2 ) - Q( JR, J+2 ) = Q( JR, J+2 ) - TEMP*V( 3 ) + TEMP = Q( JR, J )+V( 2 )*Q( JR, J+1 )+V( 3 )* + $ Q( JR, J+2 ) + Q( JR, J ) = Q( JR, J ) - TEMP*TAU + Q( JR, J+1 ) = Q( JR, J+1 ) - TEMP*T2 + Q( JR, J+2 ) = Q( JR, J+2 ) - TEMP*T3 240 CONTINUE END IF * @@ -1238,27 +1235,29 @@ SUBROUTINE SHGEQZ( JOB, COMPQ, COMPZ, N, ILO, IHI, H, LDH, T, LDT, * * Apply transformations from the right. 
* + T2 = TAU*V( 2 ) + T3 = TAU*V( 3 ) DO 260 JR = IFRSTM, MIN( J+3, ILAST ) - TEMP = TAU*( H( JR, J )+V( 2 )*H( JR, J+1 )+V( 3 )* - $ H( JR, J+2 ) ) - H( JR, J ) = H( JR, J ) - TEMP - H( JR, J+1 ) = H( JR, J+1 ) - TEMP*V( 2 ) - H( JR, J+2 ) = H( JR, J+2 ) - TEMP*V( 3 ) + TEMP = H( JR, J )+V( 2 )*H( JR, J+1 )+V( 3 )* + $ H( JR, J+2 ) + H( JR, J ) = H( JR, J ) - TEMP*TAU + H( JR, J+1 ) = H( JR, J+1 ) - TEMP*T2 + H( JR, J+2 ) = H( JR, J+2 ) - TEMP*T3 260 CONTINUE DO 270 JR = IFRSTM, J + 2 - TEMP = TAU*( T( JR, J )+V( 2 )*T( JR, J+1 )+V( 3 )* - $ T( JR, J+2 ) ) - T( JR, J ) = T( JR, J ) - TEMP - T( JR, J+1 ) = T( JR, J+1 ) - TEMP*V( 2 ) - T( JR, J+2 ) = T( JR, J+2 ) - TEMP*V( 3 ) + TEMP = T( JR, J )+V( 2 )*T( JR, J+1 )+V( 3 )* + $ T( JR, J+2 ) + T( JR, J ) = T( JR, J ) - TEMP*TAU + T( JR, J+1 ) = T( JR, J+1 ) - TEMP*T2 + T( JR, J+2 ) = T( JR, J+2 ) - TEMP*T3 270 CONTINUE IF( ILZ ) THEN DO 280 JR = 1, N - TEMP = TAU*( Z( JR, J )+V( 2 )*Z( JR, J+1 )+V( 3 )* - $ Z( JR, J+2 ) ) - Z( JR, J ) = Z( JR, J ) - TEMP - Z( JR, J+1 ) = Z( JR, J+1 ) - TEMP*V( 2 ) - Z( JR, J+2 ) = Z( JR, J+2 ) - TEMP*V( 3 ) + TEMP = Z( JR, J )+V( 2 )*Z( JR, J+1 )+V( 3 )* + $ Z( JR, J+2 ) + Z( JR, J ) = Z( JR, J ) - TEMP*TAU + Z( JR, J+1 ) = Z( JR, J+1 ) - TEMP*T2 + Z( JR, J+2 ) = Z( JR, J+2 ) - TEMP*T3 280 CONTINUE END IF T( J+1, J ) = ZERO diff --git a/lapack-netlib/SRC/shsein.c b/lapack-netlib/SRC/shsein.c index 2d1dd81eee..ad906df84f 100644 --- a/lapack-netlib/SRC/shsein.c +++ b/lapack-netlib/SRC/shsein.c @@ -775,7 +775,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int shsein_(char *side, char *eigsrc, char *initv, logical * +/* Subroutine */ void shsein_(char *side, char *eigsrc, char *initv, logical * select, integer *n, real *h__, integer *ldh, real *wr, real *wi, real *vl, integer *ldvl, real *vr, integer *ldvr, integer *mm, integer *m, real *work, integer *ifaill, integer *ifailr, integer *info) @@ 
-795,10 +795,10 @@ f"> */ real hnorm; integer kl, kr; extern real slamch_(char *); - extern /* Subroutine */ int slaein_(logical *, logical *, integer *, real + extern /* Subroutine */ void slaein_(logical *, logical *, integer *, real *, integer *, real *, real *, real *, real *, real *, integer *, - real *, real *, real *, real *, integer *), xerbla_(char *, - integer *, ftnlen); + real *, real *, real *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real bignum; extern real slanhs_(char *, integer *, real *, integer *, real *); extern logical sisnan_(real *); @@ -896,13 +896,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SHSEIN", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ if (*n == 0) { - return 0; + return; } /* Set machine-dependent constants. */ @@ -975,7 +975,7 @@ f"> */ work[1]); if (sisnan_(&hnorm)) { *info = -6; - return 0; + return; } else if (hnorm > 0.f) { eps3 = hnorm * ulp; } else { @@ -1084,7 +1084,7 @@ f"> */ /* L120: */ } - return 0; + return; /* End of SHSEIN */ diff --git a/lapack-netlib/SRC/shseqr.c b/lapack-netlib/SRC/shseqr.c index c970be98b2..4ab1e4d257 100644 --- a/lapack-netlib/SRC/shseqr.c +++ b/lapack-netlib/SRC/shseqr.c @@ -832,7 +832,7 @@ f"> */ /* > of Matrix Analysis, volume 23, pages 948--973, 2002. 
*/ /* ===================================================================== */ -/* Subroutine */ int shseqr_(char *job, char *compz, integer *n, integer *ilo, +/* Subroutine */ void shseqr_(char *job, char *compz, integer *n, integer *ilo, integer *ihi, real *h__, integer *ldh, real *wr, real *wi, real *z__, integer *ldz, real *work, integer *lwork, integer *info) { @@ -848,14 +848,14 @@ f"> */ logical initz; real workl[49]; logical wantt, wantz; - extern /* Subroutine */ int slaqr0_(logical *, logical *, integer *, + extern /* Subroutine */ void slaqr0_(logical *, logical *, integer *, integer *, integer *, real *, integer *, real *, real *, integer * , integer *, real *, integer *, real *, integer *, integer *); real hl[2401] /* was [49][49] */; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slahqr_(logical *, logical *, integer *, + extern /* Subroutine */ void slahqr_(logical *, logical *, integer *, integer *, integer *, real *, integer *, real *, real *, integer * , integer *, real *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *, @@ -928,13 +928,13 @@ f"> */ i__1 = -(*info); xerbla_("SHSEQR", &i__1, (ftnlen)6); - return 0; + return; } else if (*n == 0) { /* ==== Quick return in case N = 0; nothing to do. 
==== */ - return 0; + return; } else if (lquery) { @@ -947,7 +947,7 @@ f"> */ /* Computing MAX */ r__1 = (real) f2cmax(1,*n); work[1] = f2cmax(r__1,work[1]); - return 0; + return; } else { @@ -978,7 +978,7 @@ f"> */ if (*ilo == *ihi) { wr[*ilo] = h__[*ilo + *ilo * h_dim1]; wi[*ilo] = 0.f; - return 0; + return; } /* ==== SLAHQR/SLAQR0 crossover point ==== */ @@ -1060,6 +1060,6 @@ f"> */ /* ==== End of SHSEQR ==== */ - return 0; + return; } /* shseqr_ */ diff --git a/lapack-netlib/SRC/sla_gbamv.c b/lapack-netlib/SRC/sla_gbamv.c index 97c990aebe..6b59f704dc 100644 --- a/lapack-netlib/SRC/sla_gbamv.c +++ b/lapack-netlib/SRC/sla_gbamv.c @@ -693,7 +693,7 @@ mv.f"> */ /* > \ingroup realGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int sla_gbamv_(integer *trans, integer *m, integer *n, +/* Subroutine */ void sla_gbamv_(integer *trans, integer *m, integer *n, integer *kl, integer *ku, real *alpha, real *ab, integer *ldab, real * x, integer *incx, real *beta, real *y, integer *incy) { @@ -752,13 +752,13 @@ mv.f"> */ } if (info != 0) { xerbla_("SLA_GBAMV ", &info, (ftnlen)10); - return 0; + return; } /* Quick return if possible. 
*/ if (*m == 0 || *n == 0 || *alpha == 0.f && *beta == 1.f) { - return 0; + return; } /* Set LENX and LENY, the lengths of the vectors x and y, and set */ @@ -931,7 +931,7 @@ mv.f"> */ } } - return 0; + return; /* End of SLA_GBAMV */ diff --git a/lapack-netlib/SRC/sla_gbrcond.c b/lapack-netlib/SRC/sla_gbrcond.c index fd94ad4096..1f18bee252 100644 --- a/lapack-netlib/SRC/sla_gbrcond.c +++ b/lapack-netlib/SRC/sla_gbrcond.c @@ -692,12 +692,12 @@ real sla_gbrcond_(char *trans, integer *n, integer *kl, integer *ku, real * integer kase, i__, j; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int slacn2_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slacn2_(integer *, real *, real *, integer *, real *, integer *, integer *); integer kd, ke; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int sgbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void sgbtrs_(char *, integer *, integer *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); real tmp; diff --git a/lapack-netlib/SRC/sla_gbrfsx_extended.c b/lapack-netlib/SRC/sla_gbrfsx_extended.c index e001795c27..e84e3b2e99 100644 --- a/lapack-netlib/SRC/sla_gbrfsx_extended.c +++ b/lapack-netlib/SRC/sla_gbrfsx_extended.c @@ -921,7 +921,7 @@ fsx_extended.f"> */ /* > \ingroup realGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int sla_gbrfsx_extended_(integer *prec_type__, integer * +/* Subroutine */ void sla_gbrfsx_extended_(integer *prec_type__, integer * trans_type__, integer *n, integer *kl, integer *ku, integer *nrhs, real *ab, integer *ldab, real *afb, integer *ldafb, integer *ipiv, logical *colequ, real *c__, real *b, integer *ldb, real *y, integer * @@ -939,13 +939,13 @@ fsx_extended.f"> */ /* Local variables */ real dx_x__, dz_z__, ymin; - extern /* Subroutine */ int sla_lin_berr_(integer *, 
integer *, integer * + extern /* Subroutine */ void sla_lin_berr_(integer *, integer *, integer * , real *, real *, real *), blas_sgbmv_x__(integer *, integer *, integer *, integer *, integer *, real *, real *, integer *, real * , integer *, real *, real *, integer *, integer *); real dxratmax, dzratmax; integer y_prec_state__, i__, j, m; - extern /* Subroutine */ int blas_sgbmv2_x_(integer *, integer *, integer + extern /* Subroutine */ void blas_sgbmv2_x_(integer *, integer *, integer *, integer *, integer *, real *, real *, integer *, real *, real * , integer *, real *, real *, integer *, integer *), sla_gbamv__( integer *, integer *, integer *, integer *, integer *, real *, @@ -956,19 +956,19 @@ fsx_extended.f"> */ logical incr_prec__; real dzrat; char trans[1]; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); real normx, normy; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + extern /* Subroutine */ void saxpy_(integer *, real *, real *, integer *, real *, integer *); real myhugeval, prev_dz_z__, yk; extern real slamch_(char *); real final_dx_x__, final_dz_z__; - extern /* Subroutine */ int sgbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void sgbtrs_(char *, integer *, integer *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); real normdx; - extern /* Subroutine */ int sla_wwaddw_(integer *, real *, real *, real * + extern /* Subroutine */ void sla_wwaddw_(integer *, real *, real *, real * ); extern /* Character */ VOID chla_transtype_(char *, integer *); real prevnormdx; @@ -1016,7 +1016,7 @@ fsx_extended.f"> */ /* Function Body */ if (*info != 0) { - return 0; + return; } chla_transtype_(ch__1, trans_type__); *(unsigned char *)trans = *(unsigned char *)&ch__1[0]; @@ -1254,6 +1254,6 @@ fsx_extended.f"> */ } - return 0; + return; } /* sla_gbrfsx_extended__ */ 
diff --git a/lapack-netlib/SRC/sla_geamv.c b/lapack-netlib/SRC/sla_geamv.c index a30abd28a0..338f6070e7 100644 --- a/lapack-netlib/SRC/sla_geamv.c +++ b/lapack-netlib/SRC/sla_geamv.c @@ -682,7 +682,7 @@ mv.f"> */ /* > \ingroup realGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int sla_geamv_(integer *trans, integer *m, integer *n, real +/* Subroutine */ void sla_geamv_(integer *trans, integer *m, integer *n, real *alpha, real *a, integer *lda, real *x, integer *incx, real *beta, real *y, integer *incy) { @@ -738,13 +738,13 @@ mv.f"> */ } if (info != 0) { xerbla_("SLA_GEAMV ", &info, (ftnlen)10); - return 0; + return; } /* Quick return if possible. */ if (*m == 0 || *n == 0 || *alpha == 0.f && *beta == 1.f) { - return 0; + return; } /* Set LENX and LENY, the lengths of the vectors x and y, and set */ @@ -895,7 +895,7 @@ mv.f"> */ } } - return 0; + return; /* End of SLA_GEAMV */ diff --git a/lapack-netlib/SRC/sla_gercond.c b/lapack-netlib/SRC/sla_gercond.c index 448604d24d..72198dc497 100644 --- a/lapack-netlib/SRC/sla_gercond.c +++ b/lapack-netlib/SRC/sla_gercond.c @@ -674,10 +674,11 @@ real sla_gercond_(char *trans, integer *n, real *a, integer *lda, real *af, integer kase, i__, j; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int slacn2_(integer *, real *, real *, integer *, - real *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void slacn2_(integer *, real *, real *, integer *, + real *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int sgetrs_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgetrs_(char *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); real tmp; logical notrans; diff --git a/lapack-netlib/SRC/sla_gerfsx_extended.c b/lapack-netlib/SRC/sla_gerfsx_extended.c index 13cba47c01..ca801473a2 
100644 --- a/lapack-netlib/SRC/sla_gerfsx_extended.c +++ b/lapack-netlib/SRC/sla_gerfsx_extended.c @@ -909,7 +909,7 @@ fsx_extended.f"> */ /* > \ingroup realGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int sla_gerfsx_extended_(integer *prec_type__, integer * +/* Subroutine */ void sla_gerfsx_extended_(integer *prec_type__, integer * trans_type__, integer *n, integer *nrhs, real *a, integer *lda, real * af, integer *ldaf, integer *ipiv, logical *colequ, real *c__, real *b, integer *ldb, real *y, integer *ldy, real *berr_out__, integer * @@ -926,15 +926,15 @@ fsx_extended.f"> */ /* Local variables */ real dx_x__, dz_z__, ymin; - extern /* Subroutine */ int sla_lin_berr_(integer *, integer *, integer * + extern /* Subroutine */ void sla_lin_berr_(integer *, integer *, integer * , real *, real *, real *); real dxratmax; - extern /* Subroutine */ int blas_sgemv_x_(integer *, integer *, integer * + extern /* Subroutine */ void blas_sgemv_x_(integer *, integer *, integer * , real *, real *, integer *, real *, integer *, real *, real *, integer *, integer *); real dzratmax; integer y_prec_state__, i__, j; - extern /* Subroutine */ int blas_sgemv2_x_(integer *, integer *, integer + extern /* Subroutine */ void blas_sgemv2_x_(integer *, integer *, integer *, real *, real *, integer *, real *, real *, integer *, real *, real *, integer *, integer *), sla_geamv_(integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, @@ -944,15 +944,15 @@ fsx_extended.f"> */ logical incr_prec__; real dzrat; char trans[1]; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); real normx, normy; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + extern /* Subroutine */ void saxpy_(integer *, real *, real *, integer *, real *, integer *); real myhugeval, prev_dz_z__, 
yk; extern real slamch_(char *); real final_dx_x__, final_dz_z__, normdx; - extern /* Subroutine */ int sgetrs_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgetrs_(char *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *), sla_wwaddw_(integer *, real *, real *, real *); extern /* Character */ VOID chla_transtype_(char *, integer *); @@ -1001,7 +1001,7 @@ fsx_extended.f"> */ /* Function Body */ if (*info != 0) { - return 0; + return; } chla_transtype_(ch__1, trans_type__); *(unsigned char *)trans = *(unsigned char *)&ch__1[0]; @@ -1238,6 +1238,6 @@ fsx_extended.f"> */ } - return 0; + return; } /* sla_gerfsx_extended__ */ diff --git a/lapack-netlib/SRC/sla_lin_berr.c b/lapack-netlib/SRC/sla_lin_berr.c index 0fdda2d8e9..199c8c2c20 100644 --- a/lapack-netlib/SRC/sla_lin_berr.c +++ b/lapack-netlib/SRC/sla_lin_berr.c @@ -610,7 +610,7 @@ _berr.f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sla_lin_berr_(integer *n, integer *nz, integer *nrhs, +/* Subroutine */ void sla_lin_berr_(integer *n, integer *nz, integer *nrhs, real *res, real *ayb, real *berr) { /* System generated locals */ @@ -667,6 +667,6 @@ _berr.f"> */ } } - return 0; + return; } /* sla_lin_berr__ */ diff --git a/lapack-netlib/SRC/sla_porcond.c b/lapack-netlib/SRC/sla_porcond.c index 7fe74c57e4..7fde8818aa 100644 --- a/lapack-netlib/SRC/sla_porcond.c +++ b/lapack-netlib/SRC/sla_porcond.c @@ -664,12 +664,12 @@ real sla_porcond_(char *uplo, integer *n, real *a, integer *lda, real *af, integer kase, i__, j; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int slacn2_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slacn2_(integer *, real *, real *, integer *, real *, integer *, integer *); logical up; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int 
spotrs_(char *, integer *, integer *, real *, + extern /* Subroutine */ void spotrs_(char *, integer *, integer *, real *, integer *, real *, integer *, integer *); real tmp; diff --git a/lapack-netlib/SRC/sla_porfsx_extended.c b/lapack-netlib/SRC/sla_porfsx_extended.c index 04eae3caa1..1f0f09e241 100644 --- a/lapack-netlib/SRC/sla_porfsx_extended.c +++ b/lapack-netlib/SRC/sla_porfsx_extended.c @@ -899,7 +899,7 @@ fsx_extended.f"> */ /* > \ingroup realPOcomputational */ /* ===================================================================== */ -/* Subroutine */ int sla_porfsx_extended_(integer *prec_type__, char *uplo, +/* Subroutine */ void sla_porfsx_extended_(integer *prec_type__, char *uplo, integer *n, integer *nrhs, real *a, integer *lda, real *af, integer * ldaf, logical *colequ, real *c__, real *b, integer *ldb, real *y, integer *ldy, real *berr_out__, integer *n_norms__, real * @@ -915,11 +915,11 @@ fsx_extended.f"> */ /* Local variables */ real dx_x__, dz_z__, ymin; - extern /* Subroutine */ int sla_lin_berr_(integer *, integer *, integer * + extern /* Subroutine */ void sla_lin_berr_(integer *, integer *, integer * , real *, real *, real *); real dxratmax, dzratmax; integer y_prec_state__, uplo2; - extern /* Subroutine */ int blas_ssymv_x_(integer *, integer *, real *, + extern /* Subroutine */ void blas_ssymv_x_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *, integer *); integer i__, j; @@ -927,21 +927,21 @@ fsx_extended.f"> */ real dxrat; logical incr_prec__; real dzrat; - extern /* Subroutine */ int blas_ssymv2_x_(integer *, integer *, real *, + extern /* Subroutine */ void blas_ssymv2_x_(integer *, integer *, real *, real *, integer *, real *, real *, integer *, real *, real *, integer *, integer *), scopy_(integer *, real *, integer *, real * , integer *); real normx, normy; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + extern /* Subroutine */ void saxpy_(integer *, real 
*, real *, integer *, real *, integer *), sla_syamv_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); real myhugeval, prev_dz_z__; - extern /* Subroutine */ int ssymv_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssymv_(char *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); real yk; extern real slamch_(char *); real final_dx_x__, final_dz_z__, normdx; - extern /* Subroutine */ int sla_wwaddw_(integer *, real *, real *, real * + extern /* Subroutine */ void sla_wwaddw_(integer *, real *, real *, real * ), spotrs_(char *, integer *, integer *, real *, integer *, real * , integer *, integer *); real prevnormdx; @@ -989,7 +989,7 @@ fsx_extended.f"> */ /* Function Body */ if (*info != 0) { - return 0; + return; } eps = slamch_("Epsilon"); myhugeval = slamch_("Overflow"); @@ -1214,6 +1214,6 @@ fsx_extended.f"> */ } - return 0; + return; } /* sla_porfsx_extended__ */ diff --git a/lapack-netlib/SRC/sla_syamv.c b/lapack-netlib/SRC/sla_syamv.c index d90b83cdd1..5222376b63 100644 --- a/lapack-netlib/SRC/sla_syamv.c +++ b/lapack-netlib/SRC/sla_syamv.c @@ -686,7 +686,7 @@ mv.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sla_syamv_(integer *uplo, integer *n, real *alpha, real +/* Subroutine */ void sla_syamv_(integer *uplo, integer *n, real *alpha, real *a, integer *lda, real *x, integer *incx, real *beta, real *y, integer *incy) { @@ -739,13 +739,13 @@ mv.f"> */ } if (info != 0) { xerbla_("SLA_SYAMV", &info, (ftnlen)9); - return 0; + return; } /* Quick return if possible. */ if (*n == 0 || *alpha == 0.f && *beta == 1.f) { - return 0; + return; } /* Set up the start points in X and Y. 
*/ @@ -918,7 +918,7 @@ mv.f"> */ } } - return 0; + return; /* End of SLA_SYAMV */ diff --git a/lapack-netlib/SRC/sla_syrcond.c b/lapack-netlib/SRC/sla_syrcond.c index 4a4efba4fb..a61e0450e4 100644 --- a/lapack-netlib/SRC/sla_syrcond.c +++ b/lapack-netlib/SRC/sla_syrcond.c @@ -670,7 +670,7 @@ real sla_syrcond_(char *uplo, integer *n, real *a, integer *lda, real *af, integer kase, i__, j; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int slacn2_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slacn2_(integer *, real *, real *, integer *, real *, integer *, integer *); logical up; extern real slamch_(char *); @@ -678,7 +678,7 @@ real sla_syrcond_(char *uplo, integer *n, real *a, integer *lda, real *af, real ainvnm; char normin[1]; real smlnum; - extern /* Subroutine */ int ssytrs_(char *, integer *, integer *, real *, + extern /* Subroutine */ void ssytrs_(char *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); real tmp; diff --git a/lapack-netlib/SRC/sla_syrfsx_extended.c b/lapack-netlib/SRC/sla_syrfsx_extended.c index 898575d244..936d9a36b5 100644 --- a/lapack-netlib/SRC/sla_syrfsx_extended.c +++ b/lapack-netlib/SRC/sla_syrfsx_extended.c @@ -907,7 +907,7 @@ fsx_extended.f"> */ /* > \ingroup realSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int sla_syrfsx_extended_(integer *prec_type__, char *uplo, +/* Subroutine */ void sla_syrfsx_extended_(integer *prec_type__, char *uplo, integer *n, integer *nrhs, real *a, integer *lda, real *af, integer * ldaf, integer *ipiv, logical *colequ, real *c__, real *b, integer * ldb, real *y, integer *ldy, real *berr_out__, integer *n_norms__, @@ -923,11 +923,11 @@ fsx_extended.f"> */ /* Local variables */ real dx_x__, dz_z__, ymin; - extern /* Subroutine */ int sla_lin_berr_(integer *, integer *, integer * + extern /* Subroutine */ void sla_lin_berr_(integer *, integer *, 
integer * , real *, real *, real *); real dxratmax, dzratmax; integer y_prec_state__, uplo2; - extern /* Subroutine */ int blas_ssymv_x_(integer *, integer *, real *, + extern /* Subroutine */ void blas_ssymv_x_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *, integer *); integer i__, j; @@ -935,24 +935,24 @@ fsx_extended.f"> */ real dxrat; logical incr_prec__; real dzrat; - extern /* Subroutine */ int blas_ssymv2_x_(integer *, integer *, real *, + extern /* Subroutine */ void blas_ssymv2_x_(integer *, integer *, real *, real *, integer *, real *, real *, integer *, real *, real *, integer *, integer *); logical upper; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); real normx, normy; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + extern /* Subroutine */ void saxpy_(integer *, real *, real *, integer *, real *, integer *), sla_syamv_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); real myhugeval, prev_dz_z__; - extern /* Subroutine */ int ssymv_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssymv_(char *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); real yk; extern real slamch_(char *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real final_dx_x__, final_dz_z__, normdx; - extern /* Subroutine */ int sla_wwaddw_(integer *, real *, real *, real * + extern /* Subroutine */ void sla_wwaddw_(integer *, real *, real *, real * ), ssytrs_(char *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); real prevnormdx; @@ -1020,7 +1020,7 @@ fsx_extended.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLA_SYRFSX_EXTENDED", &i__1, (ftnlen)19); - return 0; + return; } eps = slamch_("Epsilon"); myhugeval = slamch_("Overflow"); @@ 
-1245,6 +1245,6 @@ fsx_extended.f"> */ } - return 0; + return; } /* sla_syrfsx_extended__ */ diff --git a/lapack-netlib/SRC/sla_wwaddw.c b/lapack-netlib/SRC/sla_wwaddw.c index 4b9b3d5cd1..c14f5676f2 100644 --- a/lapack-netlib/SRC/sla_wwaddw.c +++ b/lapack-netlib/SRC/sla_wwaddw.c @@ -590,7 +590,7 @@ ddw.f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sla_wwaddw_(integer *n, real *x, real *y, real *w) +/* Subroutine */ void sla_wwaddw_(integer *n, real *x, real *y, real *w) { /* System generated locals */ integer i__1; @@ -623,6 +623,6 @@ ddw.f"> */ x[i__] = s; /* L10: */ } - return 0; + return; } /* sla_wwaddw__ */ diff --git a/lapack-netlib/SRC/slabad.c b/lapack-netlib/SRC/slabad.c index 3cdfd0bd99..0c7fbe0a3b 100644 --- a/lapack-netlib/SRC/slabad.c +++ b/lapack-netlib/SRC/slabad.c @@ -585,7 +585,7 @@ f"> */ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slabad_(real *small, real *large) +/* Subroutine */ void slabad_(real *small, real *large) { /* -- LAPACK auxiliary routine (version 3.7.0) -- */ @@ -605,7 +605,7 @@ f"> */ *large = sqrt(*large); } - return 0; + return; /* End of SLABAD */ diff --git a/lapack-netlib/SRC/slabrd.c b/lapack-netlib/SRC/slabrd.c index 38a1af1965..5224ba7b87 100644 --- a/lapack-netlib/SRC/slabrd.c +++ b/lapack-netlib/SRC/slabrd.c @@ -725,7 +725,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slabrd_(integer *m, integer *n, integer *nb, real *a, +/* Subroutine */ void slabrd_(integer *m, integer *n, integer *nb, real *a, integer *lda, real *d__, real *e, real *tauq, real *taup, real *x, integer *ldx, real *y, integer *ldy) { @@ -735,7 +735,7 @@ f"> */ /* Local variables */ integer i__; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* 
Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), slarfg_( integer *, real *, real *, integer *, real *); @@ -769,7 +769,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0) { - return 0; + return; } if (*m >= *n) { @@ -999,7 +999,7 @@ f"> */ /* L20: */ } } - return 0; + return; /* End of SLABRD */ diff --git a/lapack-netlib/SRC/slacn2.c b/lapack-netlib/SRC/slacn2.c index a6f3aed6ae..a0d7bb4262 100644 --- a/lapack-netlib/SRC/slacn2.c +++ b/lapack-netlib/SRC/slacn2.c @@ -651,7 +651,7 @@ f"> */ /* > ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slacn2_(integer *n, real *v, real *x, integer *isgn, +/* Subroutine */ void slacn2_(integer *n, real *v, real *x, integer *isgn, real *est, integer *kase, integer *isave) { /* System generated locals */ @@ -662,7 +662,7 @@ f"> */ real temp; integer i__, jlast; extern real sasum_(integer *, real *, integer *); - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); extern integer isamax_(integer *, real *, integer *); real altsgn, estold; @@ -692,7 +692,7 @@ f"> */ } *kase = 1; isave[1] = 1; - return 0; + return; } switch (isave[1]) { @@ -723,7 +723,7 @@ f"> */ } *kase = 2; isave[1] = 2; - return 0; + return; /* ................ ENTRY (ISAVE( 1 ) = 2) */ /* FIRST ITERATION. X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X. */ @@ -743,7 +743,7 @@ f"> */ x[isave[2]] = 1.f; *kase = 1; isave[1] = 3; - return 0; + return; /* ................ ENTRY (ISAVE( 1 ) = 3) */ /* X HAS BEEN OVERWRITTEN BY A*X. */ @@ -777,7 +777,7 @@ f"> */ } *kase = 2; isave[1] = 4; - return 0; + return; /* ................ ENTRY (ISAVE( 1 ) = 4) */ /* X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X. 
*/ @@ -802,7 +802,7 @@ f"> */ } *kase = 1; isave[1] = 5; - return 0; + return; /* ................ ENTRY (ISAVE( 1 ) = 5) */ /* X HAS BEEN OVERWRITTEN BY A*X. */ @@ -816,7 +816,7 @@ f"> */ L150: *kase = 0; - return 0; + return; /* End of SLACN2 */ diff --git a/lapack-netlib/SRC/slacon.c b/lapack-netlib/SRC/slacon.c index b0a20acccd..ec92e00a09 100644 --- a/lapack-netlib/SRC/slacon.c +++ b/lapack-netlib/SRC/slacon.c @@ -630,7 +630,7 @@ f"> */ /* > ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slacon_(integer *n, real *v, real *x, integer *isgn, +/* Subroutine */ void slacon_(integer *n, real *v, real *x, integer *isgn, real *est, integer *kase) { /* System generated locals */ @@ -642,7 +642,7 @@ f"> */ static real temp; static integer jump, i__, j, jlast; extern real sasum_(integer *, real *, integer *); - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); extern integer isamax_(integer *, real *, integer *); static real altsgn, estold; @@ -671,7 +671,7 @@ f"> */ } *kase = 1; jump = 1; - return 0; + return; } switch (jump) { @@ -702,7 +702,7 @@ f"> */ } *kase = 2; jump = 2; - return 0; + return; /* ................ ENTRY (JUMP = 2) */ /* FIRST ITERATION. X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X. */ @@ -722,7 +722,7 @@ f"> */ x[j] = 1.f; *kase = 1; jump = 3; - return 0; + return; /* ................ ENTRY (JUMP = 3) */ /* X HAS BEEN OVERWRITTEN BY A*X. */ @@ -756,7 +756,7 @@ f"> */ } *kase = 2; jump = 4; - return 0; + return; /* ................ ENTRY (JUMP = 4) */ /* X HAS BEEN OVERWRITTEN BY TRANSPOSE(A)*X. */ @@ -781,7 +781,7 @@ f"> */ } *kase = 1; jump = 5; - return 0; + return; /* ................ ENTRY (JUMP = 5) */ /* X HAS BEEN OVERWRITTEN BY A*X. 
*/ @@ -795,7 +795,7 @@ f"> */ L150: *kase = 0; - return 0; + return; /* End of SLACON */ diff --git a/lapack-netlib/SRC/slacpy.c b/lapack-netlib/SRC/slacpy.c index 61cf0f7b9c..eeb42dfa19 100644 --- a/lapack-netlib/SRC/slacpy.c +++ b/lapack-netlib/SRC/slacpy.c @@ -612,7 +612,7 @@ f"> */ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slacpy_(char *uplo, integer *m, integer *n, real *a, +/* Subroutine */ void slacpy_(char *uplo, integer *m, integer *n, real *a, integer *lda, real *b, integer *ldb) { /* System generated locals */ @@ -672,7 +672,7 @@ f"> */ /* L60: */ } } - return 0; + return; /* End of SLACPY */ diff --git a/lapack-netlib/SRC/sladiv.c b/lapack-netlib/SRC/sladiv.c index 0703a90d31..5005ee1b28 100644 --- a/lapack-netlib/SRC/sladiv.c +++ b/lapack-netlib/SRC/sladiv.c @@ -602,7 +602,7 @@ f"> */ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int sladiv_(real *a, real *b, real *c__, real *d__, real *p, +/* Subroutine */ void sladiv_(real *a, real *b, real *c__, real *d__, real *p, real *q) { /* System generated locals */ @@ -611,7 +611,7 @@ f"> */ /* Local variables */ real s, aa, ab, bb, cc, cd, dd, be, un, ov; extern real slamch_(char *); - extern /* Subroutine */ int sladiv1_(real *, real *, real *, real *, real + extern /* Subroutine */ void sladiv1_(real *, real *, real *, real *, real *, real *); real eps; @@ -670,14 +670,14 @@ f"> */ *p *= s; *q *= s; - return 0; + return; /* End of SLADIV */ } /* sladiv_ */ /* > \ingroup realOTHERauxiliary */ -/* Subroutine */ int sladiv1_(real *a, real *b, real *c__, real *d__, real *p, +/* Subroutine */ void sladiv1_(real *a, real *b, real *c__, real *d__, real *p, real *q) { real r__, t; @@ -700,7 +700,7 @@ f"> */ *a = -(*a); *q = sladiv2_(b, a, c__, d__, &r__, &t); - return 0; + return; /* End of SLADIV1 */ diff --git a/lapack-netlib/SRC/slae2.c 
b/lapack-netlib/SRC/slae2.c index 6e4fdb6fa9..b97fe55829 100644 --- a/lapack-netlib/SRC/slae2.c +++ b/lapack-netlib/SRC/slae2.c @@ -613,7 +613,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slae2_(real *a, real *b, real *c__, real *rt1, real *rt2) +/* Subroutine */ void slae2_(real *a, real *b, real *c__, real *rt1, real *rt2) { /* System generated locals */ real r__1; @@ -682,7 +682,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rt1 = rt * .5f; *rt2 = rt * -.5f; } - return 0; + return; /* End of SLAE2 */ diff --git a/lapack-netlib/SRC/slaebz.c b/lapack-netlib/SRC/slaebz.c index 386a185a6f..24264032c2 100644 --- a/lapack-netlib/SRC/slaebz.c +++ b/lapack-netlib/SRC/slaebz.c @@ -827,7 +827,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slaebz_(integer *ijob, integer *nitmax, integer *n, +/* Subroutine */ void slaebz_(integer *ijob, integer *nitmax, integer *n, integer *mmax, integer *minp, integer *nbmin, real *abstol, real * reltol, real *pivmin, real *d__, real *e, real *e2, integer *nval, real *ab, real *c__, integer *mout, integer *nab, real *work, integer @@ -873,7 +873,7 @@ f"> */ *info = 0; if (*ijob < 1 || *ijob > 3) { *info = -1; - return 0; + return; } /* Initialize NAB */ @@ -911,7 +911,7 @@ f"> */ *mout = *mout + nab[ji + (nab_dim1 << 1)] - nab[ji + nab_dim1]; /* L30: */ } - return 0; + return; } /* Initialize for loop */ @@ -1027,7 +1027,7 @@ f"> */ /* L70: */ } if (*info != 0) { - return 0; + return; } kl = klnew; } else { @@ -1125,7 +1125,7 @@ f"> */ nab[ji + (nab_dim1 << 1)] = itmp1; } else { *info = *mmax + 1; - return 0; + return; } } else { @@ -1215,7 +1215,7 @@ f"> */ *info = f2cmax(i__1,0); *mout = kl; - return 0; + return; /* End of SLAEBZ */ diff --git 
a/lapack-netlib/SRC/slaed0.c b/lapack-netlib/SRC/slaed0.c index 33f7134c1a..930102b9f4 100644 --- a/lapack-netlib/SRC/slaed0.c +++ b/lapack-netlib/SRC/slaed0.c @@ -690,7 +690,7 @@ f"> */ /* > at Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int slaed0_(integer *icompq, integer *qsiz, integer *n, real +/* Subroutine */ void slaed0_(integer *icompq, integer *qsiz, integer *n, real *d__, real *e, real *q, integer *ldq, real *qstore, integer *ldqs, real *work, integer *iwork, integer *info) { @@ -701,14 +701,14 @@ f"> */ /* Local variables */ real temp; integer curr, i__, j, k; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer iperm, indxq, iwrem; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); integer iqptr, tlvls; - extern /* Subroutine */ int slaed1_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slaed1_(integer *, real *, real *, integer *, integer *, real *, integer *, real *, integer *, integer *), slaed7_(integer *, integer *, integer *, integer *, integer *, integer *, real *, real *, integer *, integer *, real *, integer * @@ -719,10 +719,10 @@ f"> */ extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer igivnm, submat; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); integer curprb, subpbs, igivpt, curlvl, matsiz, iprmpt, smlsiz; - extern /* Subroutine */ int ssteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssteqr_(char *, integer *, real *, real *, 
real *, integer *, real *, integer *); integer lgn, msd2, smm1, spm1, spm2; @@ -767,13 +767,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLAED0", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } smlsiz = ilaenv_(&c__9, "SLAED0", " ", &c__0, &c__0, &c__0, &c__0, ( @@ -823,10 +823,10 @@ f"> */ temp = log((real) (*n)) / log(2.f); lgn = (integer) temp; - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } iprmpt = indxq + *n + 1; @@ -992,7 +992,7 @@ f"> */ *info = submat * (*n + 1) + submat + matsiz - 1; L140: - return 0; + return; /* End of SLAED0 */ diff --git a/lapack-netlib/SRC/slaed1.c b/lapack-netlib/SRC/slaed1.c index 693e82ec80..d03d2425f7 100644 --- a/lapack-netlib/SRC/slaed1.c +++ b/lapack-netlib/SRC/slaed1.c @@ -677,7 +677,7 @@ f"> */ /* > Modified by Francoise Tisseur, University of Tennessee */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slaed1_(integer *n, real *d__, real *q, integer *ldq, +/* Subroutine */ void slaed1_(integer *n, real *d__, real *q, integer *ldq, integer *indxq, real *rho, integer *cutpnt, real *work, integer * iwork, integer *info) { @@ -686,17 +686,18 @@ f"> */ /* Local variables */ integer indx, i__, k, indxc, indxp; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); integer n1, n2; - extern /* Subroutine */ int slaed2_(integer *, integer *, integer *, real + extern /* Subroutine */ void slaed2_(integer *, integer *, integer *, real *, real *, integer *, integer *, real *, real *, real *, real *, real *, integer *, integer *, integer *, integer *, integer *), slaed3_(integer *, integer *, integer *, real *, real *, integer * , real *, real *, real *, integer *, integer *, real *, real *, 
integer *); integer idlmda, is, iw, iz; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slamrg_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slamrg_( integer *, integer *, real *, integer *, integer *, integer *); integer coltyp, iq2, cpp1; @@ -738,13 +739,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLAED1", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* The following values are integer pointers which indicate */ @@ -806,7 +807,7 @@ f"> */ } L20: - return 0; + return; /* End of SLAED1 */ diff --git a/lapack-netlib/SRC/slaed2.c b/lapack-netlib/SRC/slaed2.c index f4b9f61d04..0093a70531 100644 --- a/lapack-netlib/SRC/slaed2.c +++ b/lapack-netlib/SRC/slaed2.c @@ -726,7 +726,7 @@ f"> */ /* > Modified by Francoise Tisseur, University of Tennessee */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slaed2_(integer *k, integer *n, integer *n1, real *d__, +/* Subroutine */ void slaed2_(integer *k, integer *n, integer *n1, real *d__, real *q, integer *ldq, integer *indxq, real *rho, real *z__, real * dlamda, real *w, real *q2, integer *indx, integer *indxc, integer * indxp, integer *coltyp, integer *info) @@ -737,14 +737,14 @@ f"> */ /* Local variables */ integer imax, jmax, ctot[4]; - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); real c__; integer i__, j; real s, t; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer k2; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); integer n2; extern real slapy2_(real *, real *); @@ -752,7 +752,7 @@ 
f"> */ extern real slamch_(char *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer isamax_(integer *, real *, integer *); - extern /* Subroutine */ int slamrg_(integer *, integer *, real *, integer + extern /* Subroutine */ void slamrg_(integer *, integer *, real *, integer *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); integer iq1, iq2, n1p1; @@ -803,13 +803,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLAED2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } n2 = *n - *n1; @@ -1110,7 +1110,7 @@ f"> */ } L190: - return 0; + return; /* End of SLAED2 */ diff --git a/lapack-netlib/SRC/slaed3.c b/lapack-netlib/SRC/slaed3.c index 2acd3ba194..f3fb1aaf7d 100644 --- a/lapack-netlib/SRC/slaed3.c +++ b/lapack-netlib/SRC/slaed3.c @@ -700,7 +700,7 @@ f"> */ /* > Modified by Francoise Tisseur, University of Tennessee */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slaed3_(integer *k, integer *n, integer *n1, real *d__, +/* Subroutine */ void slaed3_(integer *k, integer *n, integer *n1, real *d__, real *q, integer *ldq, real *rho, real *dlamda, real *q2, integer * indx, integer *ctot, real *w, real *s, integer *info) { @@ -712,16 +712,17 @@ f"> */ real temp; extern real snrm2_(integer *, real *, integer *); integer i__, j; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *); integer n2; - extern /* Subroutine */ int slaed4_(integer *, integer *, real *, real *, + extern /* Subroutine */ void slaed4_(integer *, integer *, real *, real *, real *, real *, real *, integer *); extern real slamc3_(real *, real *); integer n12, 
ii, n23; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slacpy_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slacpy_( char *, integer *, integer *, real *, integer *, real *, integer * ), slaset_(char *, integer *, integer *, real *, real *, real *, integer *); @@ -764,13 +765,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLAED3", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*k == 0) { - return 0; + return; } /* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can */ @@ -901,7 +902,7 @@ f"> */ L120: - return 0; + return; /* End of SLAED3 */ diff --git a/lapack-netlib/SRC/slaed4.c b/lapack-netlib/SRC/slaed4.c index c6179d513a..009b5095f9 100644 --- a/lapack-netlib/SRC/slaed4.c +++ b/lapack-netlib/SRC/slaed4.c @@ -654,7 +654,7 @@ f"> */ /* > at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slaed4_(integer *n, integer *i__, real *d__, real *z__, +/* Subroutine */ void slaed4_(integer *n, integer *i__, real *d__, real *z__, real *delta, real *rho, real *dlam, integer *info) { /* System generated locals */ @@ -669,7 +669,7 @@ f"> */ real w, dltlb, dltub, midpt; integer niter; logical swtch; - extern /* Subroutine */ int slaed5_(integer *, real *, real *, real *, + extern /* Subroutine */ void slaed5_(integer *, real *, real *, real *, real *, real *), slaed6_(integer *, logical *, real *, real *, real *, real *, real *, integer *); logical swtch3; @@ -711,11 +711,11 @@ f"> */ *dlam = d__[1] + *rho * z__[1] * z__[1]; delta[1] = 1.f; - return 0; + return; } if (*n == 2) { slaed5_(i__, &d__[1], &z__[1], &delta[1], rho, dlam); - return 0; + return; } /* Compute machine epsilon */ @@ -1494,7 +1494,7 @@ f"> */ L250: - return 0; + return; /* End of SLAED4 */ diff --git a/lapack-netlib/SRC/slaed4.f b/lapack-netlib/SRC/slaed4.f index f056746d80..339c5029cc 100644 --- 
a/lapack-netlib/SRC/slaed4.f +++ b/lapack-netlib/SRC/slaed4.f @@ -328,9 +328,12 @@ SUBROUTINE SLAED4( N, I, D, Z, DELTA, RHO, DLAM, INFO ) IF( C.LT.ZERO ) $ C = ABS( C ) IF( C.EQ.ZERO ) THEN -* ETA = B/A +* ETA = B/A * ETA = RHO - TAU - ETA = DLTUB - TAU +* ETA = DLTUB - TAU +* +* Update proposed by Li, Ren-Cang: + ETA = -W / ( DPSI+DPHI ) ELSE IF( A.GE.ZERO ) THEN ETA = ( A+SQRT( ABS( A*A-FOUR*B*C ) ) ) / ( TWO*C ) ELSE diff --git a/lapack-netlib/SRC/slaed5.c b/lapack-netlib/SRC/slaed5.c index cf6b0b1e14..dc8417ef30 100644 --- a/lapack-netlib/SRC/slaed5.c +++ b/lapack-netlib/SRC/slaed5.c @@ -617,7 +617,7 @@ f"> */ /* > at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slaed5_(integer *i__, real *d__, real *z__, real *delta, +/* Subroutine */ void slaed5_(integer *i__, real *d__, real *z__, real *delta, real *rho, real *dlam) { /* System generated locals */ @@ -689,7 +689,7 @@ f"> */ delta[1] /= temp; delta[2] /= temp; } - return 0; + return; /* End OF SLAED5 */ diff --git a/lapack-netlib/SRC/slaed6.c b/lapack-netlib/SRC/slaed6.c index 96ede1af0e..ceeaad0b70 100644 --- a/lapack-netlib/SRC/slaed6.c +++ b/lapack-netlib/SRC/slaed6.c @@ -649,7 +649,7 @@ f"> */ /* > at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slaed6_(integer *kniter, logical *orgati, real *rho, +/* Subroutine */ void slaed6_(integer *kniter, logical *orgati, real *rho, real *d__, real *z__, real *finit, real *tau, integer *info) { /* System generated locals */ @@ -926,7 +926,7 @@ f"> */ if (scale) { *tau *= sclinv; } - return 0; + return; /* End of SLAED6 */ diff --git a/lapack-netlib/SRC/slaed7.c b/lapack-netlib/SRC/slaed7.c index 210d796d1f..b1fb60a0aa 100644 --- a/lapack-netlib/SRC/slaed7.c +++ b/lapack-netlib/SRC/slaed7.c @@ -775,7 +775,7 @@ f"> */ /* > at Berkeley, USA */ /* ===================================================================== 
*/ -/* Subroutine */ int slaed7_(integer *icompq, integer *n, integer *qsiz, +/* Subroutine */ void slaed7_(integer *icompq, integer *n, integer *qsiz, integer *tlvls, integer *curlvl, integer *curpbm, real *d__, real *q, integer *ldq, integer *indxq, real *rho, integer *cutpnt, real * qstore, integer *qptr, integer *prmptr, integer *perm, integer * @@ -787,11 +787,11 @@ f"> */ /* Local variables */ integer indx, curr, i__, k, indxc; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer indxp, n1, n2; - extern /* Subroutine */ int slaed8_(integer *, integer *, integer *, + extern /* Subroutine */ void slaed8_(integer *, integer *, integer *, integer *, real *, real *, integer *, integer *, real *, integer * , real *, real *, real *, integer *, real *, integer *, integer *, integer *, real *, integer *, integer *, integer *), slaed9_( @@ -801,7 +801,8 @@ f"> */ integer *, integer *, integer *, real *, real *, integer *, real * , real *, integer *); integer idlmda, is, iw, iz; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slamrg_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slamrg_( integer *, integer *, real *, integer *, integer *, integer *); integer coltyp, iq2, ptr, ldq2; @@ -850,13 +851,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLAED7", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* The following values are for bookkeeping purposes only. They are */ @@ -883,11 +884,11 @@ f"> */ /* Form the z-vector which consists of the last row of Q_1 and the */ /* first row of Q_2. 
*/ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = *tlvls - i__; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L10: */ } curr = ptr + *curpbm; @@ -946,7 +947,7 @@ f"> */ } L30: - return 0; + return; /* End of SLAED7 */ diff --git a/lapack-netlib/SRC/slaed8.c b/lapack-netlib/SRC/slaed8.c index 9b23e69932..d060c9755b 100644 --- a/lapack-netlib/SRC/slaed8.c +++ b/lapack-netlib/SRC/slaed8.c @@ -756,7 +756,7 @@ f"> */ /* > at Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int slaed8_(integer *icompq, integer *k, integer *n, integer +/* Subroutine */ void slaed8_(integer *icompq, integer *k, integer *n, integer *qsiz, real *d__, real *q, integer *ldq, integer *indxq, real *rho, integer *cutpnt, real *z__, real *dlamda, real *q2, integer *ldq2, real *w, integer *perm, integer *givptr, integer *givcol, real * @@ -768,14 +768,14 @@ f"> */ /* Local variables */ integer jlam, imax, jmax; - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); real c__; integer i__, j; real s, t; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer k2; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); integer n1, n2; extern real slapy2_(real *, real *); @@ -783,7 +783,7 @@ f"> */ extern real slamch_(char *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer isamax_(integer *, real *, integer *); - extern /* Subroutine */ int slamrg_(integer *, integer *, real *, integer + extern /* Subroutine */ void slamrg_(integer *, integer *, real *, integer *, integer *, integer 
*), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); integer n1p1; @@ -839,7 +839,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLAED8", &i__1, (ftnlen)6); - return 0; + return; } /* Need to initialize GIVPTR to O here in case of quick exit */ @@ -852,7 +852,7 @@ f"> */ /* Quick return if possible */ if (*n == 0) { - return 0; + return; } n1 = *cutpnt; @@ -926,7 +926,7 @@ f"> */ } slacpy_("A", qsiz, n, &q2[q2_dim1 + 1], ldq2, &q[q_dim1 + 1], ldq); } - return 0; + return; } /* If there are multiple eigenvalues then the problem deflates. Here */ @@ -1077,7 +1077,7 @@ f"> */ } } - return 0; + return; /* End of SLAED8 */ diff --git a/lapack-netlib/SRC/slaed9.c b/lapack-netlib/SRC/slaed9.c index c6c6d89c41..6de1a9db6e 100644 --- a/lapack-netlib/SRC/slaed9.c +++ b/lapack-netlib/SRC/slaed9.c @@ -669,7 +669,7 @@ f"> */ /* > at Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int slaed9_(integer *k, integer *kstart, integer *kstop, +/* Subroutine */ void slaed9_(integer *k, integer *kstart, integer *kstop, integer *n, real *d__, real *q, integer *ldq, real *rho, real *dlamda, real *w, real *s, integer *lds, integer *info) { @@ -681,7 +681,7 @@ f"> */ real temp; extern real snrm2_(integer *, real *, integer *); integer i__, j; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), slaed4_(integer *, integer *, real *, real *, real *, real *, real *, integer *); extern real slamc3_(real *, real *); @@ -729,13 +729,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLAED9", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*k == 0) { - return 0; + return; } /* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can */ @@ -835,7 +835,7 @@ f"> */ } L120: - return 0; + return; /* End of SLAED9 */ diff --git 
a/lapack-netlib/SRC/slaeda.c b/lapack-netlib/SRC/slaeda.c index 7edaf8a76e..fdf2036dbc 100644 --- a/lapack-netlib/SRC/slaeda.c +++ b/lapack-netlib/SRC/slaeda.c @@ -682,7 +682,7 @@ f"> */ /* > at Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int slaeda_(integer *n, integer *tlvls, integer *curlvl, +/* Subroutine */ void slaeda_(integer *n, integer *tlvls, integer *curlvl, integer *curpbm, integer *prmptr, integer *perm, integer *givptr, integer *givcol, real *givnum, real *q, integer *qptr, real *z__, real *ztemp, integer *info) @@ -692,12 +692,12 @@ f"> */ /* Local variables */ integer curr; - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); integer bsiz1, bsiz2, psiz1, psiz2, i__, k, zptr1; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, - real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer mid, ptr; @@ -732,13 +732,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLAEDA", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine location of first number in second half. */ @@ -753,7 +753,7 @@ f"> */ /* scheme */ i__1 = *curlvl - 1; - curr = ptr + *curpbm * pow_ii(&c__2, curlvl) + pow_ii(&c__2, &i__1) - 1; + curr = ptr + *curpbm * pow_ii(c__2, *curlvl) + pow_ii(c__2, i__1) - 1; /* Determine size of these matrices. 
We add HALF to the value of */ /* the SQRT in case the machine underestimates one of these square */ @@ -779,12 +779,12 @@ f"> */ /* rotations and permutation and then multiplying the center matrices */ /* against the current Z. */ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (k = 1; k <= i__1; ++k) { i__2 = *curlvl - k; i__3 = *curlvl - k - 1; - curr = ptr + *curpbm * pow_ii(&c__2, &i__2) + pow_ii(&c__2, &i__3) - + curr = ptr + *curpbm * pow_ii(c__2, i__2) + pow_ii(c__2, i__3) - 1; psiz1 = prmptr[curr + 1] - prmptr[curr]; psiz2 = prmptr[curr + 2] - prmptr[curr + 1]; @@ -844,11 +844,11 @@ f"> */ c__1); i__2 = *tlvls - k; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L70: */ } - return 0; + return; /* End of SLAEDA */ diff --git a/lapack-netlib/SRC/slaein.c b/lapack-netlib/SRC/slaein.c index 86fc52736b..e05629ee7c 100644 --- a/lapack-netlib/SRC/slaein.c +++ b/lapack-netlib/SRC/slaein.c @@ -685,7 +685,7 @@ f"> */ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slaein_(logical *rightv, logical *noinit, integer *n, +/* Subroutine */ void slaein_(logical *rightv, logical *noinit, integer *n, real *h__, integer *ldh, real *wr, real *wi, real *vr, real *vi, real *b, integer *ldb, real *work, real *eps3, real *smlnum, real *bignum, integer *info) @@ -700,7 +700,7 @@ f"> */ extern real snrm2_(integer *, real *, integer *); integer i__, j; real scale, w, x, y; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); char trans[1]; real vcrit; extern real sasum_(integer *, real *, integer *); @@ -709,11 +709,11 @@ f"> */ extern real slapy2_(real *, real *); real ei, ej, absbii, absbjj, xi, xr; extern integer isamax_(integer *, real *, integer *); - extern /* Subroutine */ int sladiv_(real *, real *, real *, real *, real * + extern /* 
Subroutine */ void sladiv_(real *, real *, real *, real *, real * , real *); char normin[1]; real nrmsml; - extern /* Subroutine */ int slatrs_(char *, char *, char *, char *, + extern /* Subroutine */ void slatrs_(char *, char *, char *, char *, integer *, real *, integer *, real *, real *, real *, integer *); real growto, rec; integer its; @@ -1250,7 +1250,7 @@ f"> */ } - return 0; + return; /* End of SLAEIN */ diff --git a/lapack-netlib/SRC/slaev2.c b/lapack-netlib/SRC/slaev2.c index 26155dd874..7500e82af6 100644 --- a/lapack-netlib/SRC/slaev2.c +++ b/lapack-netlib/SRC/slaev2.c @@ -631,7 +631,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slaev2_(real *a, real *b, real *c__, real *rt1, real * +/* Subroutine */ void slaev2_(real *a, real *b, real *c__, real *rt1, real * rt2, real *cs1, real *sn1) { /* System generated locals */ @@ -735,7 +735,7 @@ f"> */ *cs1 = -(*sn1); *sn1 = tn; } - return 0; + return; /* End of SLAEV2 */ diff --git a/lapack-netlib/SRC/slaexc.c b/lapack-netlib/SRC/slaexc.c index 21402599f9..553c4957bd 100644 --- a/lapack-netlib/SRC/slaexc.c +++ b/lapack-netlib/SRC/slaexc.c @@ -656,7 +656,7 @@ f"> */ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slaexc_(logical *wantq, integer *n, real *t, integer * +/* Subroutine */ void slaexc_(logical *wantq, integer *n, real *t, integer * ldt, real *q, integer *ldq, integer *j1, integer *n1, integer *n2, real *work, integer *info) { @@ -667,14 +667,14 @@ f"> */ /* Local variables */ integer ierr; real temp; - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); real d__[16] /* was [4][4] */; integer k; real u[3], scale, x[4] /* was [2][2] */, dnorm; integer j2, j3, j4; real xnorm, u1[3], u2[3]; - extern /* 
Subroutine */ int slanv2_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slanv2_(real *, real *, real *, real *, real * , real *, real *, real *, real *, real *), slasy2_(logical *, logical *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, real *, integer *, @@ -683,12 +683,12 @@ f"> */ real cs, t11, t22, t33, sn; extern real slamch_(char *), slange_(char *, integer *, integer *, real *, integer *, real *); - extern /* Subroutine */ int slarfg_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slarfg_(integer *, real *, real *, integer *, real *), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slartg_(real *, real *, real *, real * , real *); real thresh; - extern /* Subroutine */ int slarfx_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slarfx_(char *, integer *, integer *, real *, real *, real *, integer *, real *); real smlnum, wi1, wi2, wr1, wr2, eps, tau, tau1, tau2; @@ -717,10 +717,10 @@ f"> */ /* Quick return if possible */ if (*n == 0 || *n1 == 0 || *n2 == 0) { - return 0; + return; } if (*j1 + *n1 > *n) { - return 0; + return; } j2 = *j1 + 1; @@ -1006,13 +1006,13 @@ f"> */ } } - return 0; + return; /* Exit with INFO = 1 if swap was rejected. */ L50: *info = 1; - return 0; + return; /* End of SLAEXC */ diff --git a/lapack-netlib/SRC/slag2.c b/lapack-netlib/SRC/slag2.c index c052cf7601..c012c0be2f 100644 --- a/lapack-netlib/SRC/slag2.c +++ b/lapack-netlib/SRC/slag2.c @@ -665,7 +665,7 @@ ssary to avoid over-/underflow. */ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slag2_(real *a, integer *lda, real *b, integer *ldb, +/* Subroutine */ void slag2_(real *a, integer *lda, real *b, integer *ldb, real *safmin, real *scale1, real *scale2, real *wr1, real *wr2, real * wi) { @@ -914,6 +914,6 @@ ssary to avoid over-/underflow. 
*/ /* End of SLAG2 */ - return 0; + return; } /* slag2_ */ diff --git a/lapack-netlib/SRC/slag2d.c b/lapack-netlib/SRC/slag2d.c index 4976f5b6d1..9d47535986 100644 --- a/lapack-netlib/SRC/slag2d.c +++ b/lapack-netlib/SRC/slag2d.c @@ -613,7 +613,7 @@ f"> */ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slag2d_(integer *m, integer *n, real *sa, integer *ldsa, +/* Subroutine */ void slag2d_(integer *m, integer *n, real *sa, integer *ldsa, doublereal *a, integer *lda, integer *info) { /* System generated locals */ @@ -651,7 +651,7 @@ f"> */ } /* L20: */ } - return 0; + return; /* End of SLAG2D */ diff --git a/lapack-netlib/SRC/slags2.c b/lapack-netlib/SRC/slags2.c index 785ba50fef..1d18f4f57c 100644 --- a/lapack-netlib/SRC/slags2.c +++ b/lapack-netlib/SRC/slags2.c @@ -663,7 +663,7 @@ f"> */ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slags2_(logical *upper, real *a1, real *a2, real *a3, +/* Subroutine */ void slags2_(logical *upper, real *a1, real *a2, real *a3, real *b1, real *b2, real *b3, real *csu, real *snu, real *csv, real * snv, real *csq, real *snq) { @@ -673,7 +673,7 @@ f"> */ /* Local variables */ real aua11, aua12, aua21, aua22, avb11, avb12, avb21, avb22, ua11r, ua22r, vb11r, vb22r, a, b, c__, d__, r__, s1, s2; - extern /* Subroutine */ int slasv2_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slasv2_(real *, real *, real *, real *, real * , real *, real *, real *, real *), slartg_(real *, real *, real *, real *, real *); real ua11, ua12, ua21, ua22, vb11, vb12, vb21, vb22, csl, csr, snl, snr; @@ -864,7 +864,7 @@ f"> */ } - return 0; + return; /* End of SLAGS2 */ diff --git a/lapack-netlib/SRC/slagtf.c b/lapack-netlib/SRC/slagtf.c index a162e7251e..993b6e4f0f 100644 --- a/lapack-netlib/SRC/slagtf.c +++ b/lapack-netlib/SRC/slagtf.c @@ -666,7 +666,7 @@ f"> */ /* > 
\ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int slagtf_(integer *n, real *a, real *lambda, real *b, real +/* Subroutine */ void slagtf_(integer *n, real *a, real *lambda, real *b, real *c__, real *tol, real *d__, integer *in, integer *info) { /* System generated locals */ @@ -704,11 +704,11 @@ f"> */ *info = -1; i__1 = -(*info); xerbla_("SLAGTF", &i__1, (ftnlen)6); - return 0; + return; } if (*n == 0) { - return 0; + return; } a[1] -= *lambda; @@ -717,7 +717,7 @@ f"> */ if (a[1] == 0.f) { in[1] = 1; } - return 0; + return; } eps = slamch_("Epsilon"); @@ -776,7 +776,7 @@ f"> */ in[*n] = *n; } - return 0; + return; /* End of SLAGTF */ diff --git a/lapack-netlib/SRC/slagtm.c b/lapack-netlib/SRC/slagtm.c index 45d6371132..589f0723bd 100644 --- a/lapack-netlib/SRC/slagtm.c +++ b/lapack-netlib/SRC/slagtm.c @@ -654,7 +654,7 @@ f"> */ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slagtm_(char *trans, integer *n, integer *nrhs, real * +/* Subroutine */ void slagtm_(char *trans, integer *n, integer *nrhs, real * alpha, real *dl, real *d__, real *du, real *x, integer *ldx, real * beta, real *b, integer *ldb) { @@ -688,7 +688,7 @@ f"> */ /* Function Body */ if (*n == 0) { - return 0; + return; } /* Multiply B by BETA if BETA.NE.1. 
*/ @@ -820,7 +820,7 @@ f"> */ } } } - return 0; + return; /* End of SLAGTM */ diff --git a/lapack-netlib/SRC/slagts.c b/lapack-netlib/SRC/slagts.c index 1fe08865d1..f16f318622 100644 --- a/lapack-netlib/SRC/slagts.c +++ b/lapack-netlib/SRC/slagts.c @@ -671,7 +671,7 @@ f"> */ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slagts_(integer *job, integer *n, real *a, real *b, real +/* Subroutine */ void slagts_(integer *job, integer *n, real *a, real *b, real *c__, real *d__, integer *in, real *y, real *tol, integer *info) { /* System generated locals */ @@ -714,11 +714,11 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLAGTS", &i__1, (ftnlen)6); - return 0; + return; } if (*n == 0) { - return 0; + return; } eps = slamch_("Epsilon"); @@ -777,14 +777,14 @@ f"> */ if (absak < sfmin) { if (absak == 0.f || abs(temp) * sfmin > absak) { *info = k; - return 0; + return; } else { temp *= bignum; ak *= bignum; } } else if (abs(temp) > absak * bignum) { *info = k; - return 0; + return; } } y[k] = temp / ak; @@ -843,14 +843,14 @@ f"> */ if (absak < sfmin) { if (absak == 0.f || abs(temp) * sfmin > absak) { *info = k; - return 0; + return; } else { temp *= bignum; ak *= bignum; } } else if (abs(temp) > absak * bignum) { *info = k; - return 0; + return; } } y[k] = temp / ak; @@ -905,6 +905,6 @@ f"> */ /* End of SLAGTS */ - return 0; + return; } /* slagts_ */ diff --git a/lapack-netlib/SRC/slagv2.c b/lapack-netlib/SRC/slagv2.c index 3ebcdd4766..9eed349785 100644 --- a/lapack-netlib/SRC/slagv2.c +++ b/lapack-netlib/SRC/slagv2.c @@ -671,7 +671,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. 
of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int slagv2_(real *a, integer *lda, real *b, integer *ldb, +/* Subroutine */ void slagv2_(real *a, integer *lda, real *b, integer *ldb, real *alphar, real *alphai, real *beta, real *csl, real *snl, real * csr, real *snr) { @@ -680,17 +680,17 @@ f"> */ real r__1, r__2, r__3, r__4, r__5, r__6; /* Local variables */ - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *), slag2_(real *, integer *, real *, integer *, real *, real *, real *, real *, real *, real *); real r__, t, anorm, bnorm, h1, h2, h3, scale1, scale2; - extern /* Subroutine */ int slasv2_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slasv2_(real *, real *, real *, real *, real * , real *, real *, real *, real *); extern real slapy2_(real *, real *); real ascale, bscale, wi, qq, rr; extern real slamch_(char *); real safmin; - extern /* Subroutine */ int slartg_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slartg_(real *, real *, real *, real *, real * ); real wr1, wr2, ulp; @@ -909,7 +909,7 @@ f"> */ beta[2] = 1.f; } - return 0; + return; /* End of SLAGV2 */ diff --git a/lapack-netlib/SRC/slahqr.c b/lapack-netlib/SRC/slahqr.c index a2aa97858f..99cdc25805 100644 --- a/lapack-netlib/SRC/slahqr.c +++ b/lapack-netlib/SRC/slahqr.c @@ -720,7 +720,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slahqr_(logical *wantt, logical *wantz, integer *n, +/* Subroutine */ void slahqr_(logical *wantt, logical *wantz, integer *n, integer *ilo, integer *ihi, real *h__, integer *ldh, real *wr, real * wi, integer *iloz, integer *ihiz, real *z__, integer *ldz, integer * info) @@ -730,26 +730,26 @@ f"> */ real r__1, r__2, r__3, r__4; /* Local variables */ - 
extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); integer i__, j, k, l, m; real s, v[3]; integer itmax, i1, i2; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); real t1, t2, t3, v2, v3, aa, ab, ba, bb; - extern /* Subroutine */ int slanv2_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slanv2_(real *, real *, real *, real *, real * , real *, real *, real *, real *, real *); real h11, h12, h21, h22, cs; integer nh; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); real sn; integer nr; real tr; extern real slamch_(char *); integer nz; real safmin; - extern /* Subroutine */ int slarfg_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slarfg_(integer *, real *, real *, integer *, real *); real safmax, rtdisc, smlnum, det, h21s; integer its; @@ -781,12 +781,12 @@ f"> */ /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*ilo == *ihi) { wr[*ilo] = h__[*ilo + *ilo * h_dim1]; wi[*ilo] = 0.f; - return 0; + return; } /* ==== clear out the trash ==== */ @@ -1155,7 +1155,7 @@ f"> */ /* Failure to converge in remaining number of iterations */ *info = i__; - return 0; + return; L150: @@ -1205,7 +1205,7 @@ f"> */ goto L20; L160: - return 0; + return; /* End of SLAHQR */ diff --git a/lapack-netlib/SRC/slahr2.c b/lapack-netlib/SRC/slahr2.c index 90247fbe45..ed697ff093 100644 --- a/lapack-netlib/SRC/slahr2.c +++ b/lapack-netlib/SRC/slahr2.c @@ -700,7 +700,7 @@ f"> */ /* > Mathematical Software, 32(2):180-194, June 2006. 
*/ /* > */ /* ===================================================================== */ -/* Subroutine */ int slahr2_(integer *n, integer *k, integer *nb, real *a, +/* Subroutine */ void slahr2_(integer *n, integer *k, integer *nb, real *a, integer *lda, real *tau, real *t, integer *ldt, real *y, integer *ldy) { /* System generated locals */ @@ -710,7 +710,7 @@ f"> */ /* Local variables */ integer i__; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *), @@ -719,7 +719,7 @@ f"> */ integer *), strmv_(char *, char *, char *, integer *, real *, integer *, real *, integer *); real ei; - extern /* Subroutine */ int slarfg_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slarfg_(integer *, real *, real *, integer *, real *), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); @@ -749,7 +749,7 @@ f"> */ /* Function Body */ if (*n <= 1) { - return 0; + return; } i__1 = *nb; @@ -877,7 +877,7 @@ f"> */ strmm_("RIGHT", "Upper", "NO TRANSPOSE", "NON-UNIT", k, nb, &c_b5, &t[ t_offset], ldt, &y[y_offset], ldy); - return 0; + return; /* End of SLAHR2 */ diff --git a/lapack-netlib/SRC/slaic1.c b/lapack-netlib/SRC/slaic1.c index c6534c0b72..6307e19e95 100644 --- a/lapack-netlib/SRC/slaic1.c +++ b/lapack-netlib/SRC/slaic1.c @@ -648,7 +648,7 @@ f"> */ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slaic1_(integer *job, integer *j, real *x, real *sest, +/* Subroutine */ void slaic1_(integer *job, integer *j, real *x, real *sest, real *w, real *gamma, real *sestpr, 
real *s, real *c__) { /* System generated locals */ @@ -703,7 +703,7 @@ f"> */ *c__ /= tmp; *sestpr = s1 * tmp; } - return 0; + return; } else if (absgam <= eps * absest) { *s = 1.f; *c__ = 0.f; @@ -711,7 +711,7 @@ f"> */ s1 = absest / tmp; s2 = absalp / tmp; *sestpr = tmp * sqrt(s1 * s1 + s2 * s2); - return 0; + return; } else if (absalp <= eps * absest) { s1 = absgam; s2 = absest; @@ -724,7 +724,7 @@ f"> */ *c__ = 1.f; *sestpr = s1; } - return 0; + return; } else if (absest <= eps * absalp || absest <= eps * absgam) { s1 = absgam; s2 = absalp; @@ -741,7 +741,7 @@ f"> */ *s = alpha / s1 / *c__; *c__ = r_sign(&c_b5, gamma) / *c__; } - return 0; + return; } else { /* normal case */ @@ -763,7 +763,7 @@ f"> */ *s = sine / tmp; *c__ = cosine / tmp; *sestpr = sqrt(t + 1.f) * absest; - return 0; + return; } } else if (*job == 2) { @@ -789,12 +789,12 @@ f"> */ tmp = sqrt(*s * *s + *c__ * *c__); *s /= tmp; *c__ /= tmp; - return 0; + return; } else if (absgam <= eps * absest) { *s = 0.f; *c__ = 1.f; *sestpr = absgam; - return 0; + return; } else if (absalp <= eps * absest) { s1 = absgam; s2 = absest; @@ -807,7 +807,7 @@ f"> */ *c__ = 0.f; *sestpr = s2; } - return 0; + return; } else if (absest <= eps * absalp || absest <= eps * absgam) { s1 = absgam; s2 = absalp; @@ -824,7 +824,7 @@ f"> */ *c__ = alpha / s1 / *s; *s = -r_sign(&c_b5, gamma) / *s; } - return 0; + return; } else { /* normal case */ @@ -868,11 +868,11 @@ f"> */ tmp = sqrt(sine * sine + cosine * cosine); *s = sine / tmp; *c__ = cosine / tmp; - return 0; + return; } } - return 0; + return; /* End of SLAIC1 */ diff --git a/lapack-netlib/SRC/slaln2.c b/lapack-netlib/SRC/slaln2.c index a4e9db6fe7..68465aced5 100644 --- a/lapack-netlib/SRC/slaln2.c +++ b/lapack-netlib/SRC/slaln2.c @@ -726,7 +726,7 @@ f"> */ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slaln2_(logical *ltrans, integer *na, integer *nw, real * +/* Subroutine */ 
void slaln2_(logical *ltrans, integer *na, integer *nw, real * smin, real *ca, real *a, integer *lda, real *d1, real *d2, real *b, integer *ldb, real *wr, real *wi, real *x, integer *ldx, real *scale, real *xnorm, integer *info) @@ -753,7 +753,7 @@ f"> */ #define cr (equiv_1) extern real slamch_(char *); real bignum; - extern /* Subroutine */ int sladiv_(real *, real *, real *, real *, real * + extern /* Subroutine */ void sladiv_(real *, real *, real *, real *, real * , real *); real bi1, bi2, br1, br2, smlnum, xi1, xi2, xr1, xr2, ci21, ci22, cr21, cr22, li21, csi, ui11, lr21, ui12, ui22; @@ -920,7 +920,7 @@ f"> */ x[x_dim1 + 2] = temp * b[b_dim1 + 2]; *xnorm = temp * bnorm; *info = 1; - return 0; + return; } /* Gaussian elimination with complete pivoting. */ @@ -1023,7 +1023,7 @@ f"> */ x[(x_dim1 << 1) + 2] = temp * b[(b_dim1 << 1) + 2]; *xnorm = temp * bnorm; *info = 1; - return 0; + return; } /* Gaussian elimination with complete pivoting. */ @@ -1142,7 +1142,7 @@ f"> */ } } - return 0; + return; /* End of SLALN2 */ diff --git a/lapack-netlib/SRC/slals0.c b/lapack-netlib/SRC/slals0.c index 068681c28c..0fa849983b 100644 --- a/lapack-netlib/SRC/slals0.c +++ b/lapack-netlib/SRC/slals0.c @@ -784,7 +784,7 @@ f"> */ /* > Osni Marques, LBNL/NERSC, USA \n */ /* ===================================================================== */ -/* Subroutine */ int slals0_(integer *icompq, integer *nl, integer *nr, +/* Subroutine */ void slals0_(integer *icompq, integer *nl, integer *nr, integer *sqre, integer *nrhs, real *b, integer *ldb, real *bx, integer *ldbx, integer *perm, integer *givptr, integer *givcol, integer *ldgcol, real *givnum, integer *ldgnum, real *poles, real * @@ -799,12 +799,12 @@ f"> */ /* Local variables */ real temp; - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); extern real snrm2_(integer *, real *, integer *); integer i__, j, 
m, n; real diflj, difrj, dsigj; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), scopy_( integer *, real *, integer *, real *, integer *); @@ -812,7 +812,7 @@ f"> */ real dj; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real dsigjp; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); integer nlp1; @@ -884,7 +884,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLALS0", &i__1, (ftnlen)6); - return 0; + return; } m = n + *sqre; @@ -1068,7 +1068,7 @@ f"> */ } } - return 0; + return; /* End of SLALS0 */ diff --git a/lapack-netlib/SRC/slalsa.c b/lapack-netlib/SRC/slalsa.c index 53da2c7bf9..2eba239f4b 100644 --- a/lapack-netlib/SRC/slalsa.c +++ b/lapack-netlib/SRC/slalsa.c @@ -779,7 +779,7 @@ f"> */ /* > Osni Marques, LBNL/NERSC, USA \n */ /* ===================================================================== */ -/* Subroutine */ int slalsa_(integer *icompq, integer *smlsiz, integer *n, +/* Subroutine */ void slalsa_(integer *icompq, integer *smlsiz, integer *n, integer *nrhs, real *b, integer *ldb, real *bx, integer *ldbx, real * u, integer *ldu, real *vt, integer *k, real *difl, real *difr, real * z__, real *poles, integer *givptr, integer *givcol, integer *ldgcol, @@ -795,17 +795,18 @@ f"> */ /* Local variables */ integer nlvl, sqre, i__, j, inode, ndiml; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer 
ndimr, i1; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), slals0_(integer *, integer *, integer *, integer *, integer *, real *, integer *, real *, integer *, integer *, integer *, integer *, integer *, real *, integer *, real *, real * , real *, real *, integer *, real *, real *, real *, integer *); integer ic, lf, nd, ll, nl, nr; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slasdt_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slasdt_( integer *, integer *, integer *, integer *, integer *, integer *, integer *); integer im1, nlf, nrf, lvl, ndb1, nlp1, lvl2, nrp1; @@ -886,7 +887,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLALSA", &i__1, (ftnlen)6); - return 0; + return; } /* Book-keeping and setting up the computation tree. */ @@ -946,7 +947,7 @@ f"> */ /* Finally go through the left singular vector matrices of all */ /* the other subproblems bottom-up on the tree. 
*/ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); sqre = 0; for (lvl = nlvl; lvl >= 1; --lvl) { @@ -960,7 +961,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; @@ -1005,7 +1006,7 @@ f"> */ ll = 1; } else { i__2 = lvl - 1; - lf = pow_ii(&c__2, &i__2); + lf = pow_ii(c__2, i__2); ll = (lf << 1) - 1; } i__2 = lf; @@ -1062,7 +1063,7 @@ f"> */ L90: - return 0; + return; /* End of SLALSA */ diff --git a/lapack-netlib/SRC/slalsd.c b/lapack-netlib/SRC/slalsd.c index da25bf51c7..de6066e585 100644 --- a/lapack-netlib/SRC/slalsd.c +++ b/lapack-netlib/SRC/slalsd.c @@ -694,7 +694,7 @@ f"> */ /* > Osni Marques, LBNL/NERSC, USA \n */ /* ===================================================================== */ -/* Subroutine */ int slalsd_(char *uplo, integer *smlsiz, integer *n, integer +/* Subroutine */ void slalsd_(char *uplo, integer *smlsiz, integer *n, integer *nrhs, real *d__, real *e, real *b, integer *ldb, real *rcond, integer *rank, real *work, integer *iwork, integer *info) { @@ -706,16 +706,16 @@ f"> */ integer difl, difr; real rcnd; integer perm, nsub, nlvl, sqre, bxst; - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); integer c__, i__, j, k; real r__; integer s, u, z__; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer poles, sizei, nsize; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); integer nwork, icmpq1, icmpq2; real cs; @@ -723,12 +723,13 @@ f"> */ real sn; integer st; extern real slamch_(char *); - extern /* Subroutine */ int slasda_(integer *, 
integer *, integer *, + extern /* Subroutine */ void slasda_(integer *, integer *, integer *, integer *, real *, real *, real *, integer *, real *, integer *, real *, real *, real *, real *, integer *, integer *, integer *, integer *, real *, real *, real *, real *, integer *, integer *); integer vt; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slalsa_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slalsa_( integer *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, real *, real *, real *, integer *, integer *, integer *, integer * @@ -737,7 +738,7 @@ f"> */ , real *, integer *, integer *); integer givcol; extern integer isamax_(integer *, real *, integer *); - extern /* Subroutine */ int slasdq_(char *, integer *, integer *, integer + extern /* Subroutine */ void slasdq_(char *, integer *, integer *, integer *, integer *, integer *, real *, real *, real *, integer *, real * , integer *, real *, integer *, real *, integer *), slacpy_(char *, integer *, integer *, real *, integer *, real *, @@ -747,7 +748,7 @@ f"> */ real orgnrm; integer givnum; extern real slanst_(char *, integer *, real *, real *); - extern /* Subroutine */ int slasrt_(char *, integer *, real *, integer *); + extern /* Subroutine */ void slasrt_(char *, integer *, real *, integer *); integer givptr, nm1, smlszp, st1; real eps; integer iwk; @@ -787,7 +788,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLALSD", &i__1, (ftnlen)6); - return 0; + return; } eps = slamch_("Epsilon"); @@ -805,7 +806,7 @@ f"> */ /* Quick return if possible. */ if (*n == 0) { - return 0; + return; } else if (*n == 1) { if (d__[1] == 0.f) { slaset_("A", &c__1, nrhs, &c_b6, &c_b6, &b[b_offset], ldb); @@ -815,7 +816,7 @@ f"> */ b_offset], ldb, info); d__[1] = abs(d__[1]); } - return 0; + return; } /* Rotate the matrix if it is lower bidiagonal. 
*/ @@ -858,7 +859,7 @@ f"> */ orgnrm = slanst_("M", n, &d__[1], &e[1]); if (orgnrm == 0.f) { slaset_("A", n, nrhs, &c_b6, &c_b6, &b[b_offset], ldb); - return 0; + return; } slascl_("G", &c__0, &c__0, &orgnrm, &c_b11, n, &c__1, &d__[1], n, info); @@ -874,7 +875,7 @@ f"> */ slasdq_("U", &c__0, n, n, &c__0, nrhs, &d__[1], &e[1], &work[1], n, & work[1], n, &b[b_offset], ldb, &work[nwork], info); if (*info != 0) { - return 0; + return; } tol = rcnd * (r__1 = d__[isamax_(n, &d__[1], &c__1)], abs(r__1)); i__1 = *n; @@ -900,7 +901,7 @@ f"> */ slascl_("G", &c__0, &c__0, &orgnrm, &c_b11, n, nrhs, &b[b_offset], ldb, info); - return 0; + return; } /* Book-keeping and setting up some constants. */ @@ -993,7 +994,7 @@ f"> */ st], &work[vt + st1], n, &work[nwork], n, &b[st + b_dim1], ldb, &work[nwork], info); if (*info != 0) { - return 0; + return; } slacpy_("A", &nsize, nrhs, &b[st + b_dim1], ldb, &work[bx + st1], n); @@ -1009,7 +1010,7 @@ f"> */ st1], &work[c__ + st1], &work[s + st1], &work[nwork], &iwork[iwk], info); if (*info != 0) { - return 0; + return; } bxst = bx + st1; slalsa_(&icmpq2, smlsiz, &nsize, nrhs, &b[st + b_dim1], ldb, & @@ -1020,7 +1021,7 @@ f"> */ work[givnum + st1], &work[c__ + st1], &work[s + st1], &work[nwork], &iwork[iwk], info); if (*info != 0) { - return 0; + return; } } st = i__ + 1; @@ -1072,7 +1073,7 @@ f"> */ &work[c__ + st1], &work[s + st1], &work[nwork], &iwork[ iwk], info); if (*info != 0) { - return 0; + return; } } /* L80: */ @@ -1085,7 +1086,7 @@ f"> */ slascl_("G", &c__0, &c__0, &orgnrm, &c_b11, n, nrhs, &b[b_offset], ldb, info); - return 0; + return; /* End of SLALSD */ diff --git a/lapack-netlib/SRC/slamrg.c b/lapack-netlib/SRC/slamrg.c index fd2d04ae92..75d51e4c49 100644 --- a/lapack-netlib/SRC/slamrg.c +++ b/lapack-netlib/SRC/slamrg.c @@ -609,7 +609,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int slamrg_(integer *n1, integer *n2, real *a, 
integer * +/* Subroutine */ void slamrg_(integer *n1, integer *n2, real *a, integer * strd1, integer *strd2, integer *index) { /* System generated locals */ @@ -682,7 +682,7 @@ f"> */ } } - return 0; + return; /* End of SLAMRG */ diff --git a/lapack-netlib/SRC/slamswlq.c b/lapack-netlib/SRC/slamswlq.c index d03ca872b1..9a7591fcbd 100644 --- a/lapack-netlib/SRC/slamswlq.c +++ b/lapack-netlib/SRC/slamswlq.c @@ -713,7 +713,7 @@ static integer c__0 = 0; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slamswlq_(char *side, char *trans, integer *m, integer * +/* Subroutine */ void slamswlq_(char *side, char *trans, integer *m, integer * n, integer *k, integer *mb, integer *nb, real *a, integer *lda, real * t, integer *ldt, real *c__, integer *ldc, real *work, integer *lwork, integer *info) @@ -731,7 +731,7 @@ static integer c__0 = 0; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical notran, lquery; integer ctr; - extern /* Subroutine */ int sgemlqt_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemlqt_(char *, char *, integer *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *), stpmlqt_(char *, char *, integer *, integer *, integer *, integer *, integer *, @@ -799,10 +799,10 @@ static integer c__0 = 0; i__1 = -(*info); xerbla_("SLAMSWLQ", &i__1, (ftnlen)8); work[1] = (real) lw; - return 0; + return; } else if (lquery) { work[1] = (real) lw; - return 0; + return; } /* Quick return if possible */ @@ -810,7 +810,7 @@ static integer c__0 = 0; /* Computing MIN */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*k) == 0) { - return 0; + return; } /* Computing MAX */ @@ -818,7 +818,7 @@ static integer c__0 = 0; if (*nb <= *k || *nb >= f2cmax(i__1,*k)) { sgemlqt_(side, trans, m, n, k, mb, &a[a_offset], lda, &t[t_offset], ldt, &c__[c_offset], ldc, &work[1], info); - return 0; + return; } if (left && tran) { 
@@ -961,7 +961,7 @@ static integer c__0 = 0; } work[1] = (real) lw; - return 0; + return; /* End of SLAMSWLQ */ diff --git a/lapack-netlib/SRC/slamtsqr.c b/lapack-netlib/SRC/slamtsqr.c index 319d00ff76..99b4acb93b 100644 --- a/lapack-netlib/SRC/slamtsqr.c +++ b/lapack-netlib/SRC/slamtsqr.c @@ -706,7 +706,7 @@ static integer c__0 = 0; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slamtsqr_(char *side, char *trans, integer *m, integer * +/* Subroutine */ void slamtsqr_(char *side, char *trans, integer *m, integer * n, integer *k, integer *mb, integer *nb, real *a, integer *lda, real * t, integer *ldt, real *c__, integer *ldc, real *work, integer *lwork, integer *info) @@ -724,7 +724,7 @@ static integer c__0 = 0; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical notran, lquery; integer ctr; - extern /* Subroutine */ int sgemqrt_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemqrt_(char *, char *, integer *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *), stpmqrt_(char *, char *, integer *, integer *, integer *, integer *, integer *, @@ -797,9 +797,9 @@ static integer c__0 = 0; if (*info != 0) { i__1 = -(*info); xerbla_("SLAMTSQR", &i__1, (ftnlen)8); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -807,7 +807,7 @@ static integer c__0 = 0; /* Computing MIN */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*k) == 0) { - return 0; + return; } /* Computing MAX */ @@ -815,7 +815,7 @@ static integer c__0 = 0; if (*mb <= *k || *mb >= f2cmax(i__1,*k)) { sgemqrt_(side, trans, m, n, k, nb, &a[a_offset], lda, &t[t_offset], ldt, &c__[c_offset], ldc, &work[1], info); - return 0; + return; } if (left && notran) { @@ -959,7 +959,7 @@ static integer c__0 = 0; } work[1] = (real) lw; - return 0; + return; /* End of SLAMTSQR */ diff --git 
a/lapack-netlib/SRC/slangb.c b/lapack-netlib/SRC/slangb.c index 25629c9d57..d2b1fb86f4 100644 --- a/lapack-netlib/SRC/slangb.c +++ b/lapack-netlib/SRC/slangb.c @@ -646,13 +646,13 @@ real slangb_(char *norm, integer *n, integer *kl, integer *ku, real *ab, /* Local variables */ real temp; - extern /* Subroutine */ int scombssq_(real *, real *); + extern /* Subroutine */ void scombssq_(real *, real *); integer i__, j, k, l; extern logical lsame_(char *, char *); real value; extern logical sisnan_(real *); real colssq[2]; - extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slassq_(integer *, real *, integer *, real *, real *); real sum, ssq[2]; diff --git a/lapack-netlib/SRC/slange.c b/lapack-netlib/SRC/slange.c index 077683ec63..5090e6fc67 100644 --- a/lapack-netlib/SRC/slange.c +++ b/lapack-netlib/SRC/slange.c @@ -637,13 +637,13 @@ real slange_(char *norm, integer *m, integer *n, real *a, integer *lda, real * /* Local variables */ real temp; - extern /* Subroutine */ int scombssq_(real *, real *); + extern /* Subroutine */ void scombssq_(real *, real *); integer i__, j; extern logical lsame_(char *, char *); real value; extern logical sisnan_(real *); real colssq[2]; - extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slassq_(integer *, real *, integer *, real *, real *); real sum, ssq[2]; diff --git a/lapack-netlib/SRC/slangt.c b/lapack-netlib/SRC/slangt.c index 67bc4d2210..27a3f0b08c 100644 --- a/lapack-netlib/SRC/slangt.c +++ b/lapack-netlib/SRC/slangt.c @@ -633,7 +633,7 @@ real slangt_(char *norm, integer *n, real *dl, real *d__, real *du) extern logical lsame_(char *, char *); real anorm; extern logical sisnan_(real *); - extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slassq_(integer *, real *, integer *, real *, real *); real sum; diff --git a/lapack-netlib/SRC/slanhs.c 
b/lapack-netlib/SRC/slanhs.c index 57296075bf..75142b2555 100644 --- a/lapack-netlib/SRC/slanhs.c +++ b/lapack-netlib/SRC/slanhs.c @@ -629,13 +629,13 @@ real slanhs_(char *norm, integer *n, real *a, integer *lda, real *work) real ret_val, r__1; /* Local variables */ - extern /* Subroutine */ int scombssq_(real *, real *); + extern /* Subroutine */ void scombssq_(real *, real *); integer i__, j; extern logical lsame_(char *, char *); real value; extern logical sisnan_(real *); real colssq[2]; - extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slassq_(integer *, real *, integer *, real *, real *); real sum, ssq[2]; diff --git a/lapack-netlib/SRC/slansb.c b/lapack-netlib/SRC/slansb.c index 6546b3651d..1c2fbb5084 100644 --- a/lapack-netlib/SRC/slansb.c +++ b/lapack-netlib/SRC/slansb.c @@ -651,13 +651,13 @@ real slansb_(char *norm, char *uplo, integer *n, integer *k, real *ab, /* Local variables */ real absa; - extern /* Subroutine */ int scombssq_(real *, real *); + extern /* Subroutine */ void scombssq_(real *, real *); integer i__, j, l; extern logical lsame_(char *, char *); real value; extern logical sisnan_(real *); real colssq[2]; - extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slassq_(integer *, real *, integer *, real *, real *); real sum, ssq[2]; diff --git a/lapack-netlib/SRC/slansf.c b/lapack-netlib/SRC/slansf.c index 9176b704d6..014aaefdbe 100644 --- a/lapack-netlib/SRC/slansf.c +++ b/lapack-netlib/SRC/slansf.c @@ -738,7 +738,7 @@ real slansf_(char *norm, char *transr, char *uplo, integer *n, real *a, real * integer n1; real aa; extern logical sisnan_(real *); - extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slassq_(integer *, real *, integer *, real *, real *); integer lda, ifm, noe, ilu; diff --git a/lapack-netlib/SRC/slansp.c b/lapack-netlib/SRC/slansp.c index a7397ac0e7..b752781145 
100644 --- a/lapack-netlib/SRC/slansp.c +++ b/lapack-netlib/SRC/slansp.c @@ -636,13 +636,13 @@ real slansp_(char *norm, char *uplo, integer *n, real *ap, real *work) /* Local variables */ real absa; - extern /* Subroutine */ int scombssq_(real *, real *); + extern /* Subroutine */ void scombssq_(real *, real *); integer i__, j, k; extern logical lsame_(char *, char *); real value; extern logical sisnan_(real *); real colssq[2]; - extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slassq_(integer *, real *, integer *, real *, real *); real sum, ssq[2]; diff --git a/lapack-netlib/SRC/slanst.c b/lapack-netlib/SRC/slanst.c index d5126ce4d9..8e7a24bd32 100644 --- a/lapack-netlib/SRC/slanst.c +++ b/lapack-netlib/SRC/slanst.c @@ -626,7 +626,7 @@ real slanst_(char *norm, integer *n, real *d__, real *e) extern logical lsame_(char *, char *); real anorm; extern logical sisnan_(real *); - extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slassq_(integer *, real *, integer *, real *, real *); real sum; diff --git a/lapack-netlib/SRC/slansy.c b/lapack-netlib/SRC/slansy.c index 7f57d0fc92..d5cbbf21ba 100644 --- a/lapack-netlib/SRC/slansy.c +++ b/lapack-netlib/SRC/slansy.c @@ -645,13 +645,13 @@ real slansy_(char *norm, char *uplo, integer *n, real *a, integer *lda, real * /* Local variables */ real absa; - extern /* Subroutine */ int scombssq_(real *, real *); + extern /* Subroutine */ void scombssq_(real *, real *); integer i__, j; extern logical lsame_(char *, char *); real value; extern logical sisnan_(real *); real colssq[2]; - extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slassq_(integer *, real *, integer *, real *, real *); real sum, ssq[2]; diff --git a/lapack-netlib/SRC/slantb.c b/lapack-netlib/SRC/slantb.c index d4034a8ba2..0b488ce052 100644 --- a/lapack-netlib/SRC/slantb.c +++ 
b/lapack-netlib/SRC/slantb.c @@ -661,14 +661,14 @@ real slantb_(char *norm, char *uplo, char *diag, integer *n, integer *k, real real ret_val, r__1; /* Local variables */ - extern /* Subroutine */ int scombssq_(real *, real *); + extern /* Subroutine */ void scombssq_(real *, real *); integer i__, j, l; logical udiag; extern logical lsame_(char *, char *); real value; extern logical sisnan_(real *); real colssq[2]; - extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slassq_(integer *, real *, integer *, real *, real *); real sum, ssq[2]; diff --git a/lapack-netlib/SRC/slantp.c b/lapack-netlib/SRC/slantp.c index f1290db1a0..6d6187eca4 100644 --- a/lapack-netlib/SRC/slantp.c +++ b/lapack-netlib/SRC/slantp.c @@ -646,14 +646,14 @@ real slantp_(char *norm, char *uplo, char *diag, integer *n, real *ap, real * real ret_val, r__1; /* Local variables */ - extern /* Subroutine */ int scombssq_(real *, real *); + extern /* Subroutine */ void scombssq_(real *, real *); integer i__, j, k; logical udiag; extern logical lsame_(char *, char *); real value; extern logical sisnan_(real *); real colssq[2]; - extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slassq_(integer *, real *, integer *, real *, real *); real sum, ssq[2]; diff --git a/lapack-netlib/SRC/slantr.c b/lapack-netlib/SRC/slantr.c index 734c447636..4d74a52744 100644 --- a/lapack-netlib/SRC/slantr.c +++ b/lapack-netlib/SRC/slantr.c @@ -662,14 +662,14 @@ real slantr_(char *norm, char *uplo, char *diag, integer *m, integer *n, real real ret_val, r__1; /* Local variables */ - extern /* Subroutine */ int scombssq_(real *, real *); + extern /* Subroutine */ void scombssq_(real *, real *); integer i__, j; logical udiag; extern logical lsame_(char *, char *); real value; extern logical sisnan_(real *); real colssq[2]; - extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, + extern /* Subroutine */ 
void slassq_(integer *, real *, integer *, real *, real *); real sum, ssq[2]; diff --git a/lapack-netlib/SRC/slanv2.c b/lapack-netlib/SRC/slanv2.c index 60578fa2c3..b26f84fcbe 100644 --- a/lapack-netlib/SRC/slanv2.c +++ b/lapack-netlib/SRC/slanv2.c @@ -643,7 +643,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slanv2_(real *a, real *b, real *c__, real *d__, real * +/* Subroutine */ void slanv2_(real *a, real *b, real *c__, real *d__, real * rt1r, real *rt1i, real *rt2r, real *rt2i, real *cs, real *sn) { /* System generated locals */ @@ -823,7 +823,7 @@ f"> */ *rt1i = sqrt((abs(*b))) * sqrt((abs(*c__))); *rt2i = -(*rt1i); } - return 0; + return; /* End of SLANV2 */ diff --git a/lapack-netlib/SRC/slaorhr_col_getrfnp.c b/lapack-netlib/SRC/slaorhr_col_getrfnp.c index 32f434a4fe..13bb846543 100644 --- a/lapack-netlib/SRC/slaorhr_col_getrfnp.c +++ b/lapack-netlib/SRC/slaorhr_col_getrfnp.c @@ -662,17 +662,17 @@ _col_getrfnp.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int slaorhr_col_getrfnp_(integer *m, integer *n, real *a, +/* Subroutine */ void slaorhr_col_getrfnp_(integer *m, integer *n, real *a, integer *lda, real *d__, integer *info) { /* System generated locals */ integer a_dim1, a_offset, i__1, i__2, i__3, i__4; /* Local variables */ - extern /* Subroutine */ int slaorhr_col_getrfnp2_(integer *, integer *, + extern /* Subroutine */ void slaorhr_col_getrfnp2_(integer *, integer *, real *, integer *, real *, integer *); integer j, iinfo; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, @@ -712,13 +712,13 @@ _col_getrfnp.f"> 
*/ if (*info != 0) { i__1 = -(*info); xerbla_("SLAORHR_COL_GETRFNP", &i__1, (ftnlen)19); - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* Determine the block size for this environment. */ @@ -769,7 +769,7 @@ _col_getrfnp.f"> */ } } } - return 0; + return; /* End of SLAORHR_COL_GETRFNP */ diff --git a/lapack-netlib/SRC/slaorhr_col_getrfnp2.c b/lapack-netlib/SRC/slaorhr_col_getrfnp2.c index 8b8dd73925..5569c9d92d 100644 --- a/lapack-netlib/SRC/slaorhr_col_getrfnp2.c +++ b/lapack-netlib/SRC/slaorhr_col_getrfnp2.c @@ -682,7 +682,7 @@ _col_getrfnp2.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int slaorhr_col_getrfnp2_(integer *m, integer *n, real *a, +/* Subroutine */ void slaorhr_col_getrfnp2_(integer *m, integer *n, real *a, integer *lda, real *d__, integer *info) { /* System generated locals */ @@ -691,12 +691,12 @@ _col_getrfnp2.f"> */ /* Local variables */ integer i__, iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); real sfmin; integer n1, n2; - extern /* Subroutine */ int strsm_(char *, char *, char *, char *, + extern /* Subroutine */ void strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ); extern real slamch_(char *); @@ -732,13 +732,13 @@ _col_getrfnp2.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLAORHR_COL_GETRFNP2", &i__1, (ftnlen)20); - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } if (*m == 1) { @@ -822,7 +822,7 @@ _col_getrfnp2.f"> */ lda, &d__[n1 + 1], &iinfo); } - return 0; + return; /* End of SLAORHR_COL_GETRFNP2 */ diff --git a/lapack-netlib/SRC/slapll.c 
b/lapack-netlib/SRC/slapll.c index 8a3386f468..aa284a9541 100644 --- a/lapack-netlib/SRC/slapll.c +++ b/lapack-netlib/SRC/slapll.c @@ -611,7 +611,7 @@ f"> */ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slapll_(integer *n, real *x, integer *incx, real *y, +/* Subroutine */ void slapll_(integer *n, real *x, integer *incx, real *y, integer *incy, real *ssmin) { /* System generated locals */ @@ -619,13 +619,13 @@ f"> */ /* Local variables */ extern real sdot_(integer *, real *, integer *, real *, integer *); - extern /* Subroutine */ int slas2_(real *, real *, real *, real *, real *) + extern /* Subroutine */ void slas2_(real *, real *, real *, real *, real *) ; real c__, ssmax; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + extern /* Subroutine */ void saxpy_(integer *, real *, real *, integer *, real *, integer *); real a11, a12, a22; - extern /* Subroutine */ int slarfg_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slarfg_(integer *, real *, real *, integer *, real *); real tau; @@ -648,7 +648,7 @@ f"> */ /* Function Body */ if (*n <= 1) { *ssmin = 0.f; - return 0; + return; } /* Compute the QR factorization of the N-by-2 matrix ( X Y ) */ @@ -670,7 +670,7 @@ f"> */ slas2_(&a11, &a12, &a22, ssmin, &ssmax); - return 0; + return; /* End of SLAPLL */ diff --git a/lapack-netlib/SRC/slapmr.c b/lapack-netlib/SRC/slapmr.c index a26bde8ff4..da542a823b 100644 --- a/lapack-netlib/SRC/slapmr.c +++ b/lapack-netlib/SRC/slapmr.c @@ -613,7 +613,7 @@ f"> */ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slapmr_(logical *forwrd, integer *m, integer *n, real *x, +/* Subroutine */ void slapmr_(logical *forwrd, integer *m, integer *n, real *x, integer *ldx, integer *k) { /* System generated locals */ @@ -641,7 +641,7 @@ f"> */ /* Function Body */ if (*m <= 1) 
{ - return 0; + return; } i__1 = *m; @@ -727,7 +727,7 @@ f"> */ } - return 0; + return; /* End of ZLAPMT */ diff --git a/lapack-netlib/SRC/slapmt.c b/lapack-netlib/SRC/slapmt.c index c1fe84d6db..cdbbd9dadf 100644 --- a/lapack-netlib/SRC/slapmt.c +++ b/lapack-netlib/SRC/slapmt.c @@ -613,7 +613,7 @@ f"> */ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slapmt_(logical *forwrd, integer *m, integer *n, real *x, +/* Subroutine */ void slapmt_(logical *forwrd, integer *m, integer *n, real *x, integer *ldx, integer *k) { /* System generated locals */ @@ -641,7 +641,7 @@ f"> */ /* Function Body */ if (*n <= 1) { - return 0; + return; } i__1 = *n; @@ -726,7 +726,7 @@ f"> */ } - return 0; + return; /* End of SLAPMT */ diff --git a/lapack-netlib/SRC/slaqgb.c b/lapack-netlib/SRC/slaqgb.c index e8ae4a8c26..f01adbe319 100644 --- a/lapack-netlib/SRC/slaqgb.c +++ b/lapack-netlib/SRC/slaqgb.c @@ -668,7 +668,7 @@ f"> */ /* > \ingroup realGBauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slaqgb_(integer *m, integer *n, integer *kl, integer *ku, +/* Subroutine */ void slaqgb_(integer *m, integer *n, integer *kl, integer *ku, real *ab, integer *ldab, real *r__, real *c__, real *rowcnd, real * colcnd, real *amax, char *equed) { @@ -702,7 +702,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. 
*/ @@ -781,7 +781,7 @@ f"> */ *(unsigned char *)equed = 'B'; } - return 0; + return; /* End of SLAQGB */ diff --git a/lapack-netlib/SRC/slaqge.c b/lapack-netlib/SRC/slaqge.c index ed6807ae18..4af7d4c8f4 100644 --- a/lapack-netlib/SRC/slaqge.c +++ b/lapack-netlib/SRC/slaqge.c @@ -651,7 +651,7 @@ f"> */ /* > \ingroup realGEauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slaqge_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void slaqge_(integer *m, integer *n, real *a, integer *lda, real *r__, real *c__, real *rowcnd, real *colcnd, real *amax, char * equed) { @@ -685,7 +685,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. */ @@ -749,7 +749,7 @@ f"> */ *(unsigned char *)equed = 'B'; } - return 0; + return; /* End of SLAQGE */ diff --git a/lapack-netlib/SRC/slaqp2.c b/lapack-netlib/SRC/slaqp2.c index c51a6863fa..3547e505d6 100644 --- a/lapack-netlib/SRC/slaqp2.c +++ b/lapack-netlib/SRC/slaqp2.c @@ -661,7 +661,7 @@ f"> */ /* > \endhtmlonly */ /* ===================================================================== */ -/* Subroutine */ int slaqp2_(integer *m, integer *n, integer *offset, real *a, +/* Subroutine */ void slaqp2_(integer *m, integer *n, integer *offset, real *a, integer *lda, integer *jpvt, real *tau, real *vn1, real *vn2, real * work) { @@ -675,14 +675,14 @@ f"> */ integer i__, j; real tol3z; integer offpi; - extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slarf_(char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *); integer itemp; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *); integer mn; extern real slamch_(char *); - extern /* Subroutine */ int slarfg_(integer *, real *, real *, 
integer *, + extern /* Subroutine */ void slarfg_(integer *, real *, real *, integer *, real *); extern integer isamax_(integer *, real *, integer *); real aii; @@ -796,7 +796,7 @@ f"> */ /* L20: */ } - return 0; + return; /* End of SLAQP2 */ diff --git a/lapack-netlib/SRC/slaqps.c b/lapack-netlib/SRC/slaqps.c index 3fdc4929d6..b9093db399 100644 --- a/lapack-netlib/SRC/slaqps.c +++ b/lapack-netlib/SRC/slaqps.c @@ -694,7 +694,7 @@ f"> */ /* > \endhtmlonly */ /* ===================================================================== */ -/* Subroutine */ int slaqps_(integer *m, integer *n, integer *offset, integer +/* Subroutine */ void slaqps_(integer *m, integer *n, integer *offset, integer *nb, integer *kb, real *a, integer *lda, integer *jpvt, real *tau, real *vn1, real *vn2, real *auxv, real *f, integer *ldf) { @@ -707,15 +707,15 @@ f"> */ extern real snrm2_(integer *, real *, integer *); integer j, k; real tol3z; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer itemp; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer *); integer rk; extern real slamch_(char *); - extern /* Subroutine */ int slarfg_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slarfg_(integer *, real *, real *, integer *, real *); integer lsticc; extern integer isamax_(integer *, real *, integer *); @@ -914,7 +914,7 @@ f"> */ goto L40; } - return 0; + return; /* End of SLAQPS */ diff --git a/lapack-netlib/SRC/slaqr0.c b/lapack-netlib/SRC/slaqr0.c index 98f000c77d..419e6ae71e 100644 --- a/lapack-netlib/SRC/slaqr0.c +++ b/lapack-netlib/SRC/slaqr0.c @@ -777,7 +777,7 @@ 
f"> */ /* > of Matrix Analysis, volume 23, pages 948--973, 2002. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slaqr0_(logical *wantt, logical *wantz, integer *n, +/* Subroutine */ void slaqr0_(logical *wantt, logical *wantz, integer *n, integer *ilo, integer *ihi, real *h__, integer *ldh, real *wr, real * wi, integer *iloz, integer *ihiz, real *z__, integer *ldz, real *work, integer *lwork, integer *info) @@ -793,16 +793,16 @@ f"> */ real zdum[1] /* was [1][1] */; integer kacc22, i__, k, itmax, nsmax, nwmax, kwtop; real aa, bb; - extern /* Subroutine */ int slanv2_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slanv2_(real *, real *, real *, real *, real * , real *, real *, real *, real *, real *); real cc; - extern /* Subroutine */ int slaqr3_(logical *, logical *, integer *, + extern /* Subroutine */ void slaqr3_(logical *, logical *, integer *, integer *, integer *, integer *, real *, integer *, integer *, integer *, real *, integer *, integer *, integer *, real *, real * , real *, integer *, integer *, real *, integer *, integer *, real *, integer *, real *, integer *); real dd; - extern /* Subroutine */ int slaqr4_(logical *, logical *, integer *, + extern /* Subroutine */ void slaqr4_(logical *, logical *, integer *, integer *, integer *, real *, integer *, real *, real *, integer * , integer *, real *, integer *, real *, integer *, integer *), slaqr5_(logical *, logical *, integer *, integer *, integer *, @@ -820,7 +820,7 @@ f"> */ extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); char jbcmpz[2]; - extern /* Subroutine */ int slahqr_(logical *, logical *, integer *, + extern /* Subroutine */ void slahqr_(logical *, logical *, integer *, integer *, integer *, real *, integer *, real *, real *, integer * , integer *, real *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *, real *, 
integer *); @@ -869,7 +869,7 @@ f"> */ if (*n == 0) { work[1] = 1.f; - return 0; + return; } if (*n <= 15) { @@ -951,7 +951,7 @@ f"> */ if (*lwork == -1) { work[1] = (real) lwkopt; - return 0; + return; } /* ==== SLAHQR/SLAQR0 crossover point ==== */ @@ -1338,6 +1338,6 @@ f"> */ /* ==== End of SLAQR0 ==== */ - return 0; + return; } /* slaqr0_ */ diff --git a/lapack-netlib/SRC/slaqr1.c b/lapack-netlib/SRC/slaqr1.c index ee8a9ca0a0..56f487c005 100644 --- a/lapack-netlib/SRC/slaqr1.c +++ b/lapack-netlib/SRC/slaqr1.c @@ -631,7 +631,7 @@ f"> */ /* > University of Kansas, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slaqr1_(integer *n, real *h__, integer *ldh, real *sr1, +/* Subroutine */ void slaqr1_(integer *n, real *h__, integer *ldh, real *sr1, real *si1, real *sr2, real *si2, real *v) { /* System generated locals */ @@ -661,7 +661,7 @@ f"> */ /* Function Body */ if (*n != 2 && *n != 3) { - return 0; + return; } if (*n == 2) { @@ -697,6 +697,6 @@ f"> */ sr2) + h21s * h__[(h_dim1 << 1) + 3]; } } - return 0; + return; } /* slaqr1_ */ diff --git a/lapack-netlib/SRC/slaqr2.c b/lapack-netlib/SRC/slaqr2.c index 55731cbd74..93ab490d8c 100644 --- a/lapack-netlib/SRC/slaqr2.c +++ b/lapack-netlib/SRC/slaqr2.c @@ -795,7 +795,7 @@ f"> */ /* > University of Kansas, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slaqr2_(logical *wantt, logical *wantz, integer *n, +/* Subroutine */ void slaqr2_(logical *wantt, logical *wantz, integer *n, integer *ktop, integer *kbot, integer *nw, real *h__, integer *ldh, integer *iloz, integer *ihiz, real *z__, integer *ldz, integer *ns, integer *nd, real *sr, real *si, real *v, integer *ldv, integer *nh, @@ -812,33 +812,33 @@ f"> */ integer kend, kcol, info, ifst, ilst, ltop, krow, i__, j, k; real s; logical bulge; - extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, + extern /* Subroutine */ 
void slarf_(char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *), sgemm_( char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer infqr; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); integer kwtop; real aa, bb, cc; - extern /* Subroutine */ int slanv2_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slanv2_(real *, real *, real *, real *, real * , real *, real *, real *, real *, real *); real dd, cs; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); real sn; integer jw; extern real slamch_(char *); - extern /* Subroutine */ int sgehrd_(integer *, integer *, integer *, real + extern /* Subroutine */ void sgehrd_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *); real safmin, safmax; - extern /* Subroutine */ int slarfg_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slarfg_(integer *, real *, real *, integer *, real *), slahqr_(logical *, logical *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer * , real *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slaset_( char *, integer *, integer *, real *, real *, real *, integer *); logical sorted; - extern /* Subroutine */ int strexc_(char *, integer *, real *, integer *, + extern /* Subroutine */ void strexc_(char *, integer *, real *, integer *, real *, integer *, integer *, integer *, real *, integer *), sormhr_(char *, char *, integer *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); @@ -911,7 +911,7 @@ f"> */ if (*lwork == -1) { work[1] = (real) lwkopt; - return 0; + return; } /* ==== Nothing to do ... 
*/ @@ -920,11 +920,11 @@ f"> */ *nd = 0; work[1] = 1.f; if (*ktop > *kbot) { - return 0; + return; } /* ... nor for an empty deflation window. ==== */ if (*nw < 1) { - return 0; + return; } /* ==== Machine constants ==== */ @@ -966,7 +966,7 @@ f"> */ } } work[1] = 1.f; - return 0; + return; } /* ==== Convert to spike-triangular form. (In case of a */ @@ -1307,6 +1307,6 @@ f"> */ /* ==== End of SLAQR2 ==== */ - return 0; + return; } /* slaqr2_ */ diff --git a/lapack-netlib/SRC/slaqr3.c b/lapack-netlib/SRC/slaqr3.c index ca495c9c6a..80fb05bccd 100644 --- a/lapack-netlib/SRC/slaqr3.c +++ b/lapack-netlib/SRC/slaqr3.c @@ -793,7 +793,7 @@ f"> */ /* > University of Kansas, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slaqr3_(logical *wantt, logical *wantz, integer *n, +/* Subroutine */ void slaqr3_(logical *wantt, logical *wantz, integer *n, integer *ktop, integer *kbot, integer *nw, real *h__, integer *ldh, integer *iloz, integer *ihiz, real *z__, integer *ldz, integer *ns, integer *nd, real *sr, real *si, real *v, integer *ldv, integer *nh, @@ -810,39 +810,39 @@ f"> */ integer kend, kcol, info, nmin, ifst, ilst, ltop, krow, i__, j, k; real s; logical bulge; - extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slarf_(char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *), sgemm_( char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer infqr; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); integer kwtop; real aa, bb, cc; - extern /* Subroutine */ int slanv2_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slanv2_(real *, real *, real *, real *, real * , real *, real *, real *, real *, real *); real dd; - extern /* 
Subroutine */ int slaqr4_(logical *, logical *, integer *, + extern /* Subroutine */ void slaqr4_(logical *, logical *, integer *, integer *, integer *, real *, integer *, real *, real *, integer * , integer *, real *, integer *, real *, integer *, integer *); real cs; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); real sn; integer jw; extern real slamch_(char *); - extern /* Subroutine */ int sgehrd_(integer *, integer *, integer *, real + extern /* Subroutine */ void sgehrd_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *); real safmin, safmax; extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slarfg_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slarfg_(integer *, real *, real *, integer *, real *), slahqr_(logical *, logical *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer * , real *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slaset_( char *, integer *, integer *, real *, real *, real *, integer *); logical sorted; - extern /* Subroutine */ int strexc_(char *, integer *, real *, integer *, + extern /* Subroutine */ void strexc_(char *, integer *, real *, integer *, real *, integer *, integer *, integer *, real *, integer *), sormhr_(char *, char *, integer *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); @@ -924,7 +924,7 @@ f"> */ if (*lwork == -1) { work[1] = (real) lwkopt; - return 0; + return; } /* ==== Nothing to do ... */ @@ -933,11 +933,11 @@ f"> */ *nd = 0; work[1] = 1.f; if (*ktop > *kbot) { - return 0; + return; } /* ... nor for an empty deflation window. 
==== */ if (*nw < 1) { - return 0; + return; } /* ==== Machine constants ==== */ @@ -979,7 +979,7 @@ f"> */ } } work[1] = 1.f; - return 0; + return; } /* ==== Convert to spike-triangular form. (In case of a */ @@ -1328,6 +1328,6 @@ f"> */ /* ==== End of SLAQR3 ==== */ - return 0; + return; } /* slaqr3_ */ diff --git a/lapack-netlib/SRC/slaqr4.c b/lapack-netlib/SRC/slaqr4.c index 3a08431856..8b54e58de1 100644 --- a/lapack-netlib/SRC/slaqr4.c +++ b/lapack-netlib/SRC/slaqr4.c @@ -786,7 +786,7 @@ f"> */ /* > of Matrix Analysis, volume 23, pages 948--973, 2002. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slaqr4_(logical *wantt, logical *wantz, integer *n, +/* Subroutine */ void slaqr4_(logical *wantt, logical *wantz, integer *n, integer *ilo, integer *ihi, real *h__, integer *ldh, real *wr, real * wi, integer *iloz, integer *ihiz, real *z__, integer *ldz, real *work, integer *lwork, integer *info) @@ -802,10 +802,10 @@ f"> */ real zdum[1] /* was [1][1] */; integer kacc22, i__, k, itmax, nsmax, nwmax, kwtop; real aa, bb; - extern /* Subroutine */ int slanv2_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slanv2_(real *, real *, real *, real *, real * , real *, real *, real *, real *, real *); real cc, dd; - extern /* Subroutine */ int slaqr2_(logical *, logical *, integer *, + extern /* Subroutine */ void slaqr2_(logical *, logical *, integer *, integer *, integer *, integer *, real *, integer *, integer *, integer *, real *, integer *, integer *, integer *, real *, real * , real *, integer *, integer *, real *, integer *, integer *, @@ -824,7 +824,7 @@ f"> */ extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); char jbcmpz[2]; - extern /* Subroutine */ int slahqr_(logical *, logical *, integer *, + extern /* Subroutine */ void slahqr_(logical *, logical *, integer *, integer *, integer *, real *, integer *, real *, real *, 
integer * , integer *, real *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); @@ -874,7 +874,7 @@ f"> */ if (*n == 0) { work[1] = 1.f; - return 0; + return; } if (*n <= 15) { @@ -956,7 +956,7 @@ f"> */ if (*lwork == -1) { work[1] = (real) lwkopt; - return 0; + return; } /* ==== SLAHQR/SLAQR0 crossover point ==== */ @@ -1336,6 +1336,6 @@ f"> */ /* ==== End of SLAQR4 ==== */ - return 0; + return; } /* slaqr4_ */ diff --git a/lapack-netlib/SRC/slaqr5.c b/lapack-netlib/SRC/slaqr5.c index bba9f571ce..ceaf406ca2 100644 --- a/lapack-netlib/SRC/slaqr5.c +++ b/lapack-netlib/SRC/slaqr5.c @@ -780,7 +780,7 @@ f"> */ /* > ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slaqr5_(logical *wantt, logical *wantz, integer *kacc22, +/* Subroutine */ void slaqr5_(logical *wantt, logical *wantz, integer *kacc22, integer *n, integer *ktop, integer *kbot, integer *nshfts, real *sr, real *si, real *h__, integer *ldh, integer *iloz, integer *ihiz, real *z__, integer *ldz, real *v, integer *ldv, real *u, integer *ldu, @@ -802,19 +802,19 @@ f"> */ real alpha; logical accum; integer ndcol, incol; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer krcol, nbmps, i2, k1, i4; - extern /* Subroutine */ int slaqr1_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slaqr1_(integer *, real *, integer *, real *, real *, real *, real *, real *); real h11, h12, h21, h22; integer m22; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); integer ns, nu; extern real slamch_(char *); real vt[3], safmin, safmax; - extern /* Subroutine */ int slarfg_(integer *, real *, real 
*, integer *, + extern /* Subroutine */ void slarfg_(integer *, real *, real *, integer *, real *), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *); @@ -858,14 +858,14 @@ f"> */ /* Function Body */ if (*nshfts < 2) { - return 0; + return; } /* ==== If the active block is empty or 1-by-1, then there */ /* . is nothing to do. ==== */ if (*ktop >= *kbot) { - return 0; + return; } /* ==== Shuffle shifts into pairs of real shifts and pairs */ @@ -1526,6 +1526,6 @@ f"> */ /* ==== End of SLAQR5 ==== */ - return 0; + return; } /* slaqr5_ */ diff --git a/lapack-netlib/SRC/slaqr5.f b/lapack-netlib/SRC/slaqr5.f index b9bae9376b..b10e597542 100644 --- a/lapack-netlib/SRC/slaqr5.f +++ b/lapack-netlib/SRC/slaqr5.f @@ -286,8 +286,8 @@ SUBROUTINE SLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, * .. * .. Local Scalars .. REAL ALPHA, BETA, H11, H12, H21, H22, REFSUM, - $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, - $ ULP + $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, T1, T2, + $ T3, TST1, TST2, ULP INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, $ M, M22, MBOT, MTOP, NBMPS, NDCOL, @@ -447,11 +447,12 @@ SUBROUTINE SLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, * ==== Perform update from right within * . computational window. 
==== * + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 30 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 30 CONTINUE * * ==== Perform update from left within @@ -464,11 +465,12 @@ SUBROUTINE SLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, ELSE JBOT = KBOT END IF + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 40 J = K+1, JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + REFSUM = H( K+1, J ) + V( 2, M22 )*H( K+2, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 40 CONTINUE * * ==== The following convergence test requires that @@ -522,18 +524,20 @@ SUBROUTINE SLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, * IF( ACCUM ) THEN KMS = K - INCOL + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 50 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + REFSUM = U( J, KMS+1 ) + V( 2, M22 )*U( J, KMS+2 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 50 CONTINUE ELSE IF( WANTZ ) THEN + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 60 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + REFSUM = Z( J, K+1 )+V( 2, M22 )*Z( J, K+2 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 60 CONTINUE END IF END IF @@ -554,10 +558,13 @@ SUBROUTINE SLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, * . Mth bulge. Exploit fact that first two elements * . 
of row are actually zero. ==== * - REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) - H( K+3, K ) = -REFSUM - H( K+3, K+1 ) = -REFSUM*V( 2, M ) - H( K+3, K+2 ) = H( K+3, K+2 ) - REFSUM*V( 3, M ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) + REFSUM = V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM*T1 + H( K+3, K+1 ) = -REFSUM*T2 + H( K+3, K+2 ) = H( K+3, K+2 ) - REFSUM*T3 * * ==== Calculate reflection to move * . Mth bulge one step. ==== @@ -593,11 +600,13 @@ SUBROUTINE SLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, $ VT ) ALPHA = VT( 1 ) CALL SLARFG( 3, ALPHA, VT( 2 ), 1, VT( 1 ) ) - REFSUM = VT( 1 )*( H( K+1, K )+VT( 2 )* - $ H( K+2, K ) ) + T1 = VT( 1 ) + T2 = T1*VT( 2 ) + T3 = T2*VT( 3 ) + REFSUM = H( K+1, K )+VT( 2 )*H( K+2, K ) * - IF( ABS( H( K+2, K )-REFSUM*VT( 2 ) )+ - $ ABS( REFSUM*VT( 3 ) ).GT.ULP* + IF( ABS( H( K+2, K )-REFSUM*T2 )+ + $ ABS( REFSUM*T3 ).GT.ULP* $ ( ABS( H( K, K ) )+ABS( H( K+1, $ K+1 ) )+ABS( H( K+2, K+2 ) ) ) ) THEN * @@ -615,7 +624,7 @@ SUBROUTINE SLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, * . Replace the old reflector with * . the new one. ==== * - H( K+1, K ) = H( K+1, K ) - REFSUM + H( K+1, K ) = H( K+1, K ) - REFSUM*T1 H( K+2, K ) = ZERO H( K+3, K ) = ZERO V( 1, M ) = VT( 1 ) @@ -631,22 +640,25 @@ SUBROUTINE SLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, * . deflation check. We still delay most of the * . updates from the left for efficiency. 
==== * + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 70 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 ) + $ + V( 3, M )*H( J, K+3 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 + H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3 70 CONTINUE * * ==== Perform update from left for subsequent * . column. ==== * - REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* - $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) - H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM - H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) - H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, K+1 ) + V( 2, M )*H( K+2, K+1 ) + $ + V( 3, M )*H( K+3, K+1 ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1 + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2 + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3 * * ==== The following convergence test requires that * . 
the tradition small-compared-to-nearby-diagonals @@ -706,12 +718,15 @@ SUBROUTINE SLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, * DO 100 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, J ) + V( 2, M )*H( K+2, J ) + $ + V( 3, M )*H( K+3, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 + H( K+3, J ) = H( K+3, J ) - REFSUM*T3 90 CONTINUE 100 CONTINUE * @@ -729,12 +744,15 @@ SUBROUTINE SLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, I2 = MAX( 1, KTOP-INCOL ) I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 110 J = I2, I4 - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 ) + $ + V( 3, M )*U( J, KMS+3 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3 110 CONTINUE 120 CONTINUE ELSE IF( WANTZ ) THEN @@ -745,12 +763,15 @@ SUBROUTINE SLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, * DO 140 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 130 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 ) + $ + V( 3, M )*Z( J, K+3 ) + 
Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3 130 CONTINUE 140 CONTINUE END IF diff --git a/lapack-netlib/SRC/slaqsb.c b/lapack-netlib/SRC/slaqsb.c index 3943f0894a..d6aa050dd4 100644 --- a/lapack-netlib/SRC/slaqsb.c +++ b/lapack-netlib/SRC/slaqsb.c @@ -649,7 +649,7 @@ f"> */ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slaqsb_(char *uplo, integer *n, integer *kd, real *ab, +/* Subroutine */ void slaqsb_(char *uplo, integer *n, integer *kd, real *ab, integer *ldab, real *s, real *scond, real *amax, char *equed) { /* System generated locals */ @@ -683,7 +683,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. */ @@ -738,7 +738,7 @@ f"> */ *(unsigned char *)equed = 'Y'; } - return 0; + return; /* End of SLAQSB */ diff --git a/lapack-netlib/SRC/slaqsp.c b/lapack-netlib/SRC/slaqsp.c index db1b815990..b49337226f 100644 --- a/lapack-netlib/SRC/slaqsp.c +++ b/lapack-netlib/SRC/slaqsp.c @@ -635,7 +635,7 @@ f"> */ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slaqsp_(char *uplo, integer *n, real *ap, real *s, real * +/* Subroutine */ void slaqsp_(char *uplo, integer *n, real *ap, real *s, real * scond, real *amax, char *equed) { /* System generated locals */ @@ -669,7 +669,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. 
*/ @@ -722,7 +722,7 @@ f"> */ *(unsigned char *)equed = 'Y'; } - return 0; + return; /* End of SLAQSP */ diff --git a/lapack-netlib/SRC/slaqsy.c b/lapack-netlib/SRC/slaqsy.c index 5028c317bb..e81c12778a 100644 --- a/lapack-netlib/SRC/slaqsy.c +++ b/lapack-netlib/SRC/slaqsy.c @@ -642,7 +642,7 @@ f"> */ /* > \ingroup realSYauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slaqsy_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void slaqsy_(char *uplo, integer *n, real *a, integer *lda, real *s, real *scond, real *amax, char *equed) { /* System generated locals */ @@ -676,7 +676,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. */ @@ -725,7 +725,7 @@ f"> */ *(unsigned char *)equed = 'Y'; } - return 0; + return; /* End of SLAQSY */ diff --git a/lapack-netlib/SRC/slaqtr.c b/lapack-netlib/SRC/slaqtr.c index 4030ba4a53..86c83abfdc 100644 --- a/lapack-netlib/SRC/slaqtr.c +++ b/lapack-netlib/SRC/slaqtr.c @@ -683,7 +683,7 @@ f"> */ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slaqtr_(logical *ltran, logical *lreal, integer *n, real +/* Subroutine */ void slaqtr_(logical *ltran, logical *lreal, integer *n, real *t, integer *ldt, real *b, real *w, real *scale, real *x, real *work, integer *info) { @@ -698,14 +698,14 @@ f"> */ real xmax, d__[4] /* was [2][2] */; integer i__, j, k; real v[4] /* was [2][2] */, z__; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer jnext; extern real sasum_(integer *, real *, integer *); integer j1, j2; real sminw; integer n1, n2; real xnorm; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + extern /* Subroutine */ void saxpy_(integer *, real *, real *, integer *, 
real *, integer *), slaln2_(logical *, integer *, integer *, real *, real *, real *, integer *, real *, real *, real *, integer *, real *, real *, real *, integer *, real *, real *, integer *); @@ -714,7 +714,7 @@ f"> */ real *, integer *, real *); real bignum; extern integer isamax_(integer *, real *, integer *); - extern /* Subroutine */ int sladiv_(real *, real *, real *, real *, real * + extern /* Subroutine */ void sladiv_(real *, real *, real *, real *, real * , real *); logical notran; real smlnum, rec, eps, tjj, tmp; @@ -746,7 +746,7 @@ f"> */ /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Set constants to control overflow */ @@ -1385,7 +1385,7 @@ f"> */ } - return 0; + return; /* End of SLAQTR */ diff --git a/lapack-netlib/SRC/slaqz0.f b/lapack-netlib/SRC/slaqz0.f index 15913be88c..2e06f9d42c 100644 --- a/lapack-netlib/SRC/slaqz0.f +++ b/lapack-netlib/SRC/slaqz0.f @@ -318,7 +318,8 @@ RECURSIVE SUBROUTINE SLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A, PARAMETER( ZERO = 0.0, ONE = 1.0, HALF = 0.5 ) * Local scalars - REAL :: SMLNUM, ULP, ESHIFT, SAFMIN, SAFMAX, C1, S1, TEMP, SWAP + REAL :: SMLNUM, ULP, ESHIFT, SAFMIN, SAFMAX, C1, S1, TEMP, SWAP, + $ BNORM, BTOL INTEGER :: ISTART, ISTOP, IITER, MAXIT, ISTART2, K, LD, NSHIFTS, $ NBLOCK, NW, NMIN, NIBBLE, N_UNDEFLATED, N_DEFLATED, $ NS, SWEEP_INFO, SHIFTPOS, LWORKREQ, K2, ISTARTM, @@ -330,7 +331,7 @@ RECURSIVE SUBROUTINE SLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A, * External Functions EXTERNAL :: XERBLA, SHGEQZ, SLAQZ3, SLAQZ4, SLASET, SLABAD, $ SLARTG, SROT - REAL, EXTERNAL :: SLAMCH + REAL, EXTERNAL :: SLAMCH, SLANHS LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -482,6 +483,9 @@ RECURSIVE SUBROUTINE SLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A, ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N )/ULP ) + BNORM = SLANHS( 'F', IHI-ILO+1, B( ILO, ILO ), LDB, WORK ) + BTOL = MAX( SAFMIN, ULP*BNORM ) + ISTART = ILO ISTOP = IHI MAXIT = 3*( IHI-ILO+1 ) @@ -558,15 +562,8 @@ 
RECURSIVE SUBROUTINE SLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A, * slow down the method when many infinite eigenvalues are present K = ISTOP DO WHILE ( K.GE.ISTART2 ) - TEMP = ZERO - IF( K .LT. ISTOP ) THEN - TEMP = TEMP+ABS( B( K, K+1 ) ) - END IF - IF( K .GT. ISTART2 ) THEN - TEMP = TEMP+ABS( B( K-1, K ) ) - END IF - IF( ABS( B( K, K ) ) .LT. MAX( SMLNUM, ULP*TEMP ) ) THEN + IF( ABS( B( K, K ) ) .LT. BTOL ) THEN * A diagonal element of B is negligable, move it * to the top and deflate it @@ -678,7 +675,7 @@ RECURSIVE SUBROUTINE SLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A, NS = MIN( NSHIFTS, ISTOP-ISTART2 ) NS = MIN( NS, N_UNDEFLATED ) - SHIFTPOS = ISTOP-N_DEFLATED-N_UNDEFLATED+1 + SHIFTPOS = ISTOP-N_UNDEFLATED+1 * * Shuffle shifts to put double shifts in front * This ensures that we don't split up a double shift diff --git a/lapack-netlib/SRC/slar1v.c b/lapack-netlib/SRC/slar1v.c index e88999ee79..32073a50fc 100644 --- a/lapack-netlib/SRC/slar1v.c +++ b/lapack-netlib/SRC/slar1v.c @@ -738,7 +738,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int slar1v_(integer *n, integer *b1, integer *bn, real * +/* Subroutine */ void slar1v_(integer *n, integer *b1, integer *bn, real * lambda, real *d__, real *l, real *ld, real *lld, real *pivmin, real * gaptol, real *z__, logical *wantnc, integer *negcnt, real *ztz, real * mingma, integer *r__, integer *isuppz, real *nrminv, real *resid, @@ -1030,7 +1030,7 @@ f"> */ *rqcorr = *mingma * tmp; - return 0; + return; /* End of SLAR1V */ diff --git a/lapack-netlib/SRC/slar2v.c b/lapack-netlib/SRC/slar2v.c index 6fea0cfebe..a442494748 100644 --- a/lapack-netlib/SRC/slar2v.c +++ b/lapack-netlib/SRC/slar2v.c @@ -620,7 +620,7 @@ f"> */ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slar2v_(integer *n, real *x, real *y, real 
*z__, integer +/* Subroutine */ void slar2v_(integer *n, real *x, real *y, real *z__, integer *incx, real *c__, real *s, integer *incc) { /* System generated locals */ @@ -677,6 +677,6 @@ f"> */ /* End of SLAR2V */ - return 0; + return; } /* slar2v_ */ diff --git a/lapack-netlib/SRC/slarf.c b/lapack-netlib/SRC/slarf.c index 3eaab3c524..344c7b18f6 100644 --- a/lapack-netlib/SRC/slarf.c +++ b/lapack-netlib/SRC/slarf.c @@ -639,7 +639,7 @@ static integer c__1 = 1; /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slarf_(char *side, integer *m, integer *n, real *v, +/* Subroutine */ void slarf_(char *side, integer *m, integer *n, real *v, integer *incv, real *tau, real *c__, integer *ldc, real *work) { /* System generated locals */ @@ -647,12 +647,12 @@ static integer c__1 = 1; real r__1; /* Local variables */ - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); integer i__; extern logical lsame_(char *, char *); integer lastc; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer lastv; logical applyleft; @@ -743,7 +743,7 @@ static integer c__1 = 1; c_offset], ldc); } } - return 0; + return; /* End of SLARF */ diff --git a/lapack-netlib/SRC/slarfb.c b/lapack-netlib/SRC/slarfb.c index dd528878b5..c6af561392 100644 --- a/lapack-netlib/SRC/slarfb.c +++ b/lapack-netlib/SRC/slarfb.c @@ -711,7 +711,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slarfb_(char *side, char *trans, char *direct, char * +/* Subroutine */ void slarfb_(char *side, char *trans, char *direct, char * storev, integer 
*m, integer *n, integer *k, real *v, integer *ldv, real *t, integer *ldt, real *c__, integer *ldc, real *work, integer * ldwork) @@ -723,7 +723,7 @@ f"> */ /* Local variables */ integer i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *), strmm_(char *, char *, char *, @@ -759,7 +759,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0) { - return 0; + return; } if (lsame_(trans, "N")) { @@ -1329,7 +1329,7 @@ f"> */ } } - return 0; + return; /* End of SLARFB */ diff --git a/lapack-netlib/SRC/slarfb_gett.c b/lapack-netlib/SRC/slarfb_gett.c index 2a245a32a3..f42da13ca2 100644 --- a/lapack-netlib/SRC/slarfb_gett.c +++ b/lapack-netlib/SRC/slarfb_gett.c @@ -904,7 +904,7 @@ gett.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slarfb_gett_(char *ident, integer *m, integer *n, +/* Subroutine */ void slarfb_gett_(char *ident, integer *m, integer *n, integer *k, real *t, integer *ldt, real *a, integer *lda, real *b, integer *ldb, real *work, integer *ldwork) { @@ -915,7 +915,7 @@ gett.f"> */ /* Local variables */ integer i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *), strmm_(char *, char *, char *, @@ -950,7 +950,7 @@ gett.f"> */ /* Function Body */ if (*m < 0 || *n <= 0 || *k == 0 || *k > *n) { - return 0; + return; } lnotident = ! 
lsame_(ident, "I"); @@ -1131,7 +1131,7 @@ gett.f"> */ } } - return 0; + return; /* End of SLARFB_GETT */ diff --git a/lapack-netlib/SRC/slarfg.c b/lapack-netlib/SRC/slarfg.c index e7784fe8a7..ed3c80fa06 100644 --- a/lapack-netlib/SRC/slarfg.c +++ b/lapack-netlib/SRC/slarfg.c @@ -615,7 +615,7 @@ f"> */ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slarfg_(integer *n, real *alpha, real *x, integer *incx, +/* Subroutine */ void slarfg_(integer *n, real *alpha, real *x, integer *incx, real *tau) { /* System generated locals */ @@ -626,7 +626,7 @@ f"> */ real beta; extern real snrm2_(integer *, real *, integer *); integer j; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); real xnorm; extern real slapy2_(real *, real *), slamch_(char *); real safmin, rsafmn; @@ -648,7 +648,7 @@ f"> */ /* Function Body */ if (*n <= 1) { *tau = 0.f; - return 0; + return; } i__1 = *n - 1; @@ -704,7 +704,7 @@ f"> */ *alpha = beta; } - return 0; + return; /* End of SLARFG */ diff --git a/lapack-netlib/SRC/slarfgp.c b/lapack-netlib/SRC/slarfgp.c index 08ea906cc7..7baef79996 100644 --- a/lapack-netlib/SRC/slarfgp.c +++ b/lapack-netlib/SRC/slarfgp.c @@ -613,7 +613,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slarfgp_(integer *n, real *alpha, real *x, integer *incx, +/* Subroutine */ void slarfgp_(integer *n, real *alpha, real *x, integer *incx, real *tau) { /* System generated locals */ @@ -624,7 +624,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ real beta; extern real snrm2_(integer *, real *, integer *); integer j; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + 
extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); real savealpha, xnorm; extern real slapy2_(real *, real *), slamch_(char *); real bignum, smlnum; @@ -646,7 +646,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Function Body */ if (*n <= 0) { *tau = 0.f; - return 0; + return; } i__1 = *n - 1; @@ -752,7 +752,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *alpha = beta; } - return 0; + return; /* End of SLARFGP */ diff --git a/lapack-netlib/SRC/slarft.c b/lapack-netlib/SRC/slarft.c index 4985943406..c498d809f6 100644 --- a/lapack-netlib/SRC/slarft.c +++ b/lapack-netlib/SRC/slarft.c @@ -677,7 +677,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slarft_(char *direct, char *storev, integer *n, integer * +/* Subroutine */ void slarft_(char *direct, char *storev, integer *n, integer * k, real *v, integer *ldv, real *tau, real *t, integer *ldt) { /* System generated locals */ @@ -687,13 +687,13 @@ f"> */ /* Local variables */ integer i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer lastv; - extern /* Subroutine */ int strmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void strmv_(char *, char *, char *, integer *, real *, integer *, real *, integer *); integer prevlastv; - extern /* Subroutine */ int mecago_(); + extern /* Subroutine */ void mecago_(); /* -- LAPACK auxiliary routine (version 3.7.0) -- */ @@ -718,7 +718,7 @@ f"> */ /* Function Body */ if (*n == 0) { - return 0; + return; } if (lsame_(direct, "F")) { @@ -879,7 +879,7 @@ f"> */ } } } - return 0; + return; /* End of SLARFT */ diff --git a/lapack-netlib/SRC/slarfx.c 
b/lapack-netlib/SRC/slarfx.c index 785f3951ab..681c939a33 100644 --- a/lapack-netlib/SRC/slarfx.c +++ b/lapack-netlib/SRC/slarfx.c @@ -634,7 +634,7 @@ f"> */ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slarfx_(char *side, integer *m, integer *n, real *v, +/* Subroutine */ void slarfx_(char *side, integer *m, integer *n, real *v, real *tau, real *c__, integer *ldc, real *work) { /* System generated locals */ @@ -643,7 +643,7 @@ f"> */ /* Local variables */ integer j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slarf_(char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *); real t1, t2, t3, t4, t5, t6, t7, t8, t9, v1, v2, v3, v4, v5, v6, v7, v8, v9, t10, v10, sum; @@ -667,7 +667,7 @@ f"> */ /* Function Body */ if (*tau == 0.f) { - return 0; + return; } if (lsame_(side, "L")) { @@ -1279,7 +1279,7 @@ f"> */ goto L410; } L410: - return 0; + return; /* End of SLARFX */ diff --git a/lapack-netlib/SRC/slarfy.c b/lapack-netlib/SRC/slarfy.c index cc5da7a249..c1eb373496 100644 --- a/lapack-netlib/SRC/slarfy.c +++ b/lapack-netlib/SRC/slarfy.c @@ -620,7 +620,7 @@ static integer c__1 = 1; /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slarfy_(char *uplo, integer *n, real *v, integer *incv, +/* Subroutine */ void slarfy_(char *uplo, integer *n, real *v, integer *incv, real *tau, real *c__, integer *ldc, real *work) { /* System generated locals */ @@ -629,10 +629,10 @@ static integer c__1 = 1; /* Local variables */ extern real sdot_(integer *, real *, integer *, real *, integer *); - extern /* Subroutine */ int ssyr2_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssyr2_(char *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); real 
alpha; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + extern /* Subroutine */ void saxpy_(integer *, real *, real *, integer *, real *, integer *), ssymv_(char *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); @@ -655,7 +655,7 @@ static integer c__1 = 1; /* Function Body */ if (*tau == 0.f) { - return 0; + return; } /* Form w:= C * v */ @@ -671,7 +671,7 @@ static integer c__1 = 1; r__1 = -(*tau); ssyr2_(uplo, n, &r__1, &v[1], incv, &work[1], &c__1, &c__[c_offset], ldc); - return 0; + return; /* End of SLARFY */ diff --git a/lapack-netlib/SRC/slargv.c b/lapack-netlib/SRC/slargv.c index 07a203af25..9c24fb24a9 100644 --- a/lapack-netlib/SRC/slargv.c +++ b/lapack-netlib/SRC/slargv.c @@ -613,7 +613,7 @@ f"> */ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slargv_(integer *n, real *x, integer *incx, real *y, +/* Subroutine */ void slargv_(integer *n, real *x, integer *incx, real *y, integer *incy, real *c__, integer *incc) { /* System generated locals */ @@ -673,7 +673,7 @@ f"> */ ix += *incx; /* L10: */ } - return 0; + return; /* End of SLARGV */ diff --git a/lapack-netlib/SRC/slarmm.c b/lapack-netlib/SRC/slarmm.c new file mode 100644 index 0000000000..95114e2f15 --- /dev/null +++ b/lapack-netlib/SRC/slarmm.c @@ -0,0 +1,605 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int 
shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + 
ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) 
+#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = 
' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 
1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b SLARMM */ + +/* Definition: */ +/* =========== */ + +/* REAL FUNCTION SLARMM( ANORM, BNORM, CNORM ) */ + +/* REAL ANORM, BNORM, CNORM */ + +/* > \par Purpose: */ +/* ======= 
*/ +/* > */ +/* > \verbatim */ +/* > */ +/* > SLARMM returns a factor s in (0, 1] such that the linear updates */ +/* > */ +/* > (s * C) - A * (s * B) and (s * C) - (s * A) * B */ +/* > */ +/* > cannot overflow, where A, B, and C are matrices of conforming */ +/* > dimensions. */ +/* > */ +/* > This is an auxiliary routine so there is no argument checking. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========= */ + +/* > \param[in] ANORM */ +/* > \verbatim */ +/* > ANORM is REAL */ +/* > The infinity norm of A. ANORM >= 0. */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] BNORM */ +/* > \verbatim */ +/* > BNORM is REAL */ +/* > The infinity norm of B. BNORM >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] CNORM */ +/* > \verbatim */ +/* > CNORM is REAL */ +/* > The infinity norm of C. CNORM >= 0. */ +/* > \endverbatim */ +/* > */ +/* > */ +/* ===================================================================== */ +/* > References: */ +/* > C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for */ +/* > Robust Solution of Triangular Linear Systems. In: International */ +/* > Conference on Parallel Processing and Applied Mathematics, pages */ +/* > 68--78. Springer, 2017. */ +/* > */ +/* > \ingroup OTHERauxiliary */ +/* ===================================================================== */ +real slarmm_(real *anorm, real *bnorm, real *cnorm) +{ + /* System generated locals */ + real ret_val; + + /* Local variables */ + extern real slamch_(char *); + real bignum, smlnum; + + + +/* Determine machine dependent parameters to control overflow. */ + + smlnum = slamch_("Safe minimum") / slamch_("Precision"); + bignum = 1.f / smlnum / 4.f; + +/* Compute a scale factor. 
*/ + + ret_val = 1.f; + if (*bnorm <= 1.f) { + if (*anorm * *bnorm > bignum - *cnorm) { + ret_val = .5f; + } + } else { + if (*anorm > (bignum - *cnorm) / *bnorm) { + ret_val = .5f / *bnorm; + } + } + return ret_val; + +/* ==== End of SLARMM ==== */ + +} /* slarmm_ */ + diff --git a/lapack-netlib/SRC/slarmm.f b/lapack-netlib/SRC/slarmm.f new file mode 100644 index 0000000000..643dd67487 --- /dev/null +++ b/lapack-netlib/SRC/slarmm.f @@ -0,0 +1,99 @@ +*> \brief \b SLARMM +* +* Definition: +* =========== +* +* REAL FUNCTION SLARMM( ANORM, BNORM, CNORM ) +* +* .. Scalar Arguments .. +* REAL ANORM, BNORM, CNORM +* .. +* +*> \par Purpose: +* ======= +*> +*> \verbatim +*> +*> SLARMM returns a factor s in (0, 1] such that the linear updates +*> +*> (s * C) - A * (s * B) and (s * C) - (s * A) * B +*> +*> cannot overflow, where A, B, and C are matrices of conforming +*> dimensions. +*> +*> This is an auxiliary routine so there is no argument checking. +*> \endverbatim +* +* Arguments: +* ========= +* +*> \param[in] ANORM +*> \verbatim +*> ANORM is REAL +*> The infinity norm of A. ANORM >= 0. +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] BNORM +*> \verbatim +*> BNORM is REAL +*> The infinity norm of B. BNORM >= 0. +*> \endverbatim +*> +*> \param[in] CNORM +*> \verbatim +*> CNORM is REAL +*> The infinity norm of C. CNORM >= 0. +*> \endverbatim +*> +*> +* ===================================================================== +*> References: +*> C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for +*> Robust Solution of Triangular Linear Systems. In: International +*> Conference on Parallel Processing and Applied Mathematics, pages +*> 68--78. Springer, 2017. +*> +*> \ingroup OTHERauxiliary +* ===================================================================== + + REAL FUNCTION SLARMM( ANORM, BNORM, CNORM ) + IMPLICIT NONE +* .. Scalar Arguments .. + REAL ANORM, BNORM, CNORM +* .. Parameters .. 
+ REAL ONE, HALF, FOUR + PARAMETER ( ONE = 1.0E0, HALF = 0.5E+0, FOUR = 4.0E+0 ) +* .. +* .. Local Scalars .. + REAL BIGNUM, SMLNUM +* .. +* .. External Functions .. + REAL SLAMCH + EXTERNAL SLAMCH +* .. +* .. Executable Statements .. +* +* +* Determine machine dependent parameters to control overflow. +* + SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) + BIGNUM = ( ONE / SMLNUM ) / FOUR +* +* Compute a scale factor. +* + SLARMM = ONE + IF( BNORM .LE. ONE ) THEN + IF( ANORM * BNORM .GT. BIGNUM - CNORM ) THEN + SLARMM = HALF + END IF + ELSE + IF( ANORM .GT. (BIGNUM - CNORM) / BNORM ) THEN + SLARMM = HALF / BNORM + END IF + END IF + RETURN +* +* ==== End of SLARMM ==== +* + END diff --git a/lapack-netlib/SRC/slarnv.c b/lapack-netlib/SRC/slarnv.c index cb29a357ab..38e6d01dc5 100644 --- a/lapack-netlib/SRC/slarnv.c +++ b/lapack-netlib/SRC/slarnv.c @@ -606,7 +606,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slarnv_(integer *idist, integer *iseed, integer *n, real +/* Subroutine */ void slarnv_(integer *idist, integer *iseed, integer *n, real *x) { /* System generated locals */ @@ -616,7 +616,7 @@ f"> */ integer i__; real u[128]; integer il, iv, il2; - extern /* Subroutine */ int slaruv_(integer *, integer *, real *); + extern /* Subroutine */ void slaruv_(integer *, integer *, real *); /* -- LAPACK auxiliary routine (version 3.7.0) -- */ @@ -680,7 +680,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of SLARNV */ diff --git a/lapack-netlib/SRC/slarra.c b/lapack-netlib/SRC/slarra.c index f1e4fb0513..22699f073b 100644 --- a/lapack-netlib/SRC/slarra.c +++ b/lapack-netlib/SRC/slarra.c @@ -644,7 +644,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int slarra_(integer *n, real *d__, real *e, real *e2, real * +/* Subroutine */ void 
slarra_(integer *n, real *d__, real *e, real *e2, real * spltol, real *tnrm, integer *nsplit, integer *isplit, integer *info) { /* System generated locals */ @@ -678,7 +678,7 @@ f"> */ /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } /* Compute splitting points */ @@ -713,7 +713,7 @@ f"> */ } } isplit[*nsplit] = *n; - return 0; + return; /* End of SLARRA */ diff --git a/lapack-netlib/SRC/slarra.f b/lapack-netlib/SRC/slarra.f index be81b0f691..2e27383e13 100644 --- a/lapack-netlib/SRC/slarra.f +++ b/lapack-netlib/SRC/slarra.f @@ -164,6 +164,7 @@ SUBROUTINE SLARRA( N, D, E, E2, SPLTOL, TNRM, * .. Executable Statements .. * INFO = 0 + NSPLIT = 1 * * Quick return if possible * @@ -172,7 +173,6 @@ SUBROUTINE SLARRA( N, D, E, E2, SPLTOL, TNRM, END IF * * Compute splitting points - NSPLIT = 1 IF(SPLTOL.LT.ZERO) THEN * Criterion based on absolute off-diagonal value TMP1 = ABS(SPLTOL)* TNRM diff --git a/lapack-netlib/SRC/slarrb.c b/lapack-netlib/SRC/slarrb.c index b8139fda6f..b9f080a205 100644 --- a/lapack-netlib/SRC/slarrb.c +++ b/lapack-netlib/SRC/slarrb.c @@ -703,7 +703,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int slarrb_(integer *n, real *d__, real *lld, integer * +/* Subroutine */ void slarrb_(integer *n, real *d__, real *lld, integer * ifirst, integer *ilast, real *rtol1, real *rtol2, integer *offset, real *w, real *wgap, real *werr, real *work, integer *iwork, real * pivmin, real *spdiam, integer *twist, integer *info) @@ -750,7 +750,7 @@ f"> */ /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } maxitr = (integer) ((log(*spdiam + *pivmin) - log(*pivmin)) / log(2.f)) + @@ -932,7 +932,7 @@ f"> */ wgap[ii - 1] = f2cmax(r__1,r__2); /* L111: */ } - return 0; + return; /* End of SLARRB */ diff --git a/lapack-netlib/SRC/slarrc.c b/lapack-netlib/SRC/slarrc.c index f774dc4310..a3bd9fbe1d 100644 --- 
a/lapack-netlib/SRC/slarrc.c +++ b/lapack-netlib/SRC/slarrc.c @@ -645,7 +645,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int slarrc_(char *jobt, integer *n, real *vl, real *vu, real +/* Subroutine */ void slarrc_(char *jobt, integer *n, real *vl, real *vu, real *d__, real *e, real *pivmin, integer *eigcnt, integer *lcnt, integer * rcnt, integer *info) { @@ -679,7 +679,7 @@ f"> */ /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } *lcnt = 0; @@ -752,7 +752,7 @@ f"> */ } } *eigcnt = *rcnt - *lcnt; - return 0; + return; /* end of SLARRC */ diff --git a/lapack-netlib/SRC/slarrc.f b/lapack-netlib/SRC/slarrc.f index 060051539f..2100d1b3dc 100644 --- a/lapack-netlib/SRC/slarrc.f +++ b/lapack-netlib/SRC/slarrc.f @@ -167,6 +167,9 @@ SUBROUTINE SLARRC( JOBT, N, VL, VU, D, E, PIVMIN, * .. Executable Statements .. * INFO = 0 + LCNT = 0 + RCNT = 0 + EIGCNT = 0 * * Quick return if possible * @@ -174,9 +177,6 @@ SUBROUTINE SLARRC( JOBT, N, VL, VU, D, E, PIVMIN, RETURN END IF * - LCNT = 0 - RCNT = 0 - EIGCNT = 0 MATT = LSAME( JOBT, 'T' ) diff --git a/lapack-netlib/SRC/slarrd.c b/lapack-netlib/SRC/slarrd.c index 672091d4d3..c5e703439c 100644 --- a/lapack-netlib/SRC/slarrd.c +++ b/lapack-netlib/SRC/slarrd.c @@ -844,7 +844,7 @@ f"> */ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slarrd_(char *range, char *order, integer *n, real *vl, +/* Subroutine */ void slarrd_(char *range, char *order, integer *n, real *vl, real *vu, integer *il, integer *iu, real *gers, real *reltol, real * d__, real *e, real *e2, real *pivmin, integer *nsplit, integer * isplit, integer *m, real *w, real *werr, real *wl, real *wu, integer * @@ -871,7 +871,7 @@ f"> */ extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer 
idiscu; - extern /* Subroutine */ int slaebz_(integer *, integer *, integer *, + extern /* Subroutine */ void slaebz_(integer *, integer *, integer *, integer *, integer *, integer *, real *, real *, real *, real *, real *, real *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); @@ -912,7 +912,7 @@ f"> */ /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } /* Decode RANGE */ @@ -947,7 +947,7 @@ f"> */ } if (*info != 0) { - return 0; + return; } /* Initialize error flags */ *info = 0; @@ -956,7 +956,7 @@ f"> */ /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } /* Simplification: */ if (irange == 3 && *il == 1 && *iu == *n) { @@ -977,7 +977,7 @@ f"> */ iblock[1] = 1; indexw[1] = 1; } - return 0; + return; } /* NB is the minimum vector length for vector bisection, or 0 */ /* if only scalar is to be done. */ @@ -1042,7 +1042,7 @@ f"> */ , &iout, &iwork[1], &w[1], &iblock[1], &iinfo); if (iinfo != 0) { *info = iinfo; - return 0; + return; } /* On exit, output intervals may not be ordered by ascending negcount */ if (iwork[6] == *iu) { @@ -1064,7 +1064,7 @@ f"> */ /* and [WUL, WU] contains a value with negcount NWU. */ if (nwl < 0 || nwl >= *n || nwu < 1 || nwu > *n) { *info = 4; - return 0; + return; } } else if (irange == 2) { *wl = *vl; @@ -1202,7 +1202,7 @@ f"> */ w[*m + 1], &iblock[*m + 1], &iinfo); if (iinfo != 0) { *info = iinfo; - return 0; + return; } nwl += iwork[1]; @@ -1217,7 +1217,7 @@ f"> */ &w[*m + 1], &iblock[*m + 1], &iinfo); if (iinfo != 0) { *info = iinfo; - return 0; + return; } /* Copy eigenvalues into W and IBLOCK */ @@ -1410,7 +1410,7 @@ f"> */ if (toofew) { *info += 2; } - return 0; + return; /* End of SLARRD */ diff --git a/lapack-netlib/SRC/slarrd.f b/lapack-netlib/SRC/slarrd.f index 7df8e95fca..21405baa6a 100644 --- a/lapack-netlib/SRC/slarrd.f +++ b/lapack-netlib/SRC/slarrd.f @@ -381,6 +381,7 @@ SUBROUTINE SLARRD( RANGE, ORDER, N, VL, VU, IL, IU, GERS, * .. 
Executable Statements .. * INFO = 0 + M = 0 * * Quick return if possible * @@ -424,14 +425,9 @@ SUBROUTINE SLARRD( RANGE, ORDER, N, VL, VU, IL, IU, GERS, END IF * Initialize error flags - INFO = 0 NCNVRG = .FALSE. TOOFEW = .FALSE. -* Quick return if possible - M = 0 - IF( N.EQ.0 ) RETURN - * Simplification: IF( IRANGE.EQ.INDRNG .AND. IL.EQ.1 .AND. IU.EQ.N ) IRANGE = 1 diff --git a/lapack-netlib/SRC/slarre.c b/lapack-netlib/SRC/slarre.c index 57a91249d3..ae44ff4d20 100644 --- a/lapack-netlib/SRC/slarre.c +++ b/lapack-netlib/SRC/slarre.c @@ -817,7 +817,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA \n */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slarre_(char *range, integer *n, real *vl, real *vu, +/* Subroutine */ void slarre_(char *range, integer *n, real *vl, real *vu, integer *il, integer *iu, real *d__, real *e, real *e2, real *rtol1, real *rtol2, real *spltol, integer *nsplit, integer *isplit, integer * m, real *w, real *werr, real *wgap, integer *iblock, integer *indexw, @@ -840,10 +840,10 @@ f"> */ extern logical lsame_(char *, char *); integer iinfo; logical norep; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); real s1, s2; - extern /* Subroutine */ int slasq2_(integer *, real *, integer *); + extern /* Subroutine */ void slasq2_(integer *, real *, integer *); integer mb; real gl; integer in, mm; @@ -855,11 +855,11 @@ f"> */ extern real slamch_(char *); integer wbegin; real safmin, spdiam; - extern /* Subroutine */ int slarra_(integer *, real *, real *, real *, + extern /* Subroutine */ void slarra_(integer *, real *, real *, real *, real *, real *, integer *, integer *, integer *); logical usedqd; real clwdth, isleft; - extern /* Subroutine */ int slarrb_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slarrb_(integer *, real *, real 
*, integer *, integer *, real *, real *, integer *, real *, real *, real *, real *, integer *, real *, real *, integer *, integer *), slarrc_( char *, integer *, real *, real *, real *, real *, real *, @@ -871,7 +871,7 @@ f"> */ integer *, real *, real *, real *, real *, real *, real *, real *, real *, integer *); real isrght, bsrtol, dpivot; - extern /* Subroutine */ int slarnv_(integer *, integer *, integer *, real + extern /* Subroutine */ void slarnv_(integer *, integer *, integer *, real *); integer cnt; real eps, tau, tmp, rtl; @@ -908,7 +908,7 @@ f"> */ /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } /* Decode RANGE */ @@ -947,7 +947,7 @@ f"> */ } /* store the shift for the initial RRR, which is zero in this case */ e[1] = 0.f; - return 0; + return; } /* General case: tridiagonal matrix of order > 1 */ @@ -1012,7 +1012,7 @@ f"> */ vl, vu, &iblock[1], &indexw[1], &work[1], &iwork[1], &iinfo); if (iinfo != 0) { *info = -1; - return 0; + return; } /* Make sure that the entries M+1 to N in W, WERR, IBLOCK, INDEXW are 0 */ i__1 = *n; @@ -1121,7 +1121,7 @@ f"> */ rtl, &tmp, &tmp1, &iinfo); if (iinfo != 0) { *info = -1; - return 0; + return; } /* Computing MAX */ r__2 = gl, r__3 = tmp - tmp1 - eps * 100.f * (r__1 = tmp - tmp1, @@ -1131,7 +1131,7 @@ f"> */ rtl, &tmp, &tmp1, &iinfo); if (iinfo != 0) { *info = -1; - return 0; + return; } /* Computing MIN */ r__2 = gu, r__3 = tmp + tmp1 + eps * 100.f * (r__1 = tmp + tmp1, @@ -1318,7 +1318,7 @@ f"> */ /* if the program reaches this point, no base representation could be */ /* found in MAXTRY iterations. */ *info = 2; - return 0; + return; L83: /* At this point, we have found an initial base representation */ /* T - SIGMA I = L D L^T with not too much element growth. 
*/ @@ -1384,7 +1384,7 @@ f"> */ iinfo); if (iinfo != 0) { *info = -4; - return 0; + return; } /* SLARRB computes all gaps correctly except for the last one */ /* Record distance to VU/GU */ @@ -1427,14 +1427,14 @@ f"> */ /* and should be changed. The index is in IWORK(1) and the */ /* gap is in WORK(N+1) */ *info = -5; - return 0; + return; } else { /* Test that all eigenvalues are positive as expected */ i__2 = in; for (i__ = 1; i__ <= i__2; ++i__) { if (work[i__] < 0.f) { *info = -6; - return 0; + return; } /* L149: */ } @@ -1484,7 +1484,7 @@ f"> */ ; } - return 0; + return; /* end of SLARRE */ diff --git a/lapack-netlib/SRC/slarre.f b/lapack-netlib/SRC/slarre.f index 34dd71fd9d..2e34ca5a6a 100644 --- a/lapack-netlib/SRC/slarre.f +++ b/lapack-netlib/SRC/slarre.f @@ -367,6 +367,8 @@ SUBROUTINE SLARRE( RANGE, N, VL, VU, IL, IU, D, E, E2, * INFO = 0 + NSPLIT = 0 + M = 0 * * Quick return if possible * @@ -384,8 +386,6 @@ SUBROUTINE SLARRE( RANGE, N, VL, VU, IL, IU, D, E, E2, IRANGE = INDRNG END IF - M = 0 - * Get machine constants SAFMIN = SLAMCH( 'S' ) EPS = SLAMCH( 'P' ) diff --git a/lapack-netlib/SRC/slarrf.c b/lapack-netlib/SRC/slarrf.c index 12411b9200..a169b50e60 100644 --- a/lapack-netlib/SRC/slarrf.c +++ b/lapack-netlib/SRC/slarrf.c @@ -704,7 +704,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int slarrf_(integer *n, real *d__, real *l, real *ld, +/* Subroutine */ void slarrf_(integer *n, real *d__, real *l, real *ld, integer *clstrt, integer *clend, real *w, real *wgap, real *werr, real *spdiam, real *clgapl, real *clgapr, real *pivmin, real *sigma, real *dplus, real *lplus, real *work, integer *info) @@ -722,7 +722,7 @@ f"> */ integer i__; real s, avgap, ldmax, rdmax; integer shift; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, 
integer *); real bestshift, smlgrowth; logical dorrr1; @@ -765,7 +765,7 @@ f"> */ /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } fact = 2.f; @@ -1010,7 +1010,7 @@ f"> */ goto L5; } else { *info = 1; - return 0; + return; } } L100: @@ -1021,7 +1021,7 @@ f"> */ i__1 = *n - 1; scopy_(&i__1, &work[*n + 1], &c__1, &lplus[1], &c__1); } - return 0; + return; /* End of SLARRF */ diff --git a/lapack-netlib/SRC/slarrj.c b/lapack-netlib/SRC/slarrj.c index ad2896a105..18bfc4b4a4 100644 --- a/lapack-netlib/SRC/slarrj.c +++ b/lapack-netlib/SRC/slarrj.c @@ -675,7 +675,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int slarrj_(integer *n, real *d__, real *e2, integer *ifirst, +/* Subroutine */ void slarrj_(integer *n, real *d__, real *e2, integer *ifirst, integer *ilast, real *rtol, integer *offset, real *w, real *werr, real *work, integer *iwork, real *pivmin, real *spdiam, integer *info) { @@ -717,7 +717,7 @@ f"> */ /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } maxitr = (integer) ((log(*spdiam + *pivmin) - log(*pivmin)) / log(2.f)) + @@ -909,7 +909,7 @@ f"> */ /* L110: */ } - return 0; + return; /* End of SLARRJ */ diff --git a/lapack-netlib/SRC/slarrk.c b/lapack-netlib/SRC/slarrk.c index 67794ca2f7..1dbbcb4050 100644 --- a/lapack-netlib/SRC/slarrk.c +++ b/lapack-netlib/SRC/slarrk.c @@ -654,7 +654,7 @@ f"> */ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slarrk_(integer *n, integer *iw, real *gl, real *gu, +/* Subroutine */ void slarrk_(integer *n, integer *iw, real *gl, real *gu, real *d__, real *e2, real *pivmin, real *reltol, real *w, real *werr, integer *info) { @@ -692,7 +692,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *info = 0; - return 0; + return; } /* Get machine constants */ @@ -761,7 +761,7 @@ f"> */ *w = (left + 
right) * .5f; *werr = (r__1 = right - left, abs(r__1)) * .5f; - return 0; + return; /* End of SLARRK */ diff --git a/lapack-netlib/SRC/slarrr.c b/lapack-netlib/SRC/slarrr.c index 1bd03571c0..e77ee1d8b0 100644 --- a/lapack-netlib/SRC/slarrr.c +++ b/lapack-netlib/SRC/slarrr.c @@ -604,7 +604,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int slarrr_(integer *n, real *d__, real *e, integer *info) +/* Subroutine */ void slarrr_(integer *n, real *d__, real *e, integer *info) { /* System generated locals */ integer i__1; @@ -639,7 +639,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *info = 0; - return 0; + return; } /* As a default, do NOT go for relative-accuracy preserving computations. */ @@ -697,7 +697,7 @@ f"> */ L11: if (yesrel) { *info = 0; - return 0; + return; } else { } @@ -716,7 +716,7 @@ f"> */ /* to the computed eigenvectors (and the support) */ - return 0; + return; /* END OF SLARRR */ diff --git a/lapack-netlib/SRC/slarrv.c b/lapack-netlib/SRC/slarrv.c index 9f4cecb1fc..e922e882b3 100644 --- a/lapack-netlib/SRC/slarrv.c +++ b/lapack-netlib/SRC/slarrv.c @@ -804,7 +804,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int slarrv_(integer *n, real *vl, real *vu, real *d__, real * +/* Subroutine */ void slarrv_(integer *n, real *vl, real *vu, real *d__, real * l, real *pivmin, integer *isplit, integer *m, integer *dol, integer * dou, real *minrgp, real *rtol1, real *rtol2, real *w, real *werr, real *wgap, integer *iblock, integer *indexw, real *gers, real *z__, @@ -829,15 +829,15 @@ f"> */ real sigma; integer iinfo, iindr; real resid; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); logical eskip; real right; integer 
nclus, zfrom; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); real rqtol; integer iindc1, iindc2, miniwsize; - extern /* Subroutine */ int slar1v_(integer *, integer *, integer *, real + extern /* Subroutine */ void slar1v_(integer *, integer *, integer *, real *, real *, real *, real *, real *, real *, real *, real *, logical *, integer *, real *, real *, integer *, integer *, real * , real *, real *, real *); @@ -861,7 +861,7 @@ f"> */ logical usedbs; integer iindwk, offset; real gaptol; - extern /* Subroutine */ int slarrb_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slarrb_(integer *, real *, real *, integer *, integer *, real *, real *, integer *, real *, real *, real *, real *, integer *, real *, real *, integer *, integer *), slarrf_( integer *, real *, real *, real *, integer *, integer *, real *, @@ -875,7 +875,7 @@ f"> */ logical tryrqc; integer isupmx; real rqcorr; - extern /* Subroutine */ int slaset_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slaset_(char *, integer *, integer *, real *, real *, real *, integer *); real gap, eps, tau, tol, tmp; integer zto; @@ -913,7 +913,7 @@ f"> */ /* Quick return if possible */ if (*n <= 0 || *m <= 0) { - return 0; + return; } /* The first N entries of WORK are reserved for the eigenvalues */ @@ -1063,7 +1063,7 @@ f"> */ /* This is a crude protection against infinitely deep trees */ if (ndepth > *m) { *info = -2; - return 0; + return; } /* breadth first processing of the current level of the representation */ /* tree: OLDNCL = number of clusters on current level */ @@ -1143,7 +1143,7 @@ f"> */ iindwk], pivmin, &spdiam, &in, &iinfo); if (iinfo != 0) { *info = -1; - return 0; + return; } /* We also recompute the extremal gaps. 
W holds all eigenvalues */ /* of the unshifted matrix and must be used for computation */ @@ -1310,7 +1310,7 @@ f"> */ iwork[k] = newlst; } else { *info = -2; - return 0; + return; } } else { @@ -1414,7 +1414,7 @@ f"> */ iindwk], pivmin, &spdiam, &itmp1, &iinfo); if (iinfo != 0) { *info = -3; - return 0; + return; } lambda = work[windex]; /* Reset twist index from inaccurate LAMBDA to */ @@ -1509,7 +1509,7 @@ f"> */ goto L120; } else { *info = 5; - return 0; + return; } } else { stp2ii = FALSE_; @@ -1602,7 +1602,7 @@ f"> */ ; } - return 0; + return; /* End of SLARRV */ diff --git a/lapack-netlib/SRC/slarscl2.c b/lapack-netlib/SRC/slarscl2.c index 0d61685a6f..1403ea61dd 100644 --- a/lapack-netlib/SRC/slarscl2.c +++ b/lapack-netlib/SRC/slarscl2.c @@ -599,7 +599,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int slarscl2_(integer *m, integer *n, real *d__, real *x, +/* Subroutine */ void slarscl2_(integer *m, integer *n, real *d__, real *x, integer *ldx) { /* System generated locals */ @@ -632,6 +632,6 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ x[i__ + j * x_dim1] /= d__[i__]; } } - return 0; + return; } /* slarscl2_ */ diff --git a/lapack-netlib/SRC/slarscl2.f b/lapack-netlib/SRC/slarscl2.f index 5726f12cd5..c7b77c9083 100644 --- a/lapack-netlib/SRC/slarscl2.f +++ b/lapack-netlib/SRC/slarscl2.f @@ -1,4 +1,4 @@ -*> \brief \b SLARSCL2 performs reciprocal diagonal scaling on a vector. +*> \brief \b SLARSCL2 performs reciprocal diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -33,7 +33,7 @@ *> *> \verbatim *> -*> SLARSCL2 performs a reciprocal diagonal scaling on an vector: +*> SLARSCL2 performs a reciprocal diagonal scaling on a matrix: *> x <-- inv(D) * x *> where the diagonal matrix D is stored as a vector. 
*> @@ -65,14 +65,14 @@ *> \param[in,out] X *> \verbatim *> X is REAL array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/slartg.c b/lapack-netlib/SRC/slartg.c index 94d9b506b0..d3ba18ab75 100644 --- a/lapack-netlib/SRC/slartg.c +++ b/lapack-netlib/SRC/slartg.c @@ -608,7 +608,7 @@ f"> */ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slartg_(real *f, real *g, real *cs, real *sn, real *r__) +/* Subroutine */ void slartg_(real *f, real *g, real *cs, real *sn, real *r__) { /* System generated locals */ integer i__1; @@ -721,7 +721,7 @@ f"> */ *r__ = -(*r__); } } - return 0; + return; /* End of SLARTG */ diff --git a/lapack-netlib/SRC/slartg.f90 b/lapack-netlib/SRC/slartg.f90 index a9af1aa8d5..8a5a8f26a3 100644 --- a/lapack-netlib/SRC/slartg.f90 +++ b/lapack-netlib/SRC/slartg.f90 @@ -35,7 +35,7 @@ !> square root of the sum of squares. !> !> This version is discontinuous in R at F = 0 but it returns the same -!> C and S as SLARTG for complex inputs (F,0) and (G,0). +!> C and S as CLARTG for complex inputs (F,0) and (G,0). !> !> This is a more accurate version of the BLAS1 routine SROTG, !> with the following other differences: @@ -45,8 +45,6 @@ !> floating point operations (saves work in SBDSQR when !> there are zeros on the diagonal). !> -!> If F exceeds G in magnitude, C will be positive. -!> !> Below, wp=>sp stands for single precision from LA_CONSTANTS module. !> \endverbatim ! 
@@ -112,7 +110,7 @@ subroutine SLARTG( f, g, c, s, r ) use LA_CONSTANTS, & only: wp=>sp, zero=>szero, half=>shalf, one=>sone, & - rtmin=>srtmin, rtmax=>srtmax, safmin=>ssafmin, safmax=>ssafmax + safmin=>ssafmin, safmax=>ssafmax ! ! -- LAPACK auxiliary routine -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -123,11 +121,15 @@ subroutine SLARTG( f, g, c, s, r ) real(wp) :: c, f, g, r, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, fs, g1, gs, p, u, uu + real(wp) :: d, f1, fs, g1, gs, u, rtmin, rtmax ! .. ! .. Intrinsic Functions .. intrinsic :: abs, sign, sqrt ! .. +! .. Constants .. + rtmin = sqrt( safmin ) + rtmax = sqrt( safmax/2 ) +! .. ! .. Executable Statements .. ! f1 = abs( f ) @@ -143,20 +145,18 @@ subroutine SLARTG( f, g, c, s, r ) else if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. g1 < rtmax ) then d = sqrt( f*f + g*g ) - p = one / d - c = f1*p - s = g*sign( p, f ) + c = f1 / d r = sign( d, f ) + s = g / r else u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - fs = f*uu - gs = g*uu + fs = f / u + gs = g / u d = sqrt( fs*fs + gs*gs ) - p = one / d - c = abs( fs )*p - s = gs*sign( p, f ) - r = sign( d, f )*u + c = abs( fs ) / d + r = sign( d, f ) + s = gs / r + r = r*u end if return end subroutine diff --git a/lapack-netlib/SRC/slartgp.c b/lapack-netlib/SRC/slartgp.c index e0753cfc34..51995ea7f9 100644 --- a/lapack-netlib/SRC/slartgp.c +++ b/lapack-netlib/SRC/slartgp.c @@ -610,7 +610,7 @@ static real c_b6 = 1.f; /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slartgp_(real *f, real *g, real *cs, real *sn, real *r__) +/* Subroutine */ void slartgp_(real *f, real *g, real *cs, real *sn, real *r__) { /* System generated locals */ integer i__1; @@ -723,7 +723,7 @@ static real c_b6 = 1.f; *r__ = -(*r__); } } - return 0; + return; /* End of SLARTG */ diff --git a/lapack-netlib/SRC/slartgs.c b/lapack-netlib/SRC/slartgs.c index 
482b2ccd17..a7517ec1fd 100644 --- a/lapack-netlib/SRC/slartgs.c +++ b/lapack-netlib/SRC/slartgs.c @@ -602,13 +602,13 @@ he bidiagonal SVD problem. */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int slartgs_(real *x, real *y, real *sigma, real *cs, real * +/* Subroutine */ void slartgs_(real *x, real *y, real *sigma, real *cs, real * sn) { real r__, s, w, z__; extern real slamch_(char *); real thresh; - extern /* Subroutine */ int slartgp_(real *, real *, real *, real *, real + extern /* Subroutine */ void slartgp_(real *, real *, real *, real *, real *); @@ -657,7 +657,7 @@ he bidiagonal SVD problem. */ slartgp_(&w, &z__, sn, cs, &r__); - return 0; + return; /* End SLARTGS */ diff --git a/lapack-netlib/SRC/slartv.c b/lapack-netlib/SRC/slartv.c index f5c707c1f3..75306c8885 100644 --- a/lapack-netlib/SRC/slartv.c +++ b/lapack-netlib/SRC/slartv.c @@ -618,7 +618,7 @@ f"> */ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slartv_(integer *n, real *x, integer *incx, real *y, +/* Subroutine */ void slartv_(integer *n, real *x, integer *incx, real *y, integer *incy, real *c__, real *s, integer *incc) { /* System generated locals */ @@ -659,7 +659,7 @@ f"> */ ic += *incc; /* L10: */ } - return 0; + return; /* End of SLARTV */ diff --git a/lapack-netlib/SRC/slaruv.c b/lapack-netlib/SRC/slaruv.c index 80b1d85569..b7101f0fd3 100644 --- a/lapack-netlib/SRC/slaruv.c +++ b/lapack-netlib/SRC/slaruv.c @@ -604,7 +604,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slaruv_(integer *iseed, integer *n, real *x) +/* Subroutine */ void slaruv_(integer *iseed, integer *n, real *x) { /* Initialized data */ @@ -727,7 +727,7 @@ f"> */ iseed[2] = it2; iseed[3] = it3; iseed[4] = it4; - return 0; + return; /* End of SLARUV */ diff 
--git a/lapack-netlib/SRC/slarz.c b/lapack-netlib/SRC/slarz.c index 4ea8f907c7..d089fdb3e2 100644 --- a/lapack-netlib/SRC/slarz.c +++ b/lapack-netlib/SRC/slarz.c @@ -659,7 +659,7 @@ static real c_b5 = 1.f; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slarz_(char *side, integer *m, integer *n, integer *l, +/* Subroutine */ void slarz_(char *side, integer *m, integer *n, integer *l, real *v, integer *incv, real *tau, real *c__, integer *ldc, real * work) { @@ -668,10 +668,10 @@ static real c_b5 = 1.f; real r__1; /* Local variables */ - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *); @@ -752,7 +752,7 @@ static real c_b5 = 1.f; } - return 0; + return; /* End of SLARZ */ diff --git a/lapack-netlib/SRC/slarzb.c b/lapack-netlib/SRC/slarzb.c index 02ff0bbe5e..b0e29fd26a 100644 --- a/lapack-netlib/SRC/slarzb.c +++ b/lapack-netlib/SRC/slarzb.c @@ -697,7 +697,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slarzb_(char *side, char *trans, char *direct, char * +/* Subroutine */ void slarzb_(char *side, char *trans, char *direct, char * storev, integer *m, integer *n, integer *k, integer *l, real *v, integer *ldv, real *t, integer *ldt, real *c__, integer *ldc, real * work, integer *ldwork) @@ -709,12 +709,13 @@ f"> */ /* Local variables */ integer info, i__, j; extern logical lsame_(char *, 
char *); - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *), strmm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); char transt[1]; @@ -745,7 +746,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0) { - return 0; + return; } /* Check for currently supported options */ @@ -759,7 +760,7 @@ f"> */ if (info != 0) { i__1 = -info; xerbla_("SLARZB", &i__1, (ftnlen)6); - return 0; + return; } if (lsame_(trans, "N")) { @@ -865,7 +866,7 @@ f"> */ } - return 0; + return; /* End of SLARZB */ diff --git a/lapack-netlib/SRC/slarzt.c b/lapack-netlib/SRC/slarzt.c index 5459dc8a2c..3c4ba98e5c 100644 --- a/lapack-netlib/SRC/slarzt.c +++ b/lapack-netlib/SRC/slarzt.c @@ -699,7 +699,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slarzt_(char *direct, char *storev, integer *n, integer * +/* Subroutine */ void slarzt_(char *direct, char *storev, integer *n, integer * k, real *v, integer *ldv, real *tau, real *t, integer *ldt) { /* System generated locals */ @@ -709,9 +709,10 @@ f"> */ /* Local variables */ integer info, i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), strmv_(char *, char *, char *, integer *, real *, - integer *, real *, integer *), xerbla_( char *, integer *, ftnlen); + integer *, real *, integer *); + extern int xerbla_( char *, integer *, ftnlen); /* -- LAPACK computational 
routine (version 3.7.0) -- */ @@ -744,7 +745,7 @@ f"> */ if (info != 0) { i__1 = -info; xerbla_("SLARZT", &i__1, (ftnlen)6); - return 0; + return; } for (i__ = *k; i__ >= 1; --i__) { @@ -782,7 +783,7 @@ f"> */ } /* L20: */ } - return 0; + return; /* End of SLARZT */ diff --git a/lapack-netlib/SRC/slas2.c b/lapack-netlib/SRC/slas2.c index 1d750d26b3..d43738d6d3 100644 --- a/lapack-netlib/SRC/slas2.c +++ b/lapack-netlib/SRC/slas2.c @@ -618,7 +618,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slas2_(real *f, real *g, real *h__, real *ssmin, real * +/* Subroutine */ void slas2_(real *f, real *g, real *h__, real *ssmin, real * ssmax) { /* System generated locals */ @@ -686,7 +686,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } } - return 0; + return; /* End of SLAS2 */ diff --git a/lapack-netlib/SRC/slascl.c b/lapack-netlib/SRC/slascl.c index aecbca67f7..82c46209a7 100644 --- a/lapack-netlib/SRC/slascl.c +++ b/lapack-netlib/SRC/slascl.c @@ -652,7 +652,7 @@ f"> */ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slascl_(char *type__, integer *kl, integer *ku, real * +/* Subroutine */ void slascl_(char *type__, integer *kl, integer *ku, real * cfrom, real *cto, integer *m, integer *n, real *a, integer *lda, integer *info) { @@ -744,13 +744,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLASCL", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *m == 0) { - return 0; + return; } /* Get machine parameters */ @@ -913,7 +913,7 @@ f"> */ goto L10; } - return 0; + return; /* End of SLASCL */ diff --git a/lapack-netlib/SRC/slascl.f b/lapack-netlib/SRC/slascl.f index e1cb420ea9..28cbd6514b 100644 --- a/lapack-netlib/SRC/slascl.f +++ 
b/lapack-netlib/SRC/slascl.f @@ -272,6 +272,8 @@ SUBROUTINE SLASCL( TYPE, KL, KU, CFROM, CTO, M, N, A, LDA, INFO ) ELSE MUL = CTOC / CFROMC DONE = .TRUE. + IF (MUL .EQ. ONE) + $ RETURN END IF END IF * diff --git a/lapack-netlib/SRC/slascl2.c b/lapack-netlib/SRC/slascl2.c index 98f34e13cf..9e17b200c8 100644 --- a/lapack-netlib/SRC/slascl2.c +++ b/lapack-netlib/SRC/slascl2.c @@ -599,7 +599,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int slascl2_(integer *m, integer *n, real *d__, real *x, +/* Subroutine */ void slascl2_(integer *m, integer *n, real *d__, real *x, integer *ldx) { /* System generated locals */ @@ -632,6 +632,6 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ x[i__ + j * x_dim1] *= d__[i__]; } } - return 0; + return; } /* slascl2_ */ diff --git a/lapack-netlib/SRC/slascl2.f b/lapack-netlib/SRC/slascl2.f index 07b506a8c1..5efc1cfcd2 100644 --- a/lapack-netlib/SRC/slascl2.f +++ b/lapack-netlib/SRC/slascl2.f @@ -1,4 +1,4 @@ -*> \brief \b SLASCL2 performs diagonal scaling on a vector. +*> \brief \b SLASCL2 performs diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -33,7 +33,7 @@ *> *> \verbatim *> -*> SLASCL2 performs a diagonal scaling on a vector: +*> SLASCL2 performs a diagonal scaling on a matrix: *> x <-- D * x *> where the diagonal matrix D is stored as a vector. *> @@ -65,14 +65,14 @@ *> \param[in,out] X *> \verbatim *> X is REAL array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. 
*> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/slasd0.c b/lapack-netlib/SRC/slasd0.c index aa553579e4..a4e9c87487 100644 --- a/lapack-netlib/SRC/slasd0.c +++ b/lapack-netlib/SRC/slasd0.c @@ -664,7 +664,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slasd0_(integer *n, integer *sqre, real *d__, real *e, +/* Subroutine */ void slasd0_(integer *n, integer *sqre, real *d__, real *e, real *u, integer *ldu, real *vt, integer *ldvt, integer *smlsiz, integer *iwork, real *work, integer *info) { @@ -676,11 +676,12 @@ f"> */ integer idxq, nlvl, i__, j, m; real alpha; integer inode, ndiml, idxqc, ndimr, itemp, sqrei, i1; - extern /* Subroutine */ int slasd1_(integer *, integer *, integer *, real + extern /* Subroutine */ void slasd1_(integer *, integer *, integer *, real *, real *, real *, real *, integer *, real *, integer *, integer * , integer *, real *, integer *); integer ic, lf, nd, ll, nl, nr; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slasdq_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slasdq_( char *, integer *, integer *, integer *, integer *, integer *, real *, real *, real *, integer *, real *, integer *, real *, integer *, real *, integer *), slasdt_(integer *, integer @@ -732,7 +733,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLASD0", &i__1, (ftnlen)6); - return 0; + return; } /* If the input matrix is too small, call SLASDQ to find the SVD. */ @@ -740,7 +741,7 @@ f"> */ if (*n <= *smlsiz) { slasdq_("U", sqre, n, &m, n, &c__0, &d__[1], &e[1], &vt[vt_offset], ldvt, &u[u_offset], ldu, &u[u_offset], ldu, &work[1], info); - return 0; + return; } /* Set up the computation tree. 
*/ @@ -780,7 +781,7 @@ f"> */ nlf + nlf * vt_dim1], ldvt, &u[nlf + nlf * u_dim1], ldu, &u[ nlf + nlf * u_dim1], ldu, &work[1], info); if (*info != 0) { - return 0; + return; } itemp = idxq + nlf - 2; i__2 = nl; @@ -798,7 +799,7 @@ f"> */ nrf + nrf * vt_dim1], ldvt, &u[nrf + nrf * u_dim1], ldu, &u[ nrf + nrf * u_dim1], ldu, &work[1], info); if (*info != 0) { - return 0; + return; } itemp = idxq + ic; i__2 = nr; @@ -821,7 +822,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; @@ -846,14 +847,14 @@ f"> */ /* Report the possible convergence failure. */ if (*info != 0) { - return 0; + return; } /* L40: */ } /* L50: */ } - return 0; + return; /* End of SLASD0 */ diff --git a/lapack-netlib/SRC/slasd1.c b/lapack-netlib/SRC/slasd1.c index 4ff9516bee..a781d7aeb2 100644 --- a/lapack-netlib/SRC/slasd1.c +++ b/lapack-netlib/SRC/slasd1.c @@ -720,7 +720,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slasd1_(integer *nl, integer *nr, integer *sqre, real * +/* Subroutine */ void slasd1_(integer *nl, integer *nr, integer *sqre, real * d__, real *alpha, real *beta, real *u, integer *ldu, real *vt, integer *ldvt, integer *idxq, integer *iwork, real *work, integer * info) @@ -731,7 +731,7 @@ f"> */ /* Local variables */ integer idxc, idxp, ldvt2, i__, k, m, n, n1, n2; - extern /* Subroutine */ int slasd2_(integer *, integer *, integer *, + extern /* Subroutine */ void slasd2_(integer *, integer *, integer *, integer *, real *, real *, real *, real *, real *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *, integer *, integer *, integer *, integer *, integer *), @@ -740,7 +740,8 @@ f"> */ , integer *, real *, integer *, integer *, integer *, real *, integer *); integer iq, iz, isigma; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slascl_( + 
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slascl_( char *, integer *, integer *, real *, real *, integer *, integer * , real *, integer *, integer *), slamrg_(integer *, integer *, real *, integer *, integer *, integer *); @@ -785,7 +786,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLASD1", &i__1, (ftnlen)6); - return 0; + return; } n = *nl + *nr + 1; @@ -843,7 +844,7 @@ f"> */ /* Report the possible convergence failure. */ if (*info != 0) { - return 0; + return; } /* Unscale. */ @@ -856,7 +857,7 @@ f"> */ n2 = n - k; slamrg_(&n1, &n2, &d__[1], &c__1, &c_n1, &idxq[1]); - return 0; + return; /* End of SLASD1 */ diff --git a/lapack-netlib/SRC/slasd2.c b/lapack-netlib/SRC/slasd2.c index 0a5ee19d93..4263b3bd7c 100644 --- a/lapack-netlib/SRC/slasd2.c +++ b/lapack-netlib/SRC/slasd2.c @@ -782,7 +782,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slasd2_(integer *nl, integer *nr, integer *sqre, integer +/* Subroutine */ void slasd2_(integer *nl, integer *nr, integer *sqre, integer *k, real *d__, real *z__, real *alpha, real *beta, real *u, integer * ldu, real *vt, integer *ldvt, real *dsigma, real *u2, integer *ldu2, real *vt2, integer *ldvt2, integer *idxp, integer *idx, integer *idxc, @@ -795,22 +795,23 @@ f"> */ /* Local variables */ integer idxi, idxj, ctot[4]; - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); real c__; integer i__, j, m, n; real s; integer idxjp, jprev, k2; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); real z1; extern real slapy2_(real *, real *); integer ct, jp; extern real slamch_(char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), 
slamrg_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slamrg_( integer *, integer *, real *, integer *, integer *, integer *); real hlftol; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *); real eps, tau, tol; @@ -876,7 +877,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLASD2", &i__1, (ftnlen)6); - return 0; + return; } nlp1 = *nl + 1; @@ -1203,7 +1204,7 @@ f"> */ /* L190: */ } - return 0; + return; /* End of SLASD2 */ diff --git a/lapack-netlib/SRC/slasd3.c b/lapack-netlib/SRC/slasd3.c index a465e2cf66..000b7198ea 100644 --- a/lapack-netlib/SRC/slasd3.c +++ b/lapack-netlib/SRC/slasd3.c @@ -739,7 +739,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slasd3_(integer *nl, integer *nr, integer *sqre, integer +/* Subroutine */ void slasd3_(integer *nl, integer *nr, integer *sqre, integer *k, real *d__, real *q, integer *ldq, real *dsigma, real *u, integer * ldu, real *u2, integer *ldu2, real *vt, integer *ldvt, real *vt2, integer *ldvt2, integer *idxc, integer *ctot, real *z__, integer * @@ -754,17 +754,18 @@ f"> */ real temp; extern real snrm2_(integer *, real *, integer *); integer i__, j, m, n, ctemp; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer ktemp; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); extern real slamc3_(real *, real *); - extern /* Subroutine */ int slasd4_(integer *, integer *, real *, 
real *, + extern /* Subroutine */ void slasd4_(integer *, integer *, real *, real *, real *, real *, real *, real *, integer *); integer jc; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slascl_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slascl_( char *, integer *, integer *, real *, real *, integer *, integer * , real *, integer *, integer *), slacpy_(char *, integer * , integer *, real *, integer *, real *, integer *); @@ -837,7 +838,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLASD3", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -854,7 +855,7 @@ f"> */ /* L10: */ } } - return 0; + return; } /* Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can */ @@ -900,7 +901,7 @@ f"> */ /* If the zero finder fails, report the convergence failure. */ if (*info != 0) { - return 0; + return; } /* L30: */ } @@ -1003,7 +1004,7 @@ f"> */ if (*k == 2) { sgemm_("N", "N", k, &m, k, &c_b13, &q[q_offset], ldq, &vt2[vt2_offset] , ldvt2, &c_b26, &vt[vt_offset], ldvt); - return 0; + return; } ktemp = ctot[1] + 1; sgemm_("N", "N", k, &nlp1, &ktemp, &c_b13, &q[q_dim1 + 1], ldq, &vt2[ @@ -1034,7 +1035,7 @@ f"> */ vt2[ktemp + nlp2 * vt2_dim1], ldvt2, &c_b26, &vt[nlp2 * vt_dim1 + 1], ldvt); - return 0; + return; /* End of SLASD3 */ diff --git a/lapack-netlib/SRC/slasd4.c b/lapack-netlib/SRC/slasd4.c index 3c4df7de5a..265a16f137 100644 --- a/lapack-netlib/SRC/slasd4.c +++ b/lapack-netlib/SRC/slasd4.c @@ -663,7 +663,7 @@ f"> */ /* > at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slasd4_(integer *n, integer *i__, real *d__, real *z__, +/* Subroutine */ void slasd4_(integer *n, integer *i__, real *d__, real *z__, real *delta, real *rho, real *sigma, real *work, integer *info) { /* System generated locals */ @@ -680,10 +680,10 @@ f"> */ real dtisq; logical swtch; real dtnsq; - extern /* Subroutine */ int 
slaed6_(integer *, logical *, real *, real *, + extern /* Subroutine */ void slaed6_(integer *, logical *, real *, real *, real *, real *, real *, integer *); real delsq2; - extern /* Subroutine */ int slasd5_(integer *, real *, real *, real *, + extern /* Subroutine */ void slasd5_(integer *, real *, real *, real *, real *, real *, real *); real dd[3], dtnsq1; logical swtch3; @@ -729,11 +729,11 @@ f"> */ *sigma = sqrt(d__[1] * d__[1] + *rho * z__[1] * z__[1]); delta[1] = 1.f; work[1] = 1.f; - return 0; + return; } if (*n == 2) { slasd5_(i__, &d__[1], &z__[1], &delta[1], rho, sigma, &work[1]); - return 0; + return; } /* Compute machine epsilon */ @@ -1663,7 +1663,7 @@ f"> */ } L240: - return 0; + return; /* End of SLASD4 */ diff --git a/lapack-netlib/SRC/slasd5.c b/lapack-netlib/SRC/slasd5.c index 39bc44eb57..aabfdc73f5 100644 --- a/lapack-netlib/SRC/slasd5.c +++ b/lapack-netlib/SRC/slasd5.c @@ -626,7 +626,7 @@ f"> */ /* > at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slasd5_(integer *i__, real *d__, real *z__, real *delta, +/* Subroutine */ void slasd5_(integer *i__, real *d__, real *z__, real *delta, real *rho, real *dsigma, real *work) { /* System generated locals */ @@ -733,7 +733,7 @@ f"> */ /* DELTA( 1 ) = DELTA( 1 ) / TEMP */ /* DELTA( 2 ) = DELTA( 2 ) / TEMP */ } - return 0; + return; /* End of SLASD5 */ diff --git a/lapack-netlib/SRC/slasd6.c b/lapack-netlib/SRC/slasd6.c index cdbdda3147..3877d7204a 100644 --- a/lapack-netlib/SRC/slasd6.c +++ b/lapack-netlib/SRC/slasd6.c @@ -827,7 +827,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slasd6_(integer *icompq, integer *nl, integer *nr, +/* Subroutine */ void slasd6_(integer *icompq, integer *nl, integer *nr, integer *sqre, real *d__, real *vf, real *vl, real *alpha, real *beta, integer *idxq, integer *perm, integer *givptr, 
integer *givcol, integer *ldgcol, real *givnum, integer *ldgnum, real *poles, real * @@ -841,10 +841,10 @@ f"> */ /* Local variables */ integer idxc, idxp, ivfw, ivlw, i__, m, n; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); integer n1, n2; - extern /* Subroutine */ int slasd7_(integer *, integer *, integer *, + extern /* Subroutine */ void slasd7_(integer *, integer *, integer *, integer *, integer *, real *, real *, real *, real *, real *, real *, real *, real *, real *, real *, integer *, integer *, integer *, integer *, integer *, integer *, integer *, real *, @@ -852,7 +852,8 @@ f"> */ *, real *, real *, real *, real *, real *, real *, integer *, real *, real *, integer *); integer iw, isigma; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slascl_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slascl_( char *, integer *, integer *, real *, real *, integer *, integer * , real *, integer *, integer *), slamrg_(integer *, integer *, real *, integer *, integer *, integer *); @@ -913,7 +914,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLASD6", &i__1, (ftnlen)6); - return 0; + return; } /* The following values are for bookkeeping purposes only. They are */ @@ -962,7 +963,7 @@ f"> */ /* Report the possible convergence failure. */ if (*info != 0) { - return 0; + return; } /* Save the poles if ICOMPQ = 1. 
*/ @@ -982,7 +983,7 @@ f"> */ n2 = n - *k; slamrg_(&n1, &n2, &d__[1], &c__1, &c_n1, &idxq[1]); - return 0; + return; /* End of SLASD6 */ diff --git a/lapack-netlib/SRC/slasd7.c b/lapack-netlib/SRC/slasd7.c index 243f989255..073ef3438f 100644 --- a/lapack-netlib/SRC/slasd7.c +++ b/lapack-netlib/SRC/slasd7.c @@ -791,7 +791,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slasd7_(integer *icompq, integer *nl, integer *nr, +/* Subroutine */ void slasd7_(integer *icompq, integer *nl, integer *nr, integer *sqre, integer *k, real *d__, real *z__, real *zw, real *vf, real *vfw, real *vl, real *vlw, real *alpha, real *beta, real *dsigma, integer *idx, integer *idxp, integer *idxq, integer *perm, integer * @@ -804,16 +804,17 @@ f"> */ /* Local variables */ integer idxi, idxj; - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); integer i__, j, m, n, idxjp, jprev, k2; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); real z1; extern real slapy2_(real *, real *); integer jp; extern real slamch_(char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slamrg_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slamrg_( integer *, integer *, real *, integer *, integer *, integer *); real hlftol, eps, tau, tol; integer nlp1, nlp2; @@ -872,7 +873,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLASD7", &i__1, (ftnlen)6); - return 0; + return; } nlp1 = *nl + 1; @@ -1127,7 +1128,7 @@ f"> */ i__1 = n - 1; scopy_(&i__1, &vlw[2], &c__1, &vl[2], &c__1); - return 0; + return; /* End of SLASD7 */ diff --git a/lapack-netlib/SRC/slasd8.c b/lapack-netlib/SRC/slasd8.c index 4f556a8f94..630d5a1dd7 100644 
--- a/lapack-netlib/SRC/slasd8.c +++ b/lapack-netlib/SRC/slasd8.c @@ -681,7 +681,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slasd8_(integer *icompq, integer *k, real *d__, real * +/* Subroutine */ void slasd8_(integer *icompq, integer *k, real *d__, real * z__, real *vf, real *vl, real *difl, real *difr, integer *lddifr, real *dsigma, real *work, integer *info) { @@ -696,15 +696,15 @@ f"> */ extern real snrm2_(integer *, real *, integer *); integer i__, j; real diflj, difrj, dsigj; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); extern real slamc3_(real *, real *); - extern /* Subroutine */ int slasd4_(integer *, integer *, real *, real *, + extern /* Subroutine */ void slasd4_(integer *, integer *, real *, real *, real *, real *, real *, real *, integer *); real dj; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real dsigjp; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *); real rho; @@ -747,7 +747,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLASD8", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -759,7 +759,7 @@ f"> */ difl[2] = 1.f; difr[(difr_dim1 << 1) + 1] = 1.f; } - return 0; + return; } /* Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can */ @@ -814,7 +814,7 @@ f"> */ /* If the root finder fails, report the convergence failure. 
*/ if (*info != 0) { - return 0; + return; } work[iwk3i + j] = work[iwk3i + j] * work[j] * work[iwk2i + j]; difl[j] = -work[j]; @@ -881,7 +881,7 @@ f"> */ scopy_(k, &work[iwk2], &c__1, &vf[1], &c__1); scopy_(k, &work[iwk3], &c__1, &vl[1], &c__1); - return 0; + return; /* End of SLASD8 */ diff --git a/lapack-netlib/SRC/slasda.c b/lapack-netlib/SRC/slasda.c index 71424c3f14..01924a748b 100644 --- a/lapack-netlib/SRC/slasda.c +++ b/lapack-netlib/SRC/slasda.c @@ -789,7 +789,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slasda_(integer *icompq, integer *smlsiz, integer *n, +/* Subroutine */ void slasda_(integer *icompq, integer *smlsiz, integer *n, integer *sqre, real *d__, real *e, real *u, integer *ldu, real *vt, integer *k, real *difl, real *difr, real *z__, real *poles, integer * givptr, integer *givcol, integer *ldgcol, integer *perm, real *givnum, @@ -806,14 +806,15 @@ f"> */ integer idxq, nlvl, i__, j, m; real alpha; integer inode, ndiml, ndimr, idxqi, itemp, sqrei, i1; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), slasd6_(integer *, integer *, integer *, integer *, real *, real *, real *, real *, real *, integer *, integer *, integer *, integer *, integer *, real *, integer *, real *, real * , real *, real *, integer *, real *, real *, real *, integer *, integer *); integer ic, nwork1, lf, nd, nwork2, ll, nl, vf, nr, vl; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slasdq_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slasdq_( char *, integer *, integer *, integer *, integer *, integer *, real *, real *, real *, integer *, real *, integer *, real *, integer *, real *, integer *), slasdt_(integer *, integer @@ -891,7 +892,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLASDA", 
&i__1, (ftnlen)6); - return 0; + return; } m = *n + *sqre; @@ -908,7 +909,7 @@ f"> */ , ldu, &u[u_offset], ldu, &u[u_offset], ldu, &work[1], info); } - return 0; + return; } /* Book-keeping and set up the computation tree. */ @@ -975,7 +976,7 @@ f"> */ ; } if (*info != 0) { - return 0; + return; } i__2 = nl; for (j = 1; j <= i__2; ++j) { @@ -1011,7 +1012,7 @@ f"> */ ; } if (*info != 0) { - return 0; + return; } i__2 = nr; for (j = 1; j <= i__2; ++j) { @@ -1023,7 +1024,7 @@ f"> */ /* Now conquer each subproblem bottom-up. */ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); for (lvl = nlvl; lvl >= 1; --lvl) { lvl2 = (lvl << 1) - 1; @@ -1035,7 +1036,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; @@ -1076,14 +1077,14 @@ f"> */ &s[j], &work[nwork1], &iwork[iwk], info); } if (*info != 0) { - return 0; + return; } /* L40: */ } /* L50: */ } - return 0; + return; /* End of SLASDA */ diff --git a/lapack-netlib/SRC/slasdq.c b/lapack-netlib/SRC/slasdq.c index bbc278fdd3..61cd7d763b 100644 --- a/lapack-netlib/SRC/slasdq.c +++ b/lapack-netlib/SRC/slasdq.c @@ -724,7 +724,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slasdq_(char *uplo, integer *sqre, integer *n, integer * +/* Subroutine */ void slasdq_(char *uplo, integer *sqre, integer *n, integer * ncvt, integer *nru, integer *ncc, real *d__, real *e, real *vt, integer *ldvt, real *u, integer *ldu, real *c__, integer *ldc, real * work, integer *info) @@ -739,16 +739,17 @@ f"> */ integer sqre1, i__, j; real r__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int slasr_(char *, char *, char *, integer *, + extern /* Subroutine */ void slasr_(char *, char *, char *, integer *, integer *, real *, real *, real *, integer *); integer iuplo; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + 
extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *); real cs, sn; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slartg_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slartg_( real *, real *, real *, real *, real *); logical rotate; - extern /* Subroutine */ int sbdsqr_(char *, integer *, integer *, integer + extern /* Subroutine */ void sbdsqr_(char *, integer *, integer *, integer *, integer *, real *, real *, real *, integer *, real *, integer * , real *, integer *, real *, integer *); integer np1; @@ -810,10 +811,10 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLASDQ", &i__1, (ftnlen)6); - return 0; + return; } if (*n == 0) { - return 0; + return; } /* ROTATE is true if any singular vectors desired, false otherwise */ @@ -953,7 +954,7 @@ f"> */ /* L40: */ } - return 0; + return; /* End of SLASDQ */ diff --git a/lapack-netlib/SRC/slasdt.c b/lapack-netlib/SRC/slasdt.c index 6f3ba7e87d..b486206b67 100644 --- a/lapack-netlib/SRC/slasdt.c +++ b/lapack-netlib/SRC/slasdt.c @@ -614,7 +614,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slasdt_(integer *n, integer *lvl, integer *nd, integer * +/* Subroutine */ void slasdt_(integer *n, integer *lvl, integer *nd, integer * inode, integer *ndiml, integer *ndimr, integer *msub) { /* System generated locals */ @@ -678,7 +678,7 @@ f"> */ } *nd = (llst << 1) - 1; - return 0; + return; /* End of SLASDT */ diff --git a/lapack-netlib/SRC/slaset.c b/lapack-netlib/SRC/slaset.c index 573fa85f88..7eb9c99393 100644 --- a/lapack-netlib/SRC/slaset.c +++ b/lapack-netlib/SRC/slaset.c @@ -620,7 +620,7 @@ f"> */ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slaset_(char *uplo, integer *m, integer *n, real *alpha, +/* Subroutine */ void slaset_(char *uplo, 
integer *m, integer *n, real *alpha, real *beta, real *a, integer *lda) { /* System generated locals */ @@ -701,7 +701,7 @@ f"> */ /* L70: */ } - return 0; + return; /* End of SLASET */ diff --git a/lapack-netlib/SRC/slasq1.c b/lapack-netlib/SRC/slasq1.c index 66ca78f8bb..4489c3f96e 100644 --- a/lapack-netlib/SRC/slasq1.c +++ b/lapack-netlib/SRC/slasq1.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int slasq1_(integer *n, real *d__, real *e, real *work, +/* Subroutine */ void slasq1_(integer *n, real *d__, real *e, real *work, integer *info) { /* System generated locals */ @@ -631,17 +631,18 @@ f"> */ real r__1, r__2, r__3; /* Local variables */ - extern /* Subroutine */ int slas2_(real *, real *, real *, real *, real *) + extern /* Subroutine */ void slas2_(real *, real *, real *, real *, real *) ; integer i__; real scale; integer iinfo; real sigmn, sigmx; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), slasq2_(integer *, real *, integer *); extern real slamch_(char *); real safmin; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slascl_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slascl_( char *, integer *, integer *, real *, real *, integer *, integer * , real *, integer *, integer *), slasrt_(char *, integer * , real *, integer *); @@ -668,17 +669,17 @@ f"> */ *info = -1; i__1 = -(*info); xerbla_("SLASQ1", &i__1, (ftnlen)6); - return 0; + return; } else if (*n == 0) { - return 0; + return; } else if (*n == 1) { d__[1] = abs(d__[1]); - return 0; + return; } else if (*n == 2) { slas2_(&d__[1], &e[1], &d__[2], &sigmn, &sigmx); d__[1] = sigmx; d__[2] = sigmn; - return 0; + return; } /* Estimate the largest singular value. 
*/ @@ -698,7 +699,7 @@ f"> */ if (sigmx == 0.f) { slasrt_("D", n, &d__[1], &iinfo); - return 0; + return; } i__1 = *n; @@ -759,7 +760,7 @@ f"> */ slascl_("G", &c__0, &c__0, &scale, &sigmx, n, &c__1, &e[1], n, &iinfo); } - return 0; + return; /* End of SLASQ1 */ diff --git a/lapack-netlib/SRC/slasq2.c b/lapack-netlib/SRC/slasq2.c index d3949a694e..98c53d6855 100644 --- a/lapack-netlib/SRC/slasq2.c +++ b/lapack-netlib/SRC/slasq2.c @@ -627,7 +627,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slasq2_(integer *n, real *z__, integer *info) +/* Subroutine */ void slasq2_(integer *n, real *z__, integer *info) { /* System generated locals */ integer i__1, i__2, i__3; @@ -648,7 +648,7 @@ f"> */ integer iinfo; real tempe, tempq; integer i0, i1, i4, n0, n1, ttype; - extern /* Subroutine */ int slasq3_(integer *, integer *, real *, integer + extern /* Subroutine */ void slasq3_(integer *, integer *, real *, integer *, real *, real *, real *, real *, integer *, integer *, integer * , logical *, integer *, real *, real *, real *, real *, real *, real *, real *); @@ -660,7 +660,7 @@ f"> */ real oldemn, safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real dn1, dn2; - extern /* Subroutine */ int slasrt_(char *, integer *, real *, integer *); + extern /* Subroutine */ void slasrt_(char *, integer *, real *, integer *); real dee, eps, tau, tol; integer ipn4; real tol2; @@ -693,9 +693,9 @@ f"> */ if (*n < 0) { *info = -1; xerbla_("SLASQ2", &c__1, (ftnlen)6); - return 0; + return; } else if (*n == 0) { - return 0; + return; } else if (*n == 1) { /* 1-by-1 case. */ @@ -704,7 +704,7 @@ f"> */ *info = -201; xerbla_("SLASQ2", &c__2, (ftnlen)6); } - return 0; + return; } else if (*n == 2) { /* 2-by-2 case. 
*/ @@ -712,15 +712,15 @@ f"> */ if (z__[1] < 0.f) { *info = -201; xerbla_("SLASQ2", &c__2, (ftnlen)6); - return 0; + return; } else if (z__[2] < 0.f) { *info = -202; xerbla_("SLASQ2", &c__2, (ftnlen)6); - return 0; + return; } else if (z__[3] < 0.f) { *info = -203; xerbla_("SLASQ2", &c__2, (ftnlen)6); - return 0; + return; } else if (z__[3] > z__[1]) { d__ = z__[3]; z__[3] = z__[1]; @@ -741,7 +741,7 @@ f"> */ } z__[2] = z__[3]; z__[6] = z__[2] + z__[1]; - return 0; + return; } /* Check for negative data and compute sums of q's and e's. */ @@ -758,11 +758,11 @@ f"> */ if (z__[k] < 0.f) { *info = -(k + 200); xerbla_("SLASQ2", &c__2, (ftnlen)6); - return 0; + return; } else if (z__[k + 1] < 0.f) { *info = -(k + 201); xerbla_("SLASQ2", &c__2, (ftnlen)6); - return 0; + return; } d__ += z__[k]; e += z__[k + 1]; @@ -780,7 +780,7 @@ f"> */ if (z__[(*n << 1) - 1] < 0.f) { *info = -((*n << 1) + 199); xerbla_("SLASQ2", &c__2, (ftnlen)6); - return 0; + return; } d__ += z__[(*n << 1) - 1]; /* Computing MAX */ @@ -798,7 +798,7 @@ f"> */ } slasrt_("D", n, &z__[1], &iinfo); z__[(*n << 1) - 1] = d__; - return 0; + return; } trace = d__ + e; @@ -807,7 +807,7 @@ f"> */ if (trace == 0.f) { z__[(*n << 1) - 1] = 0.f; - return 0; + return; } /* Check whether the machine is IEEE conformable. 
*/ @@ -947,7 +947,7 @@ f"> */ } if (sigma < 0.f) { *info = 1; - return 0; + return; } /* Find last unreduced submatrix's top index I0, find QMAX and */ @@ -1136,7 +1136,7 @@ f"> */ z__[k * 2] = 0.f; } } - return 0; + return; /* end IWHILB */ @@ -1147,7 +1147,7 @@ f"> */ } *info = 3; - return 0; + return; /* end IWHILA */ @@ -1180,7 +1180,7 @@ f"> */ i__1 = *n; z__[(*n << 1) + 4] = (real) ndiv / (real) (i__1 * i__1); z__[(*n << 1) + 5] = nfail * 100.f / (real) iter; - return 0; + return; /* End of SLASQ2 */ diff --git a/lapack-netlib/SRC/slasq3.c b/lapack-netlib/SRC/slasq3.c index 38f186f3d1..12688de07c 100644 --- a/lapack-netlib/SRC/slasq3.c +++ b/lapack-netlib/SRC/slasq3.c @@ -689,7 +689,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int slasq3_(integer *i0, integer *n0, real *z__, integer *pp, +/* Subroutine */ void slasq3_(integer *i0, integer *n0, real *z__, integer *pp, real *dmin__, real *sigma, real *desig, real *qmax, integer *nfail, integer *iter, integer *ndiv, logical *ieee, integer *ttype, real * dmin1, real *dmin2, real *dn, real *dn1, real *dn2, real *g, real * @@ -702,7 +702,7 @@ f"> */ /* Local variables */ real temp, s, t; integer j4; - extern /* Subroutine */ int slasq4_(integer *, integer *, real *, integer + extern /* Subroutine */ void slasq4_(integer *, integer *, real *, integer *, integer *, real *, real *, real *, real *, real *, real *, real *, integer *, real *), slasq5_(integer *, integer *, real *, integer *, real *, real *, real *, real *, real *, real *, real *, @@ -741,7 +741,7 @@ f"> */ L10: if (*n0 < *i0) { - return 0; + return; } if (*n0 == *i0) { goto L20; @@ -937,7 +937,7 @@ f"> */ } *sigma = t; - return 0; + return; /* End of SLASQ3 */ diff --git a/lapack-netlib/SRC/slasq4.c b/lapack-netlib/SRC/slasq4.c index 8c3722fc25..73f5ca011f 100644 --- a/lapack-netlib/SRC/slasq4.c +++ b/lapack-netlib/SRC/slasq4.c @@ -660,7 +660,7 @@ f"> */ 
/* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slasq4_(integer *i0, integer *n0, real *z__, integer *pp, +/* Subroutine */ void slasq4_(integer *i0, integer *n0, real *z__, integer *pp, integer *n0in, real *dmin__, real *dmin1, real *dmin2, real *dn, real *dn1, real *dn2, real *tau, integer *ttype, real *g) { @@ -693,7 +693,7 @@ f"> */ if (*dmin__ <= 0.f) { *tau = -(*dmin__); *ttype = -1; - return 0; + return; } nn = (*n0 << 2) + *pp; @@ -746,7 +746,7 @@ f"> */ gam = *dn; a2 = 0.f; if (z__[nn - 5] > z__[nn - 7]) { - return 0; + return; } b2 = z__[nn - 5] / z__[nn - 7]; np = nn - 9; @@ -754,11 +754,11 @@ f"> */ np = nn - (*pp << 1); gam = *dn1; if (z__[np - 4] > z__[np - 2]) { - return 0; + return; } a2 = z__[np - 4] / z__[np - 2]; if (z__[nn - 9] > z__[nn - 11]) { - return 0; + return; } b2 = z__[nn - 9] / z__[nn - 11]; np = nn - 13; @@ -774,7 +774,7 @@ f"> */ } b1 = b2; if (z__[i4] > z__[i4 - 2]) { - return 0; + return; } b2 *= z__[i4] / z__[i4 - 2]; a2 += b2; @@ -806,7 +806,7 @@ f"> */ b2 = z__[np - 6]; gam = *dn2; if (z__[np - 8] > b2 || z__[np - 4] > b1) { - return 0; + return; } a2 = z__[np - 8] / b2 * (z__[np - 4] / b1 + 1.f); @@ -822,7 +822,7 @@ f"> */ } b1 = b2; if (z__[i4] > z__[i4 - 2]) { - return 0; + return; } b2 *= z__[i4] / z__[i4 - 2]; a2 += b2; @@ -864,7 +864,7 @@ f"> */ *ttype = -7; s = *dmin1 * .333f; if (z__[nn - 5] > z__[nn - 7]) { - return 0; + return; } b1 = z__[nn - 5] / z__[nn - 7]; b2 = b1; @@ -875,7 +875,7 @@ f"> */ for (i4 = (*n0 << 2) - 9 + *pp; i4 >= i__1; i4 += -4) { a2 = b1; if (z__[i4] > z__[i4 - 2]) { - return 0; + return; } b1 *= z__[i4] / z__[i4 - 2]; b2 += b1; @@ -921,7 +921,7 @@ f"> */ *ttype = -10; s = *dmin2 * .333f; if (z__[nn - 5] > z__[nn - 7]) { - return 0; + return; } b1 = z__[nn - 5] / z__[nn - 7]; b2 = b1; @@ -931,7 +931,7 @@ f"> */ i__1 = (*i0 << 2) - 1 + *pp; for (i4 = (*n0 << 2) - 9 + *pp; i4 >= i__1; i4 += -4) { if (z__[i4] > z__[i4 - 
2]) { - return 0; + return; } b1 *= z__[i4] / z__[i4 - 2]; b2 += b1; @@ -969,7 +969,7 @@ f"> */ } *tau = s; - return 0; + return; /* End of SLASQ4 */ diff --git a/lapack-netlib/SRC/slasq5.c b/lapack-netlib/SRC/slasq5.c index cd9645ac5a..1a65859c58 100644 --- a/lapack-netlib/SRC/slasq5.c +++ b/lapack-netlib/SRC/slasq5.c @@ -652,7 +652,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int slasq5_(integer *i0, integer *n0, real *z__, integer *pp, +/* Subroutine */ void slasq5_(integer *i0, integer *n0, real *z__, integer *pp, real *tau, real *sigma, real *dmin__, real *dmin1, real *dmin2, real *dn, real *dnm1, real *dnm2, logical *ieee, real *eps) { @@ -680,7 +680,7 @@ f"> */ /* Function Body */ if (*n0 - *i0 - 1 <= 0) { - return 0; + return; } dthresh = *eps * (*sigma + *tau); @@ -754,7 +754,7 @@ f"> */ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) { z__[j4 - 2] = d__ + z__[j4 - 1]; if (d__ < 0.f) { - return 0; + return; } else { z__[j4] = z__[j4 + 1] * (z__[j4 - 1] / z__[j4 - 2]); d__ = z__[j4 + 1] * (d__ / z__[j4 - 2]) - *tau; @@ -770,7 +770,7 @@ f"> */ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) { z__[j4 - 3] = d__ + z__[j4]; if (d__ < 0.f) { - return 0; + return; } else { z__[j4 - 1] = z__[j4 + 2] * (z__[j4] / z__[j4 - 3]); d__ = z__[j4 + 2] * (d__ / z__[j4 - 3]) - *tau; @@ -791,7 +791,7 @@ f"> */ j4p2 = j4 + (*pp << 1) - 1; z__[j4 - 2] = *dnm2 + z__[j4p2]; if (*dnm2 < 0.f) { - return 0; + return; } else { z__[j4] = z__[j4p2 + 2] * (z__[j4p2] / z__[j4 - 2]); *dnm1 = z__[j4p2 + 2] * (*dnm2 / z__[j4 - 2]) - *tau; @@ -803,7 +803,7 @@ f"> */ j4p2 = j4 + (*pp << 1) - 1; z__[j4 - 2] = *dnm1 + z__[j4p2]; if (*dnm1 < 0.f) { - return 0; + return; } else { z__[j4] = z__[j4p2 + 2] * (z__[j4p2] / z__[j4 - 2]); *dn = z__[j4p2 + 2] * (*dnm1 / z__[j4 - 2]) - *tau; @@ -885,7 +885,7 @@ f"> */ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) { z__[j4 - 2] = d__ + z__[j4 - 1]; if (d__ < 0.f) { - return 
0; + return; } else { z__[j4] = z__[j4 + 1] * (z__[j4 - 1] / z__[j4 - 2]); d__ = z__[j4 + 1] * (d__ / z__[j4 - 2]) - *tau; @@ -904,7 +904,7 @@ f"> */ for (j4 = *i0 << 2; j4 <= i__1; j4 += 4) { z__[j4 - 3] = d__ + z__[j4]; if (d__ < 0.f) { - return 0; + return; } else { z__[j4 - 1] = z__[j4 + 2] * (z__[j4] / z__[j4 - 3]); d__ = z__[j4 + 2] * (d__ / z__[j4 - 3]) - *tau; @@ -928,7 +928,7 @@ f"> */ j4p2 = j4 + (*pp << 1) - 1; z__[j4 - 2] = *dnm2 + z__[j4p2]; if (*dnm2 < 0.f) { - return 0; + return; } else { z__[j4] = z__[j4p2 + 2] * (z__[j4p2] / z__[j4 - 2]); *dnm1 = z__[j4p2 + 2] * (*dnm2 / z__[j4 - 2]) - *tau; @@ -940,7 +940,7 @@ f"> */ j4p2 = j4 + (*pp << 1) - 1; z__[j4 - 2] = *dnm1 + z__[j4p2]; if (*dnm1 < 0.f) { - return 0; + return; } else { z__[j4] = z__[j4p2 + 2] * (z__[j4p2] / z__[j4 - 2]); *dn = z__[j4p2 + 2] * (*dnm1 / z__[j4 - 2]) - *tau; @@ -952,7 +952,7 @@ f"> */ } z__[j4 + 2] = *dn; z__[(*n0 << 2) - *pp] = emin; - return 0; + return; /* End of SLASQ5 */ diff --git a/lapack-netlib/SRC/slasq6.c b/lapack-netlib/SRC/slasq6.c index d6dd702dd7..824ad31c8a 100644 --- a/lapack-netlib/SRC/slasq6.c +++ b/lapack-netlib/SRC/slasq6.c @@ -627,7 +627,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int slasq6_(integer *i0, integer *n0, real *z__, integer *pp, +/* Subroutine */ void slasq6_(integer *i0, integer *n0, real *z__, integer *pp, real *dmin__, real *dmin1, real *dmin2, real *dn, real *dnm1, real * dnm2) { @@ -657,7 +657,7 @@ f"> */ /* Function Body */ if (*n0 - *i0 - 1 <= 0) { - return 0; + return; } safmin = slamch_("Safe minimum"); @@ -761,7 +761,7 @@ f"> */ z__[j4 + 2] = *dn; z__[(*n0 << 2) - *pp] = emin; - return 0; + return; /* End of SLASQ6 */ diff --git a/lapack-netlib/SRC/slasr.c b/lapack-netlib/SRC/slasr.c index 0c6b871dd5..df1b434037 100644 --- a/lapack-netlib/SRC/slasr.c +++ b/lapack-netlib/SRC/slasr.c @@ -708,7 +708,7 @@ static inline void 
zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slasr_(char *side, char *pivot, char *direct, integer *m, +/* Subroutine */ void slasr_(char *side, char *pivot, char *direct, integer *m, integer *n, real *c__, real *s, real *a, integer *lda) { /* System generated locals */ @@ -760,13 +760,13 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } if (info != 0) { xerbla_("SLASR ", &info, (ftnlen)5); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } if (lsame_(side, "L")) { @@ -1002,7 +1002,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } - return 0; + return; /* End of SLASR */ diff --git a/lapack-netlib/SRC/slasrt.c b/lapack-netlib/SRC/slasrt.c index 881f140b02..8a97c26bcd 100644 --- a/lapack-netlib/SRC/slasrt.c +++ b/lapack-netlib/SRC/slasrt.c @@ -597,7 +597,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int slasrt_(char *id, integer *n, real *d__, integer *info) +/* Subroutine */ void slasrt_(char *id, integer *n, real *d__, integer *info) { /* System generated locals */ integer i__1, i__2; @@ -643,13 +643,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLASRT", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 1) { - return 0; + return; } stkpnt = 1; @@ -816,7 +816,7 @@ f"> */ if (stkpnt > 0) { goto L10; } - return 0; + return; /* End of SLASRT */ diff --git a/lapack-netlib/SRC/slassq.c b/lapack-netlib/SRC/slassq.c index d6e5ee7a55..f4a800c756 100644 --- a/lapack-netlib/SRC/slassq.c +++ b/lapack-netlib/SRC/slassq.c @@ -612,7 +612,7 @@ f"> */ /* > \ingroup OTHERauxiliary */ /* ===================================================================== */ -/* 
Subroutine */ int slassq_(integer *n, real *x, integer *incx, real *scale, +/* Subroutine */ void slassq_(integer *n, real *x, integer *incx, real *scale, real *sumsq) { /* System generated locals */ @@ -658,7 +658,7 @@ f"> */ /* L10: */ } } - return 0; + return; /* End of SLASSQ */ diff --git a/lapack-netlib/SRC/slasv2.c b/lapack-netlib/SRC/slasv2.c index 5b6addbe5c..4fee79d71e 100644 --- a/lapack-netlib/SRC/slasv2.c +++ b/lapack-netlib/SRC/slasv2.c @@ -654,7 +654,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slasv2_(real *f, real *g, real *h__, real *ssmin, real * +/* Subroutine */ void slasv2_(real *f, real *g, real *h__, real *ssmin, real * ssmax, real *snr, real *csr, real *snl, real *csl) { /* System generated locals */ @@ -825,7 +825,7 @@ f"> */ *ssmax = r_sign(ssmax, &tsign); r__1 = tsign * r_sign(&c_b4, f) * r_sign(&c_b4, h__); *ssmin = r_sign(ssmin, &r__1); - return 0; + return; /* End of SLASV2 */ diff --git a/lapack-netlib/SRC/slaswlq.c b/lapack-netlib/SRC/slaswlq.c index ee6bad4017..bc0d13cb1c 100644 --- a/lapack-netlib/SRC/slaswlq.c +++ b/lapack-netlib/SRC/slaswlq.c @@ -671,7 +671,7 @@ static integer c__0 = 0; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slaswlq_(integer *m, integer *n, integer *mb, integer * +/* Subroutine */ void slaswlq_(integer *m, integer *n, integer *mb, integer * nb, real *a, integer *lda, real *t, integer *ldt, real *work, integer *lwork, integer *info) { @@ -680,11 +680,12 @@ static integer c__0 = 0; /* Local variables */ integer i__, ii, kk; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), sgelqt_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void sgelqt_( integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *); logical lquery; - extern /* Subroutine */ int stplqt_(integer 
*, integer *, integer *, + extern /* Subroutine */ void stplqt_(integer *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer * , real *, integer *); integer ctr; @@ -737,15 +738,15 @@ static integer c__0 = 0; if (*info != 0) { i__1 = -(*info); xerbla_("SLASWLQ", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* The LQ Decomposition */ @@ -753,7 +754,7 @@ static integer c__0 = 0; if (*m >= *n || *nb <= *m || *nb >= *n) { sgelqt_(m, n, mb, &a[a_offset], lda, &t[t_offset], ldt, &work[1], info); - return 0; + return; } kk = (*n - *m) % (*nb - *m); @@ -785,7 +786,7 @@ static integer c__0 = 0; } work[1] = (real) (*m * *mb); - return 0; + return; /* End of SLASWLQ */ diff --git a/lapack-netlib/SRC/slaswp.c b/lapack-netlib/SRC/slaswp.c index 67180608a1..d84e6616f3 100644 --- a/lapack-netlib/SRC/slaswp.c +++ b/lapack-netlib/SRC/slaswp.c @@ -624,7 +624,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slaswp_(integer *n, real *a, integer *lda, integer *k1, +/* Subroutine */ void slaswp_(integer *n, real *a, integer *lda, integer *k1, integer *k2, integer *ipiv, integer *incx) { /* System generated locals */ @@ -665,7 +665,7 @@ f"> */ i2 = *k1; inc = -1; } else { - return 0; + return; } n32 = *n / 32 << 5; @@ -714,7 +714,7 @@ f"> */ } } - return 0; + return; /* End of SLASWP */ diff --git a/lapack-netlib/SRC/slasy2.c b/lapack-netlib/SRC/slasy2.c index 72ce3954d7..e227137fa5 100644 --- a/lapack-netlib/SRC/slasy2.c +++ b/lapack-netlib/SRC/slasy2.c @@ -689,7 +689,7 @@ f"> */ /* > \ingroup realSYauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slasy2_(logical *ltranl, logical *ltranr, integer *isgn, +/* Subroutine */ void slasy2_(logical *ltranl, logical *ltranr, integer *isgn, 
integer *n1, integer *n2, real *tl, integer *ldtl, real *tr, integer * ldtr, real *b, integer *ldb, real *scale, real *x, integer *ldx, real *xnorm, integer *info) @@ -715,7 +715,7 @@ f"> */ real xmax; integer ipsv, jpsv, i__, j, k; logical bswap; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ); logical xswap; @@ -758,7 +758,7 @@ f"> */ /* Quick return if possible */ if (*n1 == 0 || *n2 == 0) { - return 0; + return; } /* Set constants to control overflow */ @@ -794,7 +794,7 @@ f"> */ x[x_dim1 + 1] = b[b_dim1 + 1] * *scale / tau1; *xnorm = (r__1 = x[x_dim1 + 1], abs(r__1)); - return 0; + return; /* 1 by 2: */ /* TL11*[X11 X12] + ISGN*[X11 X12]*op[TR11 TR12] = [B11 B12] */ @@ -904,7 +904,7 @@ f"> */ , abs(r__2)); *xnorm = f2cmax(r__3,r__4); } - return 0; + return; /* 2 by 2: */ /* op[TL11 TL12]*[X11 X12] +ISGN* [X11 X12]*op[TR11 TR12] = [B11 B12] */ @@ -1047,7 +1047,7 @@ f"> */ /* Computing MAX */ r__1 = abs(tmp[0]) + abs(tmp[2]), r__2 = abs(tmp[1]) + abs(tmp[3]); *xnorm = f2cmax(r__1,r__2); - return 0; + return; /* End of SLASY2 */ diff --git a/lapack-netlib/SRC/slasyf.c b/lapack-netlib/SRC/slasyf.c index c687a3fc7f..6de8fd8f4d 100644 --- a/lapack-netlib/SRC/slasyf.c +++ b/lapack-netlib/SRC/slasyf.c @@ -692,7 +692,7 @@ f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int slasyf_(char *uplo, integer *n, integer *nb, integer *kb, +/* Subroutine */ void slasyf_(char *uplo, integer *n, integer *nb, integer *kb, real *a, integer *lda, integer *ipiv, real *w, integer *ldw, integer *info) { @@ -704,12 +704,12 @@ f"> */ integer imax, jmax, j, k; real t, alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real 
*, integer *), sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer kstep; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ); real r1, d11, d21, d22; @@ -1432,7 +1432,7 @@ f"> */ *kb = k - 1; } - return 0; + return; /* End of SLASYF */ diff --git a/lapack-netlib/SRC/slasyf_aa.c b/lapack-netlib/SRC/slasyf_aa.c index 317e549b33..22ec7510a2 100644 --- a/lapack-netlib/SRC/slasyf_aa.c +++ b/lapack-netlib/SRC/slasyf_aa.c @@ -659,7 +659,7 @@ aa.f"> */ /* > \ingroup realSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int slasyf_aa_(char *uplo, integer *j1, integer *m, integer +/* Subroutine */ void slasyf_aa_(char *uplo, integer *j1, integer *m, integer *nb, real *a, integer *lda, integer *ipiv, real *h__, integer *ldh, real *work) { @@ -670,17 +670,17 @@ aa.f"> */ integer j, k; real alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer i1, k1, i2; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ), saxpy_(integer *, real *, real *, integer *, real *, integer *) ; integer mj; extern integer isamax_(integer *, real *, integer *); - extern /* Subroutine */ int slaset_(char *, integer *, integer *, real *, + extern /* Subroutine 
*/ void slaset_(char *, integer *, integer *, real *, real *, real *, integer *); real piv; @@ -1044,7 +1044,7 @@ aa.f"> */ L40: ; } - return 0; + return; /* End of SLASYF_AA */ diff --git a/lapack-netlib/SRC/slasyf_rk.c b/lapack-netlib/SRC/slasyf_rk.c index 11a7799058..3fa967b54f 100644 --- a/lapack-netlib/SRC/slasyf_rk.c +++ b/lapack-netlib/SRC/slasyf_rk.c @@ -777,7 +777,7 @@ rk.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int slasyf_rk_(char *uplo, integer *n, integer *nb, integer +/* Subroutine */ void slasyf_rk_(char *uplo, integer *n, integer *nb, integer *kb, real *a, integer *lda, real *e, integer *ipiv, real *w, integer * ldw, integer *info) { @@ -790,16 +790,16 @@ rk.f"> */ integer imax, jmax, j, k, p; real t, alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); real sfmin; integer itemp; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer kstep; real stemp; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ); real r1, d11, d12, d21, d22; @@ -1583,7 +1583,7 @@ rk.f"> */ } - return 0; + return; /* End of SLASYF_RK */ diff --git a/lapack-netlib/SRC/slasyf_rook.c b/lapack-netlib/SRC/slasyf_rook.c index a5562f2f4a..96715c565c 100644 --- a/lapack-netlib/SRC/slasyf_rook.c +++ b/lapack-netlib/SRC/slasyf_rook.c @@ -699,7 +699,7 @@ rook.f"> */ /* > \endverbatim */ /* 
===================================================================== */ -/* Subroutine */ int slasyf_rook_(char *uplo, integer *n, integer *nb, +/* Subroutine */ void slasyf_rook_(char *uplo, integer *n, integer *nb, integer *kb, real *a, integer *lda, integer *ipiv, real *w, integer * ldw, integer *info) { @@ -712,16 +712,16 @@ rook.f"> */ integer imax, jmax, j, k, p; real t, alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); real sfmin; integer itemp; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer kstep; real stemp; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ); real r1, d11, d12, d21, d22; @@ -1518,7 +1518,7 @@ rook.f"> */ *kb = k - 1; } - return 0; + return; /* End of SLASYF_ROOK */ diff --git a/lapack-netlib/SRC/slatbs.c b/lapack-netlib/SRC/slatbs.c index c19f2c9b5d..d3cfdeac40 100644 --- a/lapack-netlib/SRC/slatbs.c +++ b/lapack-netlib/SRC/slatbs.c @@ -755,7 +755,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slatbs_(char *uplo, char *trans, char *diag, char * +/* Subroutine */ void slatbs_(char *uplo, char *trans, char *diag, char * normin, integer *n, integer *kd, real *ab, integer *ldab, real *x, real *scale, real *cnorm, integer *info) { @@ -772,12 +772,12 @@ f"> */ real xmax, grow, sumj; integer i__, j, maind; extern logical lsame_(char *, char 
*); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); real tscal, uscal; integer jlast; extern real sasum_(integer *, real *, integer *); logical upper; - extern /* Subroutine */ int stbsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void stbsv_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *); real xj; @@ -836,13 +836,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLATBS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine machine dependent parameters to control overflow. */ @@ -1412,7 +1412,7 @@ f"> */ sscal_(n, &r__1, &cnorm[1], &c__1); } - return 0; + return; /* End of SLATBS */ diff --git a/lapack-netlib/SRC/slatbs.f b/lapack-netlib/SRC/slatbs.f index 617d0b2f50..77940f8cd9 100644 --- a/lapack-netlib/SRC/slatbs.f +++ b/lapack-netlib/SRC/slatbs.f @@ -310,6 +310,7 @@ SUBROUTINE SLATBS( UPLO, TRANS, DIAG, NORMIN, N, KD, AB, LDAB, X, * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * @@ -317,7 +318,6 @@ SUBROUTINE SLATBS( UPLO, TRANS, DIAG, NORMIN, N, KD, AB, LDAB, X, * SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * diff --git a/lapack-netlib/SRC/slatdf.c b/lapack-netlib/SRC/slatdf.c index 40efd802b1..0a3f04fbe6 100644 --- a/lapack-netlib/SRC/slatdf.c +++ b/lapack-netlib/SRC/slatdf.c @@ -687,7 +687,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slatdf_(integer *ijob, integer *n, real *z__, integer * +/* Subroutine */ void slatdf_(integer *ijob, integer *n, real *z__, integer * ldz, real *rhs, real *rdsum, real *rdscal, integer *ipiv, integer * jpiv) { @@ -701,21 +701,22 @@ 
f"> */ extern real sdot_(integer *, real *, integer *, real *, integer *); real work[32]; integer i__, j, k; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); real pmone; extern real sasum_(integer *, real *, integer *); real sminu; integer iwork[8]; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *); real splus; - extern /* Subroutine */ int sgesc2_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sgesc2_(integer *, real *, integer *, real *, integer *, integer *, real *); real bm, bp, xm[8], xp[8]; - extern /* Subroutine */ int sgecon_(char *, integer *, real *, integer *, + extern /* Subroutine */ void sgecon_(char *, integer *, real *, integer *, real *, real *, real *, integer *, integer *), slassq_( - integer *, real *, integer *, real *, real *), slaswp_(integer *, + integer *, real *, integer *, real *, real *); + extern int slaswp_(integer *, real *, integer *, integer *, integer *, integer *, integer *); @@ -857,7 +858,7 @@ f"> */ } - return 0; + return; /* End of SLATDF */ diff --git a/lapack-netlib/SRC/slatps.c b/lapack-netlib/SRC/slatps.c index 5756a46268..c3493a99fd 100644 --- a/lapack-netlib/SRC/slatps.c +++ b/lapack-netlib/SRC/slatps.c @@ -742,7 +742,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slatps_(char *uplo, char *trans, char *diag, char * +/* Subroutine */ void slatps_(char *uplo, char *trans, char *diag, char * normin, integer *n, real *ap, real *x, real *scale, real *cnorm, integer *info) { @@ -759,12 +759,12 @@ f"> */ real xmax, grow, sumj; integer i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real 
*, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); real tscal, uscal; integer jlast; extern real sasum_(integer *, real *, integer *); logical upper; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + extern /* Subroutine */ void saxpy_(integer *, real *, real *, integer *, real *, integer *), stpsv_(char *, char *, char *, integer *, real *, real *, integer *); integer ip; @@ -818,13 +818,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLATPS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine machine dependent parameters to control overflow. */ @@ -1381,7 +1381,7 @@ f"> */ sscal_(n, &r__1, &cnorm[1], &c__1); } - return 0; + return; /* End of SLATPS */ diff --git a/lapack-netlib/SRC/slatrd.c b/lapack-netlib/SRC/slatrd.c index d6f13fe50f..a4f86a954d 100644 --- a/lapack-netlib/SRC/slatrd.c +++ b/lapack-netlib/SRC/slatrd.c @@ -715,7 +715,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slatrd_(char *uplo, integer *n, integer *nb, real *a, +/* Subroutine */ void slatrd_(char *uplo, integer *n, integer *nb, real *a, integer *lda, real *e, real *tau, real *w, integer *ldw) { /* System generated locals */ @@ -726,14 +726,14 @@ f"> */ integer i__; real alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), saxpy_( integer *, real *, real *, integer *, real *, integer *), ssymv_( char *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer iw; - extern /* Subroutine */ int slarfg_(integer *, real *, real *, integer *, + extern /* Subroutine */ 
void slarfg_(integer *, real *, real *, integer *, real *); @@ -760,7 +760,7 @@ f"> */ /* Function Body */ if (*n <= 0) { - return 0; + return; } if (lsame_(uplo, "U")) { @@ -906,7 +906,7 @@ f"> */ } } - return 0; + return; /* End of SLATRD */ diff --git a/lapack-netlib/SRC/slatrs.c b/lapack-netlib/SRC/slatrs.c index 44d469bf44..c8463866ed 100644 --- a/lapack-netlib/SRC/slatrs.c +++ b/lapack-netlib/SRC/slatrs.c @@ -752,7 +752,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slatrs_(char *uplo, char *trans, char *diag, char * +/* Subroutine */ void slatrs_(char *uplo, char *trans, char *diag, char * normin, integer *n, real *a, integer *lda, real *x, real *scale, real *cnorm, integer *info) { @@ -769,12 +769,12 @@ f"> */ real xmax, grow, sumj; integer i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); real tscal, uscal; integer jlast; extern real sasum_(integer *, real *, integer *); logical upper; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + extern /* Subroutine */ void saxpy_(integer *, real *, real *, integer *, real *, integer *), strsv_(char *, char *, char *, integer *, real *, integer *, real *, integer *); real xj; @@ -831,13 +831,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLATRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine machine dependent parameters to control overflow. 
*/ @@ -1375,7 +1375,7 @@ f"> */ sscal_(n, &r__1, &cnorm[1], &c__1); } - return 0; + return; /* End of SLATRS */ diff --git a/lapack-netlib/SRC/slatrs.f b/lapack-netlib/SRC/slatrs.f index 94e0e88bc6..0761d656f2 100644 --- a/lapack-netlib/SRC/slatrs.f +++ b/lapack-netlib/SRC/slatrs.f @@ -264,8 +264,8 @@ SUBROUTINE SLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X, SCALE, * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX - REAL SASUM, SDOT, SLAMCH - EXTERNAL LSAME, ISAMAX, SASUM, SDOT, SLAMCH + REAL SASUM, SDOT, SLAMCH, SLANGE + EXTERNAL LSAME, ISAMAX, SASUM, SDOT, SLAMCH, SLANGE * .. * .. External Subroutines .. EXTERNAL SAXPY, SSCAL, STRSV, XERBLA @@ -304,6 +304,7 @@ SUBROUTINE SLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X, SCALE, * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * @@ -311,7 +312,6 @@ SUBROUTINE SLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X, SCALE, * SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * @@ -343,8 +343,67 @@ SUBROUTINE SLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X, SCALE, IF( TMAX.LE.BIGNUM ) THEN TSCAL = ONE ELSE - TSCAL = ONE / ( SMLNUM*TMAX ) - CALL SSCAL( N, TSCAL, CNORM, 1 ) +* +* Avoid NaN generation if entries in CNORM exceed the +* overflow threshold +* + IF ( TMAX.LE.SLAMCH('Overflow') ) THEN +* Case 1: All entries in CNORM are valid floating-point numbers + TSCAL = ONE / ( SMLNUM*TMAX ) + CALL SSCAL( N, TSCAL, CNORM, 1 ) + ELSE +* Case 2: At least one column norm of A cannot be represented +* as floating-point number. Find the offdiagonal entry A( I, J ) +* with the largest absolute value. If this entry is not +/- Infinity, +* use this value as TSCAL. + TMAX = ZERO + IF( UPPER ) THEN +* +* A is upper triangular. +* + DO J = 2, N + TMAX = MAX( SLANGE( 'M', J-1, 1, A( 1, J ), 1, SUMJ ), + $ TMAX ) + END DO + ELSE +* +* A is lower triangular. 
+* + DO J = 1, N - 1 + TMAX = MAX( SLANGE( 'M', N-J, 1, A( J+1, J ), 1, + $ SUMJ ), TMAX ) + END DO + END IF +* + IF( TMAX.LE.SLAMCH('Overflow') ) THEN + TSCAL = ONE / ( SMLNUM*TMAX ) + DO J = 1, N + IF( CNORM( J ).LE.SLAMCH('Overflow') ) THEN + CNORM( J ) = CNORM( J )*TSCAL + ELSE +* Recompute the 1-norm without introducing Infinity +* in the summation + CNORM( J ) = ZERO + IF( UPPER ) THEN + DO I = 1, J - 1 + CNORM( J ) = CNORM( J ) + + $ TSCAL * ABS( A( I, J ) ) + END DO + ELSE + DO I = J + 1, N + CNORM( J ) = CNORM( J ) + + $ TSCAL * ABS( A( I, J ) ) + END DO + END IF + END IF + END DO + ELSE +* At least one entry of A is not a valid floating-point entry. +* Rely on TRSV to propagate Inf and NaN. + CALL STRSV( UPLO, TRANS, DIAG, N, A, LDA, X, 1 ) + RETURN + END IF + END IF END IF * * Compute a bound on the computed solution vector to see if the diff --git a/lapack-netlib/SRC/slatrs3.c b/lapack-netlib/SRC/slatrs3.c new file mode 100644 index 0000000000..e6fc722b10 --- /dev/null +++ b/lapack-netlib/SRC/slatrs3.c @@ -0,0 +1,1262 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline 
_Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + 
+#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? 
*(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = 
"\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], 
x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b SLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. 
+ */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE SLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, */ +/* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) */ + +/* CHARACTER DIAG, NORMIN, TRANS, UPLO */ +/* INTEGER INFO, LDA, LWORK, LDX, N, NRHS */ +/* REAL A( LDA, * ), CNORM( * ), SCALE( * ), */ +/* WORK( * ), X( LDX, * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > SLATRS3 solves one of the triangular systems */ +/* > */ +/* > A * X = B * diag(scale) or A**T * X = B * diag(scale) */ +/* > */ +/* > with scaling to prevent overflow. Here A is an upper or lower */ +/* > triangular matrix, A**T denotes the transpose of A. X and B are */ +/* > n by nrhs matrices and scale is an nrhs element vector of scaling */ +/* > factors. A scaling factor scale(j) is usually less than or equal */ +/* > to 1, chosen such that X(:,j) is less than the overflow threshold. */ +/* > If the matrix A is singular (A(j,j) = 0 for some j), then */ +/* > a non-trivial solution to A*X = 0 is returned. If the system is */ +/* > so badly scaled that the solution cannot be represented as */ +/* > (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. */ +/* > */ +/* > This is a BLAS-3 version of LATRS for solving several right */ +/* > hand sides simultaneously. */ +/* > */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] UPLO */ +/* > \verbatim */ +/* > UPLO is CHARACTER*1 */ +/* > Specifies whether the matrix A is upper or lower triangular. */ +/* > = 'U': Upper triangular */ +/* > = 'L': Lower triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > Specifies the operation applied to A. 
*/ +/* > = 'N': Solve A * x = s*b (No transpose) */ +/* > = 'T': Solve A**T* x = s*b (Transpose) */ +/* > = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] DIAG */ +/* > \verbatim */ +/* > DIAG is CHARACTER*1 */ +/* > Specifies whether or not the matrix A is unit triangular. */ +/* > = 'N': Non-unit triangular */ +/* > = 'U': Unit triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NORMIN */ +/* > \verbatim */ +/* > NORMIN is CHARACTER*1 */ +/* > Specifies whether CNORM has been set or not. */ +/* > = 'Y': CNORM contains the column norms on entry */ +/* > = 'N': CNORM is not set on entry. On exit, the norms will */ +/* > be computed and stored in CNORM. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of X. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is REAL array, dimension (LDA,N) */ +/* > The triangular matrix A. If UPLO = 'U', the leading n by n */ +/* > upper triangular part of the array A contains the upper */ +/* > triangular matrix, and the strictly lower triangular part of */ +/* > A is not referenced. If UPLO = 'L', the leading n by n lower */ +/* > triangular part of the array A contains the lower triangular */ +/* > matrix, and the strictly upper triangular part of A is not */ +/* > referenced. If DIAG = 'U', the diagonal elements of A are */ +/* > also not referenced and are assumed to be 1. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax (1,N). 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is REAL array, dimension (LDX,NRHS) */ +/* > On entry, the right hand side B of the triangular system. */ +/* > On exit, X is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDX */ +/* > \verbatim */ +/* > LDX is INTEGER */ +/* > The leading dimension of the array X. LDX >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is REAL array, dimension (NRHS) */ +/* > The scaling factor s(k) is for the triangular system */ +/* > A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). */ +/* > If SCALE = 0, the matrix A is singular or badly scaled. */ +/* > If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) */ +/* > that is an exact or approximate solution to A*x(:,k) = 0 */ +/* > is returned. If the system so badly scaled that solution */ +/* > cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 */ +/* > is returned. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] CNORM */ +/* > \verbatim */ +/* > CNORM is REAL array, dimension (N) */ +/* > */ +/* > If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */ +/* > contains the norm of the off-diagonal part of the j-th column */ +/* > of A. If TRANS = 'N', CNORM(j) must be greater than or equal */ +/* > to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */ +/* > must be greater than or equal to the 1-norm. */ +/* > */ +/* > If NORMIN = 'N', CNORM is an output argument and CNORM(j) */ +/* > returns the 1-norm of the offdiagonal part of the j-th column */ +/* > of A. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is REAL array, dimension (LWORK). */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal size of */ +/* > WORK. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > LWORK is INTEGER */ +/* > LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where */ +/* > NBA = (N + NB - 1)/NB and NB is the optimal block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -k, the k-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleOTHERauxiliary */ +/* > \par Further Details: */ +/* ===================== */ +/* \verbatim */ +/* The algorithm follows the structure of a block triangular solve. */ +/* The diagonal block is solved with a call to the robust the triangular */ +/* solver LATRS for every right-hand side RHS = 1, ..., NRHS */ +/* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), */ +/* where op( A ) = A or op( A ) = A**T. */ +/* The linear block updates operate on block columns of X, */ +/* B( I, K ) - op(A( I, J )) * X( J, K ) */ +/* and use GEMM. To avoid overflow in the linear block update, the worst case */ +/* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed */ +/* such that */ +/* || s * B( I, RHS )||_oo */ +/* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold */ + +/* Once all columns of a block column have been rescaled (BLAS-1), the linear */ +/* update is executed with GEMM without overflow. */ + +/* To limit rescaling, local scale factors track the scaling of column segments. 
*/ +/* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA */ +/* per right-hand side column RHS = 1, ..., NRHS. The global scale factor */ +/* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) */ +/* I = 1, ..., NBA. */ +/* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) */ +/* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The */ +/* linear update of potentially inconsistently scaled vector segments */ +/* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) */ +/* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, */ +/* if necessary, rescales the blocks prior to calling GEMM. */ + +/* \endverbatim */ +/* ===================================================================== */ +/* References: */ +/* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). */ +/* Parallel robust solution of triangular linear systems. Concurrency */ +/* and Computation: Practice and Experience, 31(19), e5064. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
*/ + +/* ===================================================================== */ +/* Subroutine */ void slatrs3_(char *uplo, char *trans, char *diag, char * + normin, integer *n, integer *nrhs, real *a, integer *lda, real *x, + integer *ldx, real *scale, real *cnorm, real *work, integer *lwork, + integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, + i__6, i__7, i__8; + real r__1, r__2; + + /* Local variables */ + integer iinc, jinc; + real scal, anrm, bnrm; + integer awrk; + real tmax, xnrm[32]; + integer i__, j, k; + real w[64]; + extern logical lsame_(char *, char *); + real rscal; + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), + sgemm_(char *, char *, integer *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *); + integer lanrm, ilast, jlast, i1; + logical upper; + integer i2, j1, j2, k1, k2, nb, ii, kk, lscale; + real scaloc; + extern real slamch_(char *), slange_(char *, integer *, integer *, + real *, integer *, real *); + real scamin; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen ); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + real bignum; + extern real slarmm_(real *, real *, real *); + integer ifirst; + logical notran; + integer jfirst; + extern /* Subroutine */ void slatrs_(char *, char *, char *, char *, + integer *, real *, integer *, real *, real *, real *, integer *); + real smlnum; + logical nounit, lquery; + integer nba, lds, nbx, rhs; + + + +/* ===================================================================== */ + + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + --scale; + --cnorm; + --work; + + /* Function Body */ + *info = 0; + upper = lsame_(uplo, "U"); + notran = lsame_(trans, "N"); + nounit = 
lsame_(diag, "N"); + lquery = *lwork == -1; + +/* Partition A and X into blocks. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "SLATRS", "", n, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + nb = f2cmin(64,nb); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*nrhs + 31) / 32; + nbx = f2cmax(i__1,i__2); + +/* Compute the workspace */ + +/* The workspace comprises two parts. */ +/* The first part stores the local scale factors. Each simultaneously */ +/* computed right-hand side requires one local scale factor per block */ +/* row. WORK( I + KK * LDS ) is the scale factor of the vector */ +/* segment associated with the I-th block row and the KK-th vector */ +/* in the block column. */ +/* Computing MAX */ + i__1 = nba, i__2 = f2cmin(*nrhs,32); + lscale = nba * f2cmax(i__1,i__2); + lds = nba; +/* The second part stores upper bounds of the triangular A. There are */ +/* a total of NBA x NBA blocks, of which only the upper triangular */ +/* part or the lower triangular part is referenced. The upper bound of */ +/* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). */ + lanrm = nba * nba; + awrk = lscale; + work[1] = (real) (lscale + lanrm); + +/* Test the input parameters. */ + + if (! upper && ! lsame_(uplo, "L")) { + *info = -1; + } else if (! notran && ! lsame_(trans, "T") && ! + lsame_(trans, "C")) { + *info = -2; + } else if (! nounit && ! lsame_(diag, "U")) { + *info = -3; + } else if (! lsame_(normin, "Y") && ! lsame_(normin, + "N")) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*nrhs < 0) { + *info = -6; + } else if (*lda < f2cmax(1,*n)) { + *info = -8; + } else if (*ldx < f2cmax(1,*n)) { + *info = -10; + } else if (! 
lquery && (real) (*lwork) < work[1]) { + *info = -14; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("SLATRS3", &i__1, 7); + return; + } else if (lquery) { + return; + } + +/* Initialize scaling factors */ + + i__1 = *nrhs; + for (kk = 1; kk <= i__1; ++kk) { + scale[kk] = 1.f; + } + +/* Quick return if possible */ + + if (f2cmin(*n,*nrhs) == 0) { + return; + } + +/* Determine machine dependent constant to control overflow. */ + + bignum = slamch_("Overflow"); + smlnum = slamch_("Safe Minimum"); + +/* Use unblocked code for small problems */ + + if (*nrhs < 2) { + slatrs_(uplo, trans, diag, normin, n, &a[a_offset], lda, &x[x_dim1 + + 1], &scale[1], &cnorm[1], info); + i__1 = *nrhs; + for (k = 2; k <= i__1; ++k) { + slatrs_(uplo, trans, diag, "Y", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return; + } + +/* Compute norms of blocks of A excluding diagonal blocks and find */ +/* the block with the largest norm TMAX. */ + + tmax = 0.f; + i__1 = nba; + for (j = 1; j <= i__1; ++j) { + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + if (upper) { + ifirst = 1; + ilast = j - 1; + } else { + ifirst = j + 1; + ilast = nba; + } + i__2 = ilast; + for (i__ = ifirst; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*n) + 1; + +/* Compute upper bound of A( I1:I2-1, J1:J2-1 ). */ + + if (notran) { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = slange_("I", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + i__ + (j - 1) * nba] = anrm; + } else { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = slange_("1", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + j + (i__ - 1) * nba] = anrm; + } + tmax = f2cmax(tmax,anrm); + } + } + + if (! (tmax <= slamch_("Overflow"))) { + +/* Some matrix entries have huge absolute value. 
At least one upper */ +/* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point */ +/* number, either due to overflow in LANGE or due to Inf in A. */ +/* Fall back to LATRS. Set normin = 'N' for every right-hand side to */ +/* force computation of TSCAL in LATRS to avoid the likely overflow */ +/* in the computation of the column norms CNORM. */ + + i__1 = *nrhs; + for (k = 1; k <= i__1; ++k) { + slatrs_(uplo, trans, diag, "N", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return; + } + +/* Every right-hand side requires workspace to store NBA local scale */ +/* factors. To save workspace, X is computed successively in block columns */ +/* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient */ +/* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. */ + i__1 = nbx; + for (k = 1; k <= i__1; ++k) { +/* Loop over block columns (index = K) of X and, for column-wise scalings, */ +/* over individual columns (index = KK). */ +/* K1: column index of the first column in X( J, K ) */ +/* K2: column index of the first column in X( J, K+1 ) */ +/* so the K2 - K1 is the column count of the block X( J, K ) */ + k1 = (k - 1 << 5) + 1; +/* Computing MIN */ + i__2 = k << 5; + k2 = f2cmin(i__2,*nrhs) + 1; + +/* Initialize local scaling factors of current block column X( J, K ) */ + + i__2 = k2 - k1; + for (kk = 1; kk <= i__2; ++kk) { + i__3 = nba; + for (i__ = 1; i__ <= i__3; ++i__) { + work[i__ + kk * lds] = 1.f; + } + } + + if (notran) { + +/* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = nba; + jlast = 1; + jinc = -1; + } else { + jfirst = 1; + jlast = nba; + jinc = 1; + } + } else { + +/* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = 1; + jlast = nba; + jinc = 1; + } else { + jfirst = nba; + jlast = 1; + jinc = -1; + } + } + + i__2 = jlast; + i__3 = jinc; + for (j = jfirst; i__3 < 0 ? 
j >= i__2 : j <= i__2; j += i__3) { +/* J1: row index of the first row in A( J, J ) */ +/* J2: row index of the first row in A( J+1, J+1 ) */ +/* so that J2 - J1 is the row count of the block A( J, J ) */ + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) */ +/* for all right-hand sides in the current block column, */ +/* one RHS at a time. */ + + i__4 = k2 - k1; + for (kk = 1; kk <= i__4; ++kk) { + rhs = k1 + kk - 1; + if (kk == 1) { + i__5 = j2 - j1; + slatrs_(uplo, trans, diag, "N", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } else { + i__5 = j2 - j1; + slatrs_(uplo, trans, diag, "Y", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } +/* Find largest absolute value entry in the vector segment */ +/* X( J1:J2-1, RHS ) as an upper bound for the worst case */ +/* growth in the linear updates. */ + i__5 = j2 - j1; + xnrm[kk - 1] = slange_("I", &i__5, &c__1, &x[j1 + rhs * + x_dim1], ldx, w); + + if (scaloc == 0.f) { +/* LATRS found that A is singular through A(j,j) = 0. */ +/* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 */ +/* and compute A*x = 0 (or A**T*x = 0). Note that */ +/* X(J1:J2-1, KK) is set by LATRS. */ + scale[rhs] = 0.f; + i__5 = j1 - 1; + for (ii = 1; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.f; + } + i__5 = *n; + for (ii = j2; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.f; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.f; + } + scaloc = 1.f; + } else if (scaloc * work[j + kk * lds] == 0.f) { +/* LATRS computed a valid scale factor, but combined with */ +/* the current scaling the solution does not have a */ +/* scale factor > 0. */ + +/* Set WORK( J+KK*LDS ) to smallest valid scale */ +/* factor and increase SCALOC accordingly. 
*/ + scal = work[j + kk * lds] / smlnum; + scaloc *= scal; + work[j + kk * lds] = smlnum; +/* If LATRS overestimated the growth, x may be */ +/* rescaled to preserve a valid combined scale */ +/* factor WORK( J, KK ) > 0. */ + rscal = 1.f / scaloc; + if (xnrm[kk - 1] * rscal <= bignum) { + xnrm[kk - 1] *= rscal; + i__5 = j2 - j1; + sscal_(&i__5, &rscal, &x[j1 + rhs * x_dim1], &c__1); + scaloc = 1.f; + } else { +/* The system op(A) * x = b is badly scaled and its */ +/* solution cannot be represented as (1/scale) * x. */ +/* Set x to zero. This approach deviates from LATRS */ +/* where a completely meaningless non-zero vector */ +/* is returned that is not a solution to op(A) * x = b. */ + scale[rhs] = 0.f; + i__5 = *n; + for (ii = 1; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.f; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.f; + } + scaloc = 1.f; + } + } + scaloc *= work[j + kk * lds]; + work[j + kk * lds] = scaloc; + } + +/* Linear block updates */ + + if (notran) { + if (upper) { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } else { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } + } else { + if (upper) { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } else { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } + } + + i__4 = ilast; + i__5 = iinc; + for (i__ = ifirst; i__5 < 0 ? i__ >= i__4 : i__ <= i__4; i__ += + i__5) { +/* I1: row index of the first column in X( I, K ) */ +/* I2: row index of the first column in X( I+1, K ) */ +/* so the I2 - I1 is the row count of the block X( I, K ) */ + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__6 = i__ * nb; + i2 = f2cmin(i__6,*n) + 1; + +/* Prepare the linear update to be executed with GEMM. */ +/* For each column, compute a consistent scaling, a */ +/* scaling factor to survive the linear update, and */ +/* rescale the column segments, if necesssary. Then */ +/* the linear update is safely executed. 
*/ + + i__6 = k2 - k1; + for (kk = 1; kk <= i__6; ++kk) { + rhs = k1 + kk - 1; +/* Compute consistent scaling */ +/* Computing MIN */ + r__1 = work[i__ + kk * lds], r__2 = work[j + kk * lds]; + scamin = f2cmin(r__1,r__2); + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__7 = i2 - i1; + bnrm = slange_("I", &i__7, &c__1, &x[i1 + rhs * x_dim1], + ldx, w); + bnrm *= scamin / work[i__ + kk * lds]; + xnrm[kk - 1] *= scamin / work[j + kk * lds]; + anrm = work[awrk + i__ + (j - 1) * nba]; + scaloc = slarmm_(&anrm, &xnrm[kk - 1], &bnrm); + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to B( I, KK ) and B( J, KK ). */ + + scal = scamin / work[i__ + kk * lds] * scaloc; + if (scal != 1.f) { + i__7 = i2 - i1; + sscal_(&i__7, &scal, &x[i1 + rhs * x_dim1], &c__1); + work[i__ + kk * lds] = scamin * scaloc; + } + + scal = scamin / work[j + kk * lds] * scaloc; + if (scal != 1.f) { + i__7 = j2 - j1; + sscal_(&i__7, &scal, &x[j1 + rhs * x_dim1], &c__1); + work[j + kk * lds] = scamin * scaloc; + } + } + + if (notran) { + +/* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + sgemm_("N", "N", &i__6, &i__7, &i__8, &c_b35, &a[i1 + j1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b36, + &x[i1 + k1 * x_dim1], ldx); + } else { + +/* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + sgemm_("T", "N", &i__6, &i__7, &i__8, &c_b35, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b36, + &x[i1 + k1 * x_dim1], ldx); + } + } + } + +/* Reduce local scaling factors */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { +/* Computing MIN */ + r__1 = scale[rhs], r__2 = work[i__ + kk * lds]; + scale[rhs] = f2cmin(r__1,r__2); + } + } + +/* Realize consistent scaling */ + + i__3 = k2 - k1; + for (kk = 1; 
kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + if (scale[rhs] != 1.f && scale[rhs] != 0.f) { + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__5 = i__ * nb; + i2 = f2cmin(i__5,*n) + 1; + scal = scale[rhs] / work[i__ + kk * lds]; + if (scal != 1.f) { + i__5 = i2 - i1; + sscal_(&i__5, &scal, &x[i1 + rhs * x_dim1], &c__1); + } + } + } + } + } + return; + +/* End of SLATRS3 */ + +} /* slatrs3_ */ + diff --git a/lapack-netlib/SRC/slatrs3.f b/lapack-netlib/SRC/slatrs3.f new file mode 100644 index 0000000000..c3a08e524c --- /dev/null +++ b/lapack-netlib/SRC/slatrs3.f @@ -0,0 +1,656 @@ +*> \brief \b SLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. +* +* Definition: +* =========== +* +* SUBROUTINE SLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), CNORM( * ), SCALE( * ), +* WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale) or A**T * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A. X and B are +*> n by nrhs matrices and scale is an nrhs element vector of scaling +*> factors. A scaling factor scale(j) is usually less than or equal +*> to 1, chosen such that X(:,j) is less than the overflow threshold. +*> If the matrix A is singular (A(j,j) = 0 for some j), then +*> a non-trivial solution to A*X = 0 is returned. If the system is +*> so badly scaled that the solution cannot be represented as +*> (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. 
+*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. +*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). 
+*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is REAL array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is REAL array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. 
+*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. 
The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE SLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + REAL A( LDA, * ), CNORM( * ), X( LDX, * ), + $ SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + REAL W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, SLANGE, SLARMM + EXTERNAL ILAENV, LSAME, SLAMCH, SLANGE, SLARMM +* .. +* .. External Subroutines .. + EXTERNAL SLATRS, SSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. 
Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks. +* + NB = MAX( 8, ILAENV( 1, 'SLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I + KK * LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters. +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. 
LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. +* + BIGNUM = SLAMCH( 'Overflow' ) + SMLNUM = SLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL SLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL SLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. +* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = SLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = SLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1)*NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.SLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. 
+* + DO K = 1, NRHS + CALL SLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). +* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2 - K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF +* + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* for all right-hand sides in the current block column, +* one RHS at a time. 
+* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL SLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL SLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = SLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute A*x = 0 (or A**T*x = 0). Note that +* X(J1:J2-1, KK) is set by LATRS. + SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = ZERO + END DO + DO II = J2, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL SSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. 
+ DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = SLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*(SCAMIN / WORK( J+KK*LDS )) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = SLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to B( I, KK ) and B( J, KK ). 
+* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL SSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL SSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL SGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) +* + CALL SGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO +* +* Reduce local scaling factors +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. 
ZERO ) THEN + DO I = 1, NBA + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL SSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of SLATRS3 +* + END diff --git a/lapack-netlib/SRC/slatrz.c b/lapack-netlib/SRC/slatrz.c index a36c9d56d1..1f8467fe7e 100644 --- a/lapack-netlib/SRC/slatrz.c +++ b/lapack-netlib/SRC/slatrz.c @@ -649,7 +649,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slatrz_(integer *m, integer *n, integer *l, real *a, +/* Subroutine */ void slatrz_(integer *m, integer *n, integer *l, real *a, integer *lda, real *tau, real *work) { /* System generated locals */ @@ -657,7 +657,7 @@ f"> */ /* Local variables */ integer i__; - extern /* Subroutine */ int slarz_(char *, integer *, integer *, integer * + extern /* Subroutine */ void slarz_(char *, integer *, integer *, integer * , real *, integer *, real *, real *, integer *, real *), slarfg_(integer *, real *, real *, integer *, real *); @@ -684,14 +684,14 @@ f"> */ /* Function Body */ if (*m == 0) { - return 0; + return; } else if (*m == *n) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { tau[i__] = 0.f; /* L10: */ } - return 0; + return; } for (i__ = *m; i__ >= 1; --i__) { @@ -713,7 +713,7 @@ f"> */ /* L20: */ } - return 0; + return; /* End of SLATRZ */ diff --git a/lapack-netlib/SRC/slatsqr.c b/lapack-netlib/SRC/slatsqr.c index 34b0d43309..d756560e06 100644 --- a/lapack-netlib/SRC/slatsqr.c +++ b/lapack-netlib/SRC/slatsqr.c @@ -673,7 +673,7 @@ static integer c__0 = 0; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slatsqr_(integer *m, integer *n, integer *mb, integer * +/* Subroutine */ void slatsqr_(integer *m, integer *n, integer *mb, integer * nb, real *a, integer *lda, real *t, integer *ldt, real *work, integer *lwork, 
integer *info) { @@ -682,11 +682,12 @@ static integer c__0 = 0; /* Local variables */ integer i__, ii, kk; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), sgeqrt_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void sgeqrt_( integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *); logical lquery; - extern /* Subroutine */ int stpqrt_(integer *, integer *, integer *, + extern /* Subroutine */ void stpqrt_(integer *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer * , real *, integer *); integer ctr; @@ -738,15 +739,15 @@ static integer c__0 = 0; if (*info != 0) { i__1 = -(*info); xerbla_("SLATSQR", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* The QR Decomposition */ @@ -754,7 +755,7 @@ static integer c__0 = 0; if (*mb <= *n || *mb >= *m) { sgeqrt_(m, n, nb, &a[a_offset], lda, &t[t_offset], ldt, &work[1], info); - return 0; + return; } kk = (*m - *n) % (*mb - *n); ii = *m - kk + 1; @@ -785,7 +786,7 @@ static integer c__0 = 0; } work[1] = (real) (*n * *nb); - return 0; + return; /* End of SLATSQR */ diff --git a/lapack-netlib/SRC/slauu2.c b/lapack-netlib/SRC/slauu2.c index 35fd950698..67ba252af3 100644 --- a/lapack-netlib/SRC/slauu2.c +++ b/lapack-netlib/SRC/slauu2.c @@ -617,7 +617,7 @@ f"> */ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slauu2_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void slauu2_(char *uplo, integer *n, real *a, integer *lda, integer *info) { /* System generated locals */ @@ -627,7 +627,7 @@ f"> */ extern real sdot_(integer *, real *, integer *, real *, integer *); integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern 
/* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); logical upper; @@ -664,13 +664,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLAUU2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -718,7 +718,7 @@ f"> */ } } - return 0; + return; /* End of SLAUU2 */ diff --git a/lapack-netlib/SRC/slauum.c b/lapack-netlib/SRC/slauum.c index da7d7eefec..5b33e2c514 100644 --- a/lapack-netlib/SRC/slauum.c +++ b/lapack-netlib/SRC/slauum.c @@ -618,7 +618,7 @@ f"> */ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int slauum_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void slauum_(char *uplo, integer *n, real *a, integer *lda, integer *info) { /* System generated locals */ @@ -627,17 +627,17 @@ f"> */ /* Local variables */ integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); logical upper; - extern /* Subroutine */ int strmm_(char *, char *, char *, char *, + extern /* Subroutine */ void strmm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ), ssyrk_(char *, char *, integer *, integer *, real *, real *, integer *, real *, real *, integer * ); integer ib; - extern /* Subroutine */ int slauu2_(char *, integer *, real *, integer *, + extern /* Subroutine */ void slauu2_(char *, integer *, real *, integer *, integer *); integer nb; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -674,13 +674,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SLAUUM", &i__1, 
(ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine the block size for this environment. */ @@ -758,7 +758,7 @@ f"> */ } } - return 0; + return; /* End of SLAUUM */ diff --git a/lapack-netlib/SRC/sopgtr.c b/lapack-netlib/SRC/sopgtr.c index 970f0910f9..a9f7625d4b 100644 --- a/lapack-netlib/SRC/sopgtr.c +++ b/lapack-netlib/SRC/sopgtr.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sopgtr_(char *uplo, integer *n, real *ap, real *tau, +/* Subroutine */ void sopgtr_(char *uplo, integer *n, real *ap, real *tau, real *q, integer *ldq, real *work, integer *info) { /* System generated locals */ @@ -634,7 +634,7 @@ f"> */ extern logical lsame_(char *, char *); integer iinfo; logical upper; - extern /* Subroutine */ int sorg2l_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorg2l_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *), sorg2r_(integer *, integer *, integer *, real *, integer *, real *, real *, integer * ); @@ -674,13 +674,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SOPGTR", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -757,7 +757,7 @@ f"> */ &work[1], &iinfo); } } - return 0; + return; /* End of SOPGTR */ diff --git a/lapack-netlib/SRC/sopmtr.c b/lapack-netlib/SRC/sopmtr.c index 8d95afe727..5ab428a2e1 100644 --- a/lapack-netlib/SRC/sopmtr.c +++ b/lapack-netlib/SRC/sopmtr.c @@ -662,7 +662,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sopmtr_(char *side, char *uplo, char *trans, integer *m, +/* Subroutine */ void sopmtr_(char *side, char *uplo, char *trans, integer *m, integer *n, real *ap, real *tau, real *c__, integer *ldc, real 
*work, integer *info) { @@ -673,7 +673,7 @@ f"> */ logical left; integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slarf_(char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *); integer i1; logical upper; @@ -731,13 +731,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SOPMTR", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } if (upper) { @@ -853,7 +853,7 @@ f"> */ /* L20: */ } } - return 0; + return; /* End of SOPMTR */ diff --git a/lapack-netlib/SRC/sorbdb.c b/lapack-netlib/SRC/sorbdb.c index 53a5fb1ea9..deba21c6cb 100644 --- a/lapack-netlib/SRC/sorbdb.c +++ b/lapack-netlib/SRC/sorbdb.c @@ -798,7 +798,7 @@ f"> */ /* > Algorithms, 50(1):33-65, 2009. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sorbdb_(char *trans, char *signs, integer *m, integer *p, +/* Subroutine */ void sorbdb_(char *trans, char *signs, integer *m, integer *p, integer *q, real *x11, integer *ldx11, real *x12, integer *ldx12, real *x21, integer *ldx21, real *x22, integer *ldx22, real *theta, real *phi, real *taup1, real *taup2, real *tauq1, real *tauq2, real * @@ -815,14 +815,14 @@ f"> */ extern real snrm2_(integer *, real *, integer *); integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), slarf_(char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *), saxpy_(integer *, real *, real *, integer *, real *, integer *); real z1, z2, z3, z4; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical lquery; - extern /* Subroutine */ int slarfgp_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slarfgp_(integer *, 
real *, real *, integer *, real *); @@ -927,9 +927,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("xORBDB", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Handle column-major and row-major separately */ @@ -1386,7 +1386,7 @@ f"> */ } - return 0; + return; /* End of SORBDB */ diff --git a/lapack-netlib/SRC/sorbdb1.c b/lapack-netlib/SRC/sorbdb1.c index 0151bcae7f..170f60bf6e 100644 --- a/lapack-netlib/SRC/sorbdb1.c +++ b/lapack-netlib/SRC/sorbdb1.c @@ -715,7 +715,7 @@ static integer c__1 = 1; /* > Algorithms, 50(1):33-65, 2009. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sorbdb1_(integer *m, integer *p, integer *q, real *x11, +/* Subroutine */ void sorbdb1_(integer *m, integer *p, integer *q, real *x11, integer *ldx11, real *x21, integer *ldx21, real *theta, real *phi, real *taup1, real *taup2, real *tauq1, real *work, integer *lwork, integer *info) @@ -726,7 +726,7 @@ static integer c__1 = 1; real r__1, r__2; /* Local variables */ - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); integer lworkmin, lworkopt; extern real snrm2_(integer *, real *, integer *); @@ -734,13 +734,13 @@ static integer c__1 = 1; integer i__; real s; integer ilarf, llarf; - extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slarf_(char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *); integer childinfo; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical lquery; integer iorbdb5, lorbdb5; - extern /* Subroutine */ int sorbdb5_(integer *, integer *, integer *, + extern /* Subroutine */ void sorbdb5_(integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *, integer *), slarfgp_(integer *, 
real *, real *, integer *, real *); @@ -813,9 +813,9 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("SORBDB1", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Reduce columns 1, ..., Q of X11 and X21 */ @@ -883,7 +883,7 @@ static integer c__1 = 1; } - return 0; + return; /* End of SORBDB1 */ diff --git a/lapack-netlib/SRC/sorbdb2.c b/lapack-netlib/SRC/sorbdb2.c index 50e9feedc2..24095a68af 100644 --- a/lapack-netlib/SRC/sorbdb2.c +++ b/lapack-netlib/SRC/sorbdb2.c @@ -714,7 +714,7 @@ static real c_b9 = -1.f; /* > Algorithms, 50(1):33-65, 2009. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sorbdb2_(integer *m, integer *p, integer *q, real *x11, +/* Subroutine */ void sorbdb2_(integer *m, integer *p, integer *q, real *x11, integer *ldx11, real *x21, integer *ldx21, real *theta, real *phi, real *taup1, real *taup2, real *tauq1, real *work, integer *lwork, integer *info) @@ -725,7 +725,7 @@ static real c_b9 = -1.f; real r__1, r__2; /* Local variables */ - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); integer lworkmin, lworkopt; extern real snrm2_(integer *, real *, integer *); @@ -733,14 +733,14 @@ static real c_b9 = -1.f; integer i__; real s; integer ilarf, llarf; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), slarf_(char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *); integer childinfo; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical lquery; integer iorbdb5, lorbdb5; - extern /* Subroutine */ int sorbdb5_(integer *, integer *, integer *, + extern /* Subroutine */ void sorbdb5_(integer *, integer *, integer *, real *, integer *, real *, integer *, real *, 
integer *, real *, integer *, real *, integer *, integer *), slarfgp_(integer *, real *, real *, integer *, real *); @@ -812,9 +812,9 @@ static real c_b9 = -1.f; if (*info != 0) { i__1 = -(*info); xerbla_("SORBDB2", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Reduce rows 1, ..., P of X11 and X21 */ @@ -898,7 +898,7 @@ static real c_b9 = -1.f; i__], &x21[i__ + (i__ + 1) * x21_dim1], ldx21, &work[ilarf]); } - return 0; + return; /* End of SORBDB2 */ diff --git a/lapack-netlib/SRC/sorbdb2.f b/lapack-netlib/SRC/sorbdb2.f index ad3eb269dd..484d352f8c 100644 --- a/lapack-netlib/SRC/sorbdb2.f +++ b/lapack-netlib/SRC/sorbdb2.f @@ -122,14 +122,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is REAL array, dimension (P) +*> TAUP1 is REAL array, dimension (P-1) *> The scalar factors of the elementary reflectors that define *> P1. *> \endverbatim *> *> \param[out] TAUP2 *> \verbatim -*> TAUP2 is REAL array, dimension (M-P) +*> TAUP2 is REAL array, dimension (Q) *> The scalar factors of the elementary reflectors that define *> P2. *> \endverbatim diff --git a/lapack-netlib/SRC/sorbdb3.c b/lapack-netlib/SRC/sorbdb3.c index a3acf4e8a9..0ac80f40e0 100644 --- a/lapack-netlib/SRC/sorbdb3.c +++ b/lapack-netlib/SRC/sorbdb3.c @@ -714,7 +714,7 @@ static integer c__1 = 1; /* > Algorithms, 50(1):33-65, 2009. 
*/ /* > */ /* ===================================================================== */ -/* Subroutine */ int sorbdb3_(integer *m, integer *p, integer *q, real *x11, +/* Subroutine */ void sorbdb3_(integer *m, integer *p, integer *q, real *x11, integer *ldx11, real *x21, integer *ldx21, real *theta, real *phi, real *taup1, real *taup2, real *tauq1, real *work, integer *lwork, integer *info) @@ -725,7 +725,7 @@ static integer c__1 = 1; real r__1, r__2; /* Local variables */ - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); integer lworkmin, lworkopt; extern real snrm2_(integer *, real *, integer *); @@ -733,13 +733,13 @@ static integer c__1 = 1; integer i__; real s; integer ilarf, llarf; - extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slarf_(char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *); integer childinfo; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical lquery; integer iorbdb5, lorbdb5; - extern /* Subroutine */ int sorbdb5_(integer *, integer *, integer *, + extern /* Subroutine */ void sorbdb5_(integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *, integer *), slarfgp_(integer *, real *, real *, integer *, real *); @@ -811,9 +811,9 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("SORBDB3", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Reduce rows 1, ..., M-P of X11 and X21 */ @@ -896,7 +896,7 @@ static integer c__1 = 1; i__], &x11[i__ + (i__ + 1) * x11_dim1], ldx11, &work[ilarf]); } - return 0; + return; /* End of SORBDB3 */ diff --git a/lapack-netlib/SRC/sorbdb4.c b/lapack-netlib/SRC/sorbdb4.c index dee93d5325..02328c2536 100644 --- a/lapack-netlib/SRC/sorbdb4.c +++ 
b/lapack-netlib/SRC/sorbdb4.c @@ -726,7 +726,7 @@ static real c_b5 = -1.f; /* > Algorithms, 50(1):33-65, 2009. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sorbdb4_(integer *m, integer *p, integer *q, real *x11, +/* Subroutine */ void sorbdb4_(integer *m, integer *p, integer *q, real *x11, integer *ldx11, real *x21, integer *ldx21, real *theta, real *phi, real *taup1, real *taup2, real *tauq1, real *phantom, real *work, integer *lwork, integer *info) @@ -737,7 +737,7 @@ static real c_b5 = -1.f; real r__1, r__2; /* Local variables */ - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); integer lworkmin, lworkopt; extern real snrm2_(integer *, real *, integer *); @@ -745,14 +745,14 @@ static real c_b5 = -1.f; integer i__, j; real s; integer ilarf, llarf; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), slarf_(char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *); integer childinfo; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical lquery; integer iorbdb5, lorbdb5; - extern /* Subroutine */ int sorbdb5_(integer *, integer *, integer *, + extern /* Subroutine */ void sorbdb5_(integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *, integer *), slarfgp_(integer *, real *, real *, integer *, real *); @@ -827,9 +827,9 @@ static real c_b5 = -1.f; if (*info != 0) { i__1 = -(*info); xerbla_("SORBDB4", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Reduce columns 1, ..., M-Q of X11 and X21 */ @@ -958,7 +958,7 @@ static real c_b5 = -1.f; x21_dim1], ldx21, &work[ilarf]); } - return 0; + return; /* End of SORBDB4 */ diff --git 
a/lapack-netlib/SRC/sorbdb4.f b/lapack-netlib/SRC/sorbdb4.f index b18ed3b270..bf60fb7bb4 100644 --- a/lapack-netlib/SRC/sorbdb4.f +++ b/lapack-netlib/SRC/sorbdb4.f @@ -124,14 +124,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is REAL array, dimension (P) +*> TAUP1 is REAL array, dimension (M-Q) *> The scalar factors of the elementary reflectors that define *> P1. *> \endverbatim *> *> \param[out] TAUP2 *> \verbatim -*> TAUP2 is REAL array, dimension (M-P) +*> TAUP2 is REAL array, dimension (M-Q) *> The scalar factors of the elementary reflectors that define *> P2. *> \endverbatim diff --git a/lapack-netlib/SRC/sorbdb5.c b/lapack-netlib/SRC/sorbdb5.c index 71a684469a..26cfc75917 100644 --- a/lapack-netlib/SRC/sorbdb5.c +++ b/lapack-netlib/SRC/sorbdb5.c @@ -664,7 +664,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sorbdb5_(integer *m1, integer *m2, integer *n, real *x1, +/* Subroutine */ void sorbdb5_(integer *m1, integer *m2, integer *n, real *x1, integer *incx1, real *x2, integer *incx2, real *q1, integer *ldq1, real *q2, integer *ldq2, real *work, integer *lwork, integer *info) { @@ -674,7 +674,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ extern real snrm2_(integer *, real *, integer *); integer i__, j, childinfo; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), sorbdb6_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void sorbdb6_( integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer * , integer *); @@ -725,7 +726,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("SORBDB5", &i__1, (ftnlen)7); - return 0; + return; } /* Project X 
onto the orthogonal complement of Q */ @@ -737,7 +738,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (snrm2_(m1, &x1[1], incx1) != 0.f || snrm2_(m2, &x2[1], incx2) != 0.f) { - return 0; + return; } /* Project each standard basis vector e_1,...,e_M1 in turn, stopping */ @@ -758,7 +759,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ ldq1, &q2[q2_offset], ldq2, &work[1], lwork, &childinfo); if (snrm2_(m1, &x1[1], incx1) != 0.f || snrm2_(m2, &x2[1], incx2) != 0.f) { - return 0; + return; } } @@ -780,11 +781,11 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ ldq1, &q2[q2_offset], ldq2, &work[1], lwork, &childinfo); if (snrm2_(m1, &x1[1], incx1) != 0.f || snrm2_(m2, &x2[1], incx2) != 0.f) { - return 0; + return; } } - return 0; + return; /* End of SORBDB5 */ diff --git a/lapack-netlib/SRC/sorbdb6.c b/lapack-netlib/SRC/sorbdb6.c index c22606bd3f..2f61c38349 100644 --- a/lapack-netlib/SRC/sorbdb6.c +++ b/lapack-netlib/SRC/sorbdb6.c @@ -669,7 +669,7 @@ static real c_b12 = -1.f; /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sorbdb6_(integer *m1, integer *m2, integer *n, real *x1, +/* Subroutine */ void sorbdb6_(integer *m1, integer *m2, integer *n, real *x1, integer *incx1, real *x2, integer *incx2, real *q1, integer *ldq1, real *q2, integer *ldq2, real *work, integer *lwork, integer *info) { @@ -679,8 +679,10 @@ static real c_b12 = -1.f; /* Local variables */ integer i__; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, - real *, integer *, real *, integer *, real *, real *, integer *), xerbla_(char *, integer *, ftnlen), slassq_(integer *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void 
slassq_(integer *, real *, integer *, real *, real *); real normsq1, normsq2, scl1, scl2, ssq1, ssq2; @@ -730,7 +732,7 @@ static real c_b12 = -1.f; if (*info != 0) { i__1 = -(*info); xerbla_("SORBDB6", &i__1, (ftnlen)7); - return 0; + return; } /* First, project X onto the orthogonal complement of Q's column */ @@ -783,11 +785,11 @@ static real c_b12 = -1.f; /* Otherwise, project again. */ if (normsq2 >= normsq1 * .01f) { - return 0; + return; } if (normsq2 == 0.f) { - return 0; + return; } normsq1 = normsq2; @@ -842,7 +844,7 @@ static real c_b12 = -1.f; } } - return 0; + return; /* End of SORBDB6 */ diff --git a/lapack-netlib/SRC/sorbdb6.f b/lapack-netlib/SRC/sorbdb6.f index a23b42bebc..b2449e3bed 100644 --- a/lapack-netlib/SRC/sorbdb6.f +++ b/lapack-netlib/SRC/sorbdb6.f @@ -41,10 +41,16 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The columns of Q must be orthonormal. +*> The Euclidean norm of X must be one and the columns of Q must be +*> orthonormal. The orthogonalized vector will be zero if and only if it +*> lies entirely in the range of Q. *> -*> If the projection is zero according to Kahan's "twice is enough" -*> criterion, then the zero vector is returned. +*> The projection is computed with at most two iterations of the +*> classical Gram-Schmidt algorithm, see +*> * L. Giraud, J. Langou, M. Rozložník. "On the round-off error +*> analysis of the Gram-Schmidt algorithm with reorthogonalization." +*> 2002. CERFACS Technical Report No. TR/PA/02/33. URL: +*> https://www.cerfacs.fr/algor/reports/2002/TR_PA_02_33.pdf *> *>\endverbatim * @@ -167,15 +173,18 @@ SUBROUTINE SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, * ===================================================================== * * .. Parameters .. 
- REAL ALPHASQ, REALONE, REALZERO - PARAMETER ( ALPHASQ = 0.01E0, REALONE = 1.0E0, + REAL ALPHA, REALONE, REALZERO + PARAMETER ( ALPHA = 0.01E0, REALONE = 1.0E0, $ REALZERO = 0.0E0 ) REAL NEGONE, ONE, ZERO PARAMETER ( NEGONE = -1.0E0, ONE = 1.0E0, ZERO = 0.0E0 ) * .. * .. Local Scalars .. - INTEGER I - REAL NORMSQ1, NORMSQ2, SCL1, SCL2, SSQ1, SSQ2 + INTEGER I, IX + REAL EPS, NORM, NORM_NEW, SCL, SSQ +* .. +* .. External Functions .. + REAL SLAMCH * .. * .. External Subroutines .. EXTERNAL SGEMV, SLASSQ, XERBLA @@ -210,17 +219,17 @@ SUBROUTINE SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, CALL XERBLA( 'SORBDB6', -INFO ) RETURN END IF +* + EPS = SLAMCH( 'Precision' ) * * First, project X onto the orthogonal complement of Q's column * space * - SCL1 = REALZERO - SSQ1 = REALONE - CALL SLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL SLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ1 = SCL1**2*SSQ1 + SCL2**2*SSQ2 +* Christoph Conrads: In debugging mode the norm should be computed +* and an assertion added comparing the norm with one. Alas, Fortran +* never made it into 1989 when assert() was introduced into the C +* programming language. + NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N @@ -238,27 +247,31 @@ SUBROUTINE SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, CALL SGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL SLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL SLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL SLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL SLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If projection is sufficiently large in norm, then stop. * If projection is zero, then stop. * Otherwise, project again. * - IF( NORMSQ2 .GE. ALPHASQ*NORMSQ1 ) THEN + IF( NORM_NEW .GE. ALPHA * NORM ) THEN RETURN END IF * - IF( NORMSQ2 .EQ. 
ZERO ) THEN + IF( NORM_NEW .LE. N * EPS * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1( IX ) = ZERO + END DO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2( IX ) = ZERO + END DO RETURN END IF * - NORMSQ1 = NORMSQ2 + NORM = NORM_NEW * DO I = 1, N WORK(I) = ZERO @@ -280,24 +293,22 @@ SUBROUTINE SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, CALL SGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL SLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL SLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL SLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL SLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If second projection is sufficiently large in norm, then do * nothing more. Alternatively, if it shrunk significantly, then * truncate it to zero. * - IF( NORMSQ2 .LT. ALPHASQ*NORMSQ1 ) THEN - DO I = 1, M1 - X1(I) = ZERO + IF( NORM_NEW .LT. 
ALPHA * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1(IX) = ZERO END DO - DO I = 1, M2 - X2(I) = ZERO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2(IX) = ZERO END DO END IF * @@ -306,4 +317,3 @@ SUBROUTINE SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, * End of SORBDB6 * END - diff --git a/lapack-netlib/SRC/sorcsd.c b/lapack-netlib/SRC/sorcsd.c index 609fa835bc..108173ba29 100644 --- a/lapack-netlib/SRC/sorcsd.c +++ b/lapack-netlib/SRC/sorcsd.c @@ -810,7 +810,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sorcsd_(char *jobu1, char *jobu2, char *jobv1t, char * +/* Subroutine */ void sorcsd_(char *jobu1, char *jobu2, char *jobv1t, char * jobv2t, char *trans, char *signs, integer *m, integer *p, integer *q, real *x11, integer *ldx11, real *x12, integer *ldx12, real *x21, integer *ldx21, real *x22, integer *ldx22, real *theta, real *u1, @@ -837,30 +837,31 @@ f"> */ lbbcsdworkopt; logical wantu1, wantu2; integer ibbcsd, lorbdbworkopt; - extern /* Subroutine */ int sbbcsd_(char *, char *, char *, char *, char * + extern /* Subroutine */ void sbbcsd_(char *, char *, char *, char *, char * , integer *, integer *, integer *, real *, real *, real *, integer *, real *, integer *, real *, integer *, real *, integer * , real *, real *, real *, real *, real *, real *, real *, real *, real *, integer *, integer *); integer iorbdb, lorglqworkmin, lorgqrworkmin; - extern /* Subroutine */ int sorbdb_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sorbdb_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer * , real *, integer *, real *, real *, real *, real *, real *, real - *, real *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + *, real *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer lorglqworkopt, lorgqrworkopt; - extern /* Subroutine */ int 
slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); integer iorglq; - extern /* Subroutine */ int slapmr_(logical *, integer *, integer *, real + extern /* Subroutine */ void slapmr_(logical *, integer *, integer *, real *, integer *, integer *), slapmt_(logical *, integer *, integer *, real *, integer *, integer *); integer iorgqr; char signst[1]; - extern /* Subroutine */ int sorglq_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorglq_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *); char transt[1]; integer lbbcsdwork; - extern /* Subroutine */ int sorgqr_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorgqr_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *); logical lquery; integer lorbdbwork, lorglqwork, lorgqrwork; @@ -986,7 +987,7 @@ f"> */ ldx12, &x22[x22_offset], ldx22, &theta[1], &v1t[v1t_offset], ldv1t, &v2t[v2t_offset], ldv2t, &u1[u1_offset], ldu1, &u2[ u2_offset], ldu2, &work[1], lwork, &iwork[1], info); - return 0; + return; } /* Work with permutation [ 0 I; I 0 ] * X * [ 0 I; I 0 ] if */ @@ -1006,7 +1007,7 @@ f"> */ u2_offset], ldu2, &u1[u1_offset], ldu1, &v2t[v2t_offset], ldv2t, &v1t[v1t_offset], ldv1t, &work[1], lwork, &iwork[1], info); - return 0; + return; } /* Compute workspace */ @@ -1113,9 +1114,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORCSD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Transform to bidiagonal block form */ @@ -1275,7 +1276,7 @@ f"> */ } } - return 0; + return; /* End SORCSD */ diff --git a/lapack-netlib/SRC/sorcsd2by1.c b/lapack-netlib/SRC/sorcsd2by1.c index 7e0186a3d3..7dfb5db122 100644 --- a/lapack-netlib/SRC/sorcsd2by1.c +++ b/lapack-netlib/SRC/sorcsd2by1.c @@ -746,7 +746,7 @@ by1.f"> */ /* > \ingroup realOTHERcomputational */ /* 
===================================================================== */ -/* Subroutine */ int sorcsd2by1_(char *jobu1, char *jobu2, char *jobv1t, +/* Subroutine */ void sorcsd2by1_(char *jobu1, char *jobu2, char *jobv1t, integer *m, integer *p, integer *q, real *x11, integer *ldx11, real * x21, integer *ldx21, real *theta, real *u1, integer *ldu1, real *u2, integer *ldu2, real *v1t, integer *ldv1t, real *work, integer *lwork, @@ -761,34 +761,35 @@ by1.f"> */ lworkmin, lworkopt, i__, j, r__; extern logical lsame_(char *, char *); integer childinfo; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); integer lorglqmin, lorgqrmin, lorglqopt, itaup1, itaup2, itauq1, lorgqropt; logical wantu1, wantu2; integer ibbcsd, lbbcsd; - extern /* Subroutine */ int sbbcsd_(char *, char *, char *, char *, char * + extern /* Subroutine */ void sbbcsd_(char *, char *, char *, char *, char * , integer *, integer *, integer *, real *, real *, real *, integer *, real *, integer *, real *, integer *, real *, integer * , real *, real *, real *, real *, real *, real *, real *, real *, real *, integer *, integer *); integer iorbdb, lorbdb; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slacpy_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slacpy_( char *, integer *, integer *, real *, integer *, real *, integer * ); integer iorglq; - extern /* Subroutine */ int slapmr_(logical *, integer *, integer *, real + extern /* Subroutine */ void slapmr_(logical *, integer *, integer *, real *, integer *, integer *); integer lorglq; - extern /* Subroutine */ int slapmt_(logical *, integer *, integer *, real + extern /* Subroutine */ void slapmt_(logical *, integer *, integer *, real *, integer *, integer *); integer iorgqr, lorgqr; - extern /* Subroutine */ int sorglq_(integer *, integer *, integer *, real + extern /* Subroutine */ void 
sorglq_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *), sorgqr_( integer *, integer *, integer *, real *, integer *, real *, real * , integer *, integer *); logical lquery; - extern /* Subroutine */ int sorbdb1_(integer *, integer *, integer *, + extern /* Subroutine */ void sorbdb1_(integer *, integer *, integer *, real *, integer *, real *, integer *, real *, real *, real *, real *, real *, real *, integer *, integer *), sorbdb2_(integer *, integer *, integer *, real *, integer *, real *, integer *, real @@ -1113,9 +1114,9 @@ by1.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORCSD2BY1", &i__1, (ftnlen)10); - return 0; + return; } else if (lquery) { - return 0; + return; } lorgqr = *lwork - iorgqr + 1; lorglq = *lwork - iorglq + 1; @@ -1425,7 +1426,7 @@ by1.f"> */ } } - return 0; + return; /* End of SORCSD2BY1 */ diff --git a/lapack-netlib/SRC/sorg2l.c b/lapack-netlib/SRC/sorg2l.c index a62c42deb4..f5816ef82e 100644 --- a/lapack-netlib/SRC/sorg2l.c +++ b/lapack-netlib/SRC/sorg2l.c @@ -628,7 +628,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sorg2l_(integer *m, integer *n, integer *k, real *a, +/* Subroutine */ void sorg2l_(integer *m, integer *n, integer *k, real *a, integer *lda, real *tau, real *work, integer *info) { /* System generated locals */ @@ -637,7 +637,7 @@ f"> */ /* Local variables */ integer i__, j, l; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), slarf_(char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *); integer ii; @@ -676,13 +676,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORG2L", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } /* Initialise columns 1:n-k to columns of the unit 
matrix */ @@ -723,7 +723,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of SORG2L */ diff --git a/lapack-netlib/SRC/sorg2r.c b/lapack-netlib/SRC/sorg2r.c index 144d89e1e0..425657cc10 100644 --- a/lapack-netlib/SRC/sorg2r.c +++ b/lapack-netlib/SRC/sorg2r.c @@ -628,7 +628,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sorg2r_(integer *m, integer *n, integer *k, real *a, +/* Subroutine */ void sorg2r_(integer *m, integer *n, integer *k, real *a, integer *lda, real *tau, real *work, integer *info) { /* System generated locals */ @@ -637,9 +637,10 @@ f"> */ /* Local variables */ integer i__, j, l; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), slarf_(char *, integer *, integer *, real *, integer *, real *, - real *, integer *, real *), xerbla_(char *, integer *, ftnlen); + real *, integer *, real *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -674,13 +675,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORG2R", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } /* Initialise columns k+1:n to columns of the unit matrix */ @@ -723,7 +724,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of SORG2R */ diff --git a/lapack-netlib/SRC/sorgbr.c b/lapack-netlib/SRC/sorgbr.c index a10e71b2e7..59f5b289f4 100644 --- a/lapack-netlib/SRC/sorgbr.c +++ b/lapack-netlib/SRC/sorgbr.c @@ -670,7 +670,7 @@ f"> */ /* > \ingroup realGBcomputational */ /* ===================================================================== */ -/* Subroutine */ int sorgbr_(char *vect, integer *m, integer *n, integer *k, +/* Subroutine */ void sorgbr_(char *vect, integer *m, integer *n, integer *k, real *a, integer *lda, real *tau, real *work, integer 
*lwork, integer *info) { @@ -683,7 +683,8 @@ f"> */ integer iinfo; logical wantq; integer mn; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), sorglq_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void sorglq_( integer *, integer *, integer *, real *, integer *, real *, real * , integer *, integer *), sorgqr_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *); @@ -765,17 +766,17 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORGBR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { work[1] = (real) lwkopt; - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { work[1] = 1.f; - return 0; + return; } if (wantq) { @@ -872,7 +873,7 @@ f"> */ } } work[1] = (real) lwkopt; - return 0; + return; /* End of SORGBR */ diff --git a/lapack-netlib/SRC/sorgbr.f b/lapack-netlib/SRC/sorgbr.f index 8f15523d4b..b1a5c03a26 100644 --- a/lapack-netlib/SRC/sorgbr.f +++ b/lapack-netlib/SRC/sorgbr.f @@ -232,7 +232,7 @@ SUBROUTINE SORGBR( VECT, M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF END IF END IF - LWKOPT = WORK( 1 ) + LWKOPT = INT( WORK( 1 ) ) LWKOPT = MAX (LWKOPT, MN) END IF * diff --git a/lapack-netlib/SRC/sorghr.c b/lapack-netlib/SRC/sorghr.c index c4fbb26caa..ea21f32a25 100644 --- a/lapack-netlib/SRC/sorghr.c +++ b/lapack-netlib/SRC/sorghr.c @@ -640,7 +640,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sorghr_(integer *n, integer *ilo, integer *ihi, real *a, +/* Subroutine */ void sorghr_(integer *n, integer *ilo, integer *ihi, real *a, integer *lda, real *tau, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -651,7 +651,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* 
Subroutine */ int sorgqr_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorgqr_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *, integer *); integer lwkopt; logical lquery; @@ -701,16 +701,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORGHR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { work[1] = 1.f; - return 0; + return; } /* Shift the vectors which define the elementary reflectors one */ @@ -765,7 +765,7 @@ f"> */ ilo], &work[1], lwork, &iinfo); } work[1] = (real) lwkopt; - return 0; + return; /* End of SORGHR */ diff --git a/lapack-netlib/SRC/sorgl2.c b/lapack-netlib/SRC/sorgl2.c index d5dfcc95e7..1fb907dfc4 100644 --- a/lapack-netlib/SRC/sorgl2.c +++ b/lapack-netlib/SRC/sorgl2.c @@ -622,7 +622,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sorgl2_(integer *m, integer *n, integer *k, real *a, +/* Subroutine */ void sorgl2_(integer *m, integer *n, integer *k, real *a, integer *lda, real *tau, real *work, integer *info) { /* System generated locals */ @@ -631,9 +631,10 @@ f"> */ /* Local variables */ integer i__, j, l; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), slarf_(char *, integer *, integer *, real *, integer *, real *, - real *, integer *, real *), xerbla_(char *, integer *, ftnlen); + real *, integer *, real *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -668,13 +669,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORGL2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m <= 0) { - return 0; + return; } if (*k < *m) { @@ -722,7 +723,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of 
SORGL2 */ diff --git a/lapack-netlib/SRC/sorglq.c b/lapack-netlib/SRC/sorglq.c index 7897879d34..ff865dcfa0 100644 --- a/lapack-netlib/SRC/sorglq.c +++ b/lapack-netlib/SRC/sorglq.c @@ -643,7 +643,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sorglq_(integer *m, integer *n, integer *k, real *a, +/* Subroutine */ void sorglq_(integer *m, integer *n, integer *k, real *a, integer *lda, real *tau, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -651,15 +651,16 @@ f"> */ /* Local variables */ integer i__, j, l, nbmin, iinfo; - extern /* Subroutine */ int sorgl2_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorgl2_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *); integer ib, nb, ki, kk, nx; - extern /* Subroutine */ int slarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void slarfb_(char *, char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, - integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void slarft_(char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *); integer ldwork, lwkopt; logical lquery; @@ -704,16 +705,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORGLQ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m <= 0) { work[1] = 1.f; - return 0; + return; } nbmin = 2; @@ -832,7 +833,7 @@ f"> */ } work[1] = (real) iws; - return 0; + return; /* End of SORGLQ */ diff --git 
a/lapack-netlib/SRC/sorgql.c b/lapack-netlib/SRC/sorgql.c index 31b454e21a..81ee726b10 100644 --- a/lapack-netlib/SRC/sorgql.c +++ b/lapack-netlib/SRC/sorgql.c @@ -644,7 +644,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sorgql_(integer *m, integer *n, integer *k, real *a, +/* Subroutine */ void sorgql_(integer *m, integer *n, integer *k, real *a, integer *lda, real *tau, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -652,15 +652,16 @@ f"> */ /* Local variables */ integer i__, j, l, nbmin, iinfo; - extern /* Subroutine */ int sorg2l_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorg2l_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *); integer ib, nb, kk, nx; - extern /* Subroutine */ int slarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void slarfb_(char *, char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, - integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void slarft_(char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *); integer ldwork, lwkopt; logical lquery; @@ -716,15 +717,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORGQL", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } nbmin = 2; @@ -842,7 +843,7 @@ f"> */ } work[1] = (real) iws; - return 0; + return; /* End of SORGQL */ diff --git a/lapack-netlib/SRC/sorgqr.c 
b/lapack-netlib/SRC/sorgqr.c index c7ced9482c..208038f538 100644 --- a/lapack-netlib/SRC/sorgqr.c +++ b/lapack-netlib/SRC/sorgqr.c @@ -644,7 +644,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sorgqr_(integer *m, integer *n, integer *k, real *a, +/* Subroutine */ void sorgqr_(integer *m, integer *n, integer *k, real *a, integer *lda, real *tau, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -652,15 +652,16 @@ f"> */ /* Local variables */ integer i__, j, l, nbmin, iinfo, ib; - extern /* Subroutine */ int sorg2r_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorg2r_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *); integer nb, ki, kk, nx; - extern /* Subroutine */ int slarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void slarfb_(char *, char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, - integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void slarft_(char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *); integer ldwork, lwkopt; logical lquery; @@ -705,16 +706,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORGQR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { work[1] = 1.f; - return 0; + return; } nbmin = 2; @@ -833,7 +834,7 @@ f"> */ } work[1] = (real) iws; - return 0; + return; /* End of SORGQR */ diff --git a/lapack-netlib/SRC/sorgr2.c b/lapack-netlib/SRC/sorgr2.c 
index fd44b2bcba..04b830eac1 100644 --- a/lapack-netlib/SRC/sorgr2.c +++ b/lapack-netlib/SRC/sorgr2.c @@ -624,7 +624,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sorgr2_(integer *m, integer *n, integer *k, real *a, +/* Subroutine */ void sorgr2_(integer *m, integer *n, integer *k, real *a, integer *lda, real *tau, real *work, integer *info) { /* System generated locals */ @@ -633,7 +633,7 @@ f"> */ /* Local variables */ integer i__, j, l; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), slarf_(char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *); integer ii; @@ -672,13 +672,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORGR2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m <= 0) { - return 0; + return; } if (*k < *m) { @@ -724,7 +724,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of SORGR2 */ diff --git a/lapack-netlib/SRC/sorgrq.c b/lapack-netlib/SRC/sorgrq.c index 66d2433914..b32dd46065 100644 --- a/lapack-netlib/SRC/sorgrq.c +++ b/lapack-netlib/SRC/sorgrq.c @@ -644,7 +644,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sorgrq_(integer *m, integer *n, integer *k, real *a, +/* Subroutine */ void sorgrq_(integer *m, integer *n, integer *k, real *a, integer *lda, real *tau, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -652,15 +652,16 @@ f"> */ /* Local variables */ integer i__, j, l, nbmin, iinfo, ib; - extern /* Subroutine */ int sorgr2_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorgr2_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *); integer nb, ii, kk, nx; - extern /* Subroutine */ 
int slarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void slarfb_(char *, char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, - integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void slarft_(char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *); integer ldwork, lwkopt; logical lquery; @@ -716,15 +717,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORGRQ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m <= 0) { - return 0; + return; } nbmin = 2; @@ -842,7 +843,7 @@ f"> */ } work[1] = (real) iws; - return 0; + return; /* End of SORGRQ */ diff --git a/lapack-netlib/SRC/sorgtr.c b/lapack-netlib/SRC/sorgtr.c index e7ff6daaa8..1cc106671d 100644 --- a/lapack-netlib/SRC/sorgtr.c +++ b/lapack-netlib/SRC/sorgtr.c @@ -637,7 +637,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sorgtr_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void sorgtr_(char *uplo, integer *n, real *a, integer *lda, real *tau, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -652,7 +652,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int sorgql_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorgql_(integer *, integer *, integer *, real *, integer *, real *, real *, 
integer *, integer *), sorgqr_( integer *, integer *, integer *, real *, integer *, real *, real * , integer *, integer *); @@ -719,16 +719,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORGTR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { work[1] = 1.f; - return 0; + return; } if (upper) { @@ -799,7 +799,7 @@ f"> */ } } work[1] = (real) lwkopt; - return 0; + return; /* End of SORGTR */ diff --git a/lapack-netlib/SRC/sorgtsqr.c b/lapack-netlib/SRC/sorgtsqr.c index 5b63e99d7c..a9f8a30d6b 100644 --- a/lapack-netlib/SRC/sorgtsqr.c +++ b/lapack-netlib/SRC/sorgtsqr.c @@ -688,7 +688,7 @@ r.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int sorgtsqr_(integer *m, integer *n, integer *mb, integer * +/* Subroutine */ void sorgtsqr_(integer *m, integer *n, integer *mb, integer * nb, real *a, integer *lda, real *t, integer *ldt, real *work, integer *lwork, integer *info) { @@ -696,14 +696,15 @@ r.f"> */ integer a_dim1, a_offset, t_dim1, t_offset, i__1, i__2; /* Local variables */ - extern /* Subroutine */ int slamtsqr_(char *, char *, integer *, integer * + extern /* Subroutine */ void slamtsqr_(char *, char *, integer *, integer * , integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *, integer *); integer lworkopt, j, iinfo; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); integer lc, lw; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slaset_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slaset_( char *, integer *, integer *, real *, real *, real *, integer *); logical lquery; integer ldc, nblocal; @@ -784,17 +785,17 @@ r.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORGTSQR", 
&i__1, (ftnlen)8); - return 0; + return; } else if (lquery) { work[1] = (real) lworkopt; - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { work[1] = (real) lworkopt; - return 0; + return; } /* (1) Form explicitly the tall-skinny M-by-N left submatrix Q1_in */ @@ -828,7 +829,7 @@ r.f"> */ } work[1] = (real) lworkopt; - return 0; + return; /* End of SORGTSQR */ diff --git a/lapack-netlib/SRC/sorgtsqr_row.c b/lapack-netlib/SRC/sorgtsqr_row.c index 3875372e21..cd966e9e5c 100644 --- a/lapack-netlib/SRC/sorgtsqr_row.c +++ b/lapack-netlib/SRC/sorgtsqr_row.c @@ -701,7 +701,7 @@ r_row.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sorgtsqr_row_(integer *m, integer *n, integer *mb, +/* Subroutine */ void sorgtsqr_row_(integer *m, integer *n, integer *mb, integer *nb, real *a, integer *lda, real *t, integer *ldt, real *work, integer *lwork, integer *info) { @@ -712,12 +712,13 @@ r_row.f"> */ integer jb_t__, itmp, lworkopt; real dummy[1] /* was [1][1] */; integer ib_bottom__, ib, kb; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slaset_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slaset_( char *, integer *, integer *, real *, real *, real *, integer *); integer mb1, mb2, m_plus_one__; logical lquery; integer num_all_row_blocks__, imb, knb, nblocal, kb_last__; - extern /* Subroutine */ int slarfb_gett_(char *, integer *, integer *, + extern /* Subroutine */ void slarfb_gett_(char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer * , real *, integer *); @@ -779,17 +780,17 @@ r_row.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORGTSQR_ROW", &i__1, (ftnlen)12); - return 0; + return; } else if (lquery) { work[1] = (real) lworkopt; - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { work[1] = (real) lworkopt; - return 0; + return; } /* 
(0) Set the upper-triangular part of the matrix A to zero and */ @@ -911,7 +912,7 @@ r_row.f"> */ } work[1] = (real) lworkopt; - return 0; + return; /* End of SORGTSQR_ROW */ diff --git a/lapack-netlib/SRC/sorhr_col.c b/lapack-netlib/SRC/sorhr_col.c index 706200dd08..d2110548ec 100644 --- a/lapack-netlib/SRC/sorhr_col.c +++ b/lapack-netlib/SRC/sorhr_col.c @@ -773,17 +773,17 @@ ol.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int sorhr_col_(integer *m, integer *n, integer *nb, real *a, +/* Subroutine */ void sorhr_col_(integer *m, integer *n, integer *nb, real *a, integer *lda, real *t, integer *ldt, real *d__, integer *info) { /* System generated locals */ integer a_dim1, a_offset, t_dim1, t_offset, i__1, i__2, i__3, i__4; /* Local variables */ - extern /* Subroutine */ int slaorhr_col_getrfnp_(integer *, integer *, + extern /* Subroutine */ void slaorhr_col_getrfnp_(integer *, integer *, real *, integer *, real *, integer *); integer nplusone, i__, j, iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *), strsm_( char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer *); @@ -835,13 +835,13 @@ ol.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORHR_COL", &i__1, (ftnlen)9); - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* On input, the M-by-N matrix A contains the orthogonal */ @@ -970,7 +970,7 @@ ol.f"> */ } - return 0; + return; /* End of SORHR_COL */ diff --git a/lapack-netlib/SRC/sorm22.c b/lapack-netlib/SRC/sorm22.c index 2cbc237afb..5f1d05cda1 100644 --- a/lapack-netlib/SRC/sorm22.c +++ b/lapack-netlib/SRC/sorm22.c @@ -675,7 +675,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* 
===================================================================== */ -/* Subroutine */ int sorm22_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void sorm22_(char *side, char *trans, integer *m, integer *n, integer *n1, integer *n2, real *q, integer *ldq, real *c__, integer * ldc, real *work, integer *lwork, integer *info) { @@ -686,13 +686,14 @@ f"> */ logical left; integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), strmm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer *); integer nb, nq, nw; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slacpy_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slacpy_( char *, integer *, integer *, real *, integer *, real *, integer * ); logical notran; @@ -770,16 +771,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORM22", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { work[1] = 1.f; - return 0; + return; } /* Degenerate cases (N1 = 0 or N2 = 0) are handled using STRMM. */ @@ -788,12 +789,12 @@ f"> */ strmm_(side, "Upper", trans, "Non-Unit", m, n, &c_b10, &q[q_offset], ldq, &c__[c_offset], ldc); work[1] = 1.f; - return 0; + return; } else if (*n2 == 0) { strmm_(side, "Lower", trans, "Non-Unit", m, n, &c_b10, &q[q_offset], ldq, &c__[c_offset], ldc); work[1] = 1.f; - return 0; + return; } /* Compute the largest chunk size available from the workspace. 
*/ @@ -978,7 +979,7 @@ f"> */ } work[1] = (real) lwkopt; - return 0; + return; /* End of SORM22 */ diff --git a/lapack-netlib/SRC/sorm2l.c b/lapack-netlib/SRC/sorm2l.c index b0ded3367b..97b6827082 100644 --- a/lapack-netlib/SRC/sorm2l.c +++ b/lapack-netlib/SRC/sorm2l.c @@ -672,7 +672,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sorm2l_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void sorm2l_(char *side, char *trans, integer *m, integer *n, integer *k, real *a, integer *lda, real *tau, real *c__, integer *ldc, real *work, integer *info) { @@ -683,7 +683,7 @@ f"> */ logical left; integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slarf_(char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *); integer i1, i2, i3, mi, ni, nq; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -742,13 +742,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORM2L", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && notran || ! left && ! 
notran) { @@ -791,7 +791,7 @@ f"> */ a[nq - *k + i__ + i__ * a_dim1] = aii; /* L10: */ } - return 0; + return; /* End of SORM2L */ diff --git a/lapack-netlib/SRC/sorm2r.c b/lapack-netlib/SRC/sorm2r.c index 98136ee80f..4d8e075bf8 100644 --- a/lapack-netlib/SRC/sorm2r.c +++ b/lapack-netlib/SRC/sorm2r.c @@ -672,7 +672,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sorm2r_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void sorm2r_(char *side, char *trans, integer *m, integer *n, integer *k, real *a, integer *lda, real *tau, real *c__, integer *ldc, real *work, integer *info) { @@ -683,7 +683,7 @@ f"> */ logical left; integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slarf_(char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *); integer i1, i2, i3, ic, jc, mi, ni, nq; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -742,13 +742,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORM2R", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && ! notran || ! 
left && notran) { @@ -795,7 +795,7 @@ f"> */ a[i__ + i__ * a_dim1] = aii; /* L10: */ } - return 0; + return; /* End of SORM2R */ diff --git a/lapack-netlib/SRC/sormbr.c b/lapack-netlib/SRC/sormbr.c index fc324c883c..701bf10d7f 100644 --- a/lapack-netlib/SRC/sormbr.c +++ b/lapack-netlib/SRC/sormbr.c @@ -710,7 +710,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sormbr_(char *vect, char *side, char *trans, integer *m, +/* Subroutine */ void sormbr_(char *vect, char *side, char *trans, integer *m, integer *n, integer *k, real *a, integer *lda, real *tau, real *c__, integer *ldc, real *work, integer *lwork, integer *info) { @@ -728,12 +728,12 @@ f"> */ integer *, integer *, ftnlen, ftnlen); logical notran, applyq; char transt[1]; - extern /* Subroutine */ int sormlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormlq_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); @@ -848,16 +848,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORMBR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ work[1] = 1.f; if (*m == 0 || *n == 0) { - return 0; + return; } if (applyq) { @@ -926,7 +926,7 @@ f"> */ } } work[1] = (real) lwkopt; - return 0; + return; /* End of SORMBR */ diff --git a/lapack-netlib/SRC/sormhr.c b/lapack-netlib/SRC/sormhr.c index 6893d310be..c9eb974440 100644 --- a/lapack-netlib/SRC/sormhr.c +++ b/lapack-netlib/SRC/sormhr.c @@ -693,7 +693,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* 
===================================================================== */ -/* Subroutine */ int sormhr_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void sormhr_(char *side, char *trans, integer *m, integer *n, integer *ilo, integer *ihi, real *a, integer *lda, real *tau, real * c__, integer *ldc, real *work, integer *lwork, integer *info) { @@ -711,7 +711,7 @@ f"> */ integer *, integer *, ftnlen, ftnlen); integer lwkopt; logical lquery; - extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); @@ -796,16 +796,16 @@ f"> */ if (*info != 0) { i__2 = -(*info); xerbla_("SORMHR", &i__2, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || nh == 0) { work[1] = 1.f; - return 0; + return; } if (left) { @@ -824,7 +824,7 @@ f"> */ tau[*ilo], &c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &iinfo); work[1] = (real) lwkopt; - return 0; + return; /* End of SORMHR */ diff --git a/lapack-netlib/SRC/sorml2.c b/lapack-netlib/SRC/sorml2.c index 47b6ce8402..122fe4ed66 100644 --- a/lapack-netlib/SRC/sorml2.c +++ b/lapack-netlib/SRC/sorml2.c @@ -668,7 +668,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sorml2_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void sorml2_(char *side, char *trans, integer *m, integer *n, integer *k, real *a, integer *lda, real *tau, real *c__, integer *ldc, real *work, integer *info) { @@ -679,7 +679,7 @@ f"> */ logical left; integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slarf_(char *, integer *, integer *, real *, integer *, real *, real *, 
integer *, real *); integer i1, i2, i3, ic, jc, mi, ni, nq; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -738,13 +738,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORML2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && notran || ! left && ! notran) { @@ -791,7 +791,7 @@ f"> */ a[i__ + i__ * a_dim1] = aii; /* L10: */ } - return 0; + return; /* End of SORML2 */ diff --git a/lapack-netlib/SRC/sormlq.c b/lapack-netlib/SRC/sormlq.c index ea469602a3..5b37bed560 100644 --- a/lapack-netlib/SRC/sormlq.c +++ b/lapack-netlib/SRC/sormlq.c @@ -683,7 +683,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sormlq_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void sormlq_(char *side, char *trans, integer *m, integer *n, integer *k, real *a, integer *lda, real *tau, real *c__, integer *ldc, real *work, integer *lwork, integer *info) { @@ -698,16 +698,17 @@ f"> */ integer i__; extern logical lsame_(char *, char *); integer nbmin, iinfo, i1, i2, i3, ib, ic, jc; - extern /* Subroutine */ int sorml2_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sorml2_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *); integer nb, mi, ni, nq, nw; - extern /* Subroutine */ int slarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void slarfb_(char *, char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, - integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ 
int slarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void slarft_(char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *); logical notran; integer ldwork; @@ -790,16 +791,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORMLQ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { work[1] = 1.f; - return 0; + return; } nbmin = 2; @@ -889,7 +890,7 @@ f"> */ } } work[1] = (real) lwkopt; - return 0; + return; /* End of SORMLQ */ diff --git a/lapack-netlib/SRC/sormql.c b/lapack-netlib/SRC/sormql.c index 3932e56c93..e729d49cd4 100644 --- a/lapack-netlib/SRC/sormql.c +++ b/lapack-netlib/SRC/sormql.c @@ -683,7 +683,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sormql_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void sormql_(char *side, char *trans, integer *m, integer *n, integer *k, real *a, integer *lda, real *tau, real *c__, integer *ldc, real *work, integer *lwork, integer *info) { @@ -698,16 +698,17 @@ f"> */ integer i__; extern logical lsame_(char *, char *); integer nbmin, iinfo, i1, i2, i3, ib; - extern /* Subroutine */ int sorm2l_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sorm2l_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *); integer nb, mi, ni, nq, nw; - extern /* Subroutine */ int slarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void slarfb_(char *, char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, - integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, 
integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void slarft_(char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *); logical notran; integer ldwork, lwkopt; @@ -792,15 +793,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORMQL", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } nbmin = 2; @@ -880,7 +881,7 @@ f"> */ } } work[1] = (real) lwkopt; - return 0; + return; /* End of SORMQL */ diff --git a/lapack-netlib/SRC/sormqr.c b/lapack-netlib/SRC/sormqr.c index 1531b31864..b621e1c7ab 100644 --- a/lapack-netlib/SRC/sormqr.c +++ b/lapack-netlib/SRC/sormqr.c @@ -683,7 +683,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sormqr_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void sormqr_(char *side, char *trans, integer *m, integer *n, integer *k, real *a, integer *lda, real *tau, real *c__, integer *ldc, real *work, integer *lwork, integer *info) { @@ -698,16 +698,17 @@ f"> */ integer i__; extern logical lsame_(char *, char *); integer nbmin, iinfo, i1, i2, i3, ib, ic, jc, nb; - extern /* Subroutine */ int sorm2r_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sorm2r_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *); integer mi, ni, nq, nw; - extern /* Subroutine */ int slarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void slarfb_(char *, char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, - integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, real *, integer *, real *, integer *); + extern int xerbla_(char 
*, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void slarft_(char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *); logical notran; integer ldwork, lwkopt; @@ -788,16 +789,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORMQR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { work[1] = 1.f; - return 0; + return; } nbmin = 2; @@ -881,7 +882,7 @@ f"> */ } } work[1] = (real) lwkopt; - return 0; + return; /* End of SORMQR */ diff --git a/lapack-netlib/SRC/sormr2.c b/lapack-netlib/SRC/sormr2.c index 7cd260bd62..2b6d81978a 100644 --- a/lapack-netlib/SRC/sormr2.c +++ b/lapack-netlib/SRC/sormr2.c @@ -668,7 +668,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sormr2_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void sormr2_(char *side, char *trans, integer *m, integer *n, integer *k, real *a, integer *lda, real *tau, real *c__, integer *ldc, real *work, integer *info) { @@ -679,7 +679,7 @@ f"> */ logical left; integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slarf_(char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *); integer i1, i2, i3, mi, ni, nq; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -738,13 +738,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORMR2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && ! notran || ! 
left && notran) { @@ -787,7 +787,7 @@ f"> */ a[i__ + (nq - *k + i__) * a_dim1] = aii; /* L10: */ } - return 0; + return; /* End of SORMR2 */ diff --git a/lapack-netlib/SRC/sormr3.c b/lapack-netlib/SRC/sormr3.c index 93b6b758ff..8c806a145b 100644 --- a/lapack-netlib/SRC/sormr3.c +++ b/lapack-netlib/SRC/sormr3.c @@ -687,7 +687,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sormr3_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void sormr3_(char *side, char *trans, integer *m, integer *n, integer *k, integer *l, real *a, integer *lda, real *tau, real *c__, integer *ldc, real *work, integer *info) { @@ -699,7 +699,7 @@ f"> */ integer i__; extern logical lsame_(char *, char *); integer i1, i2, i3; - extern /* Subroutine */ int slarz_(char *, integer *, integer *, integer * + extern /* Subroutine */ void slarz_(char *, integer *, integer *, integer * , real *, integer *, real *, real *, integer *, real *); integer ja, ic, jc, mi, ni, nq; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -759,13 +759,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORMR3", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && ! notran || ! 
left && notran) { @@ -813,7 +813,7 @@ f"> */ /* L10: */ } - return 0; + return; /* End of SORMR3 */ diff --git a/lapack-netlib/SRC/sormrq.c b/lapack-netlib/SRC/sormrq.c index 2c2d7084e8..a516c57ea5 100644 --- a/lapack-netlib/SRC/sormrq.c +++ b/lapack-netlib/SRC/sormrq.c @@ -683,7 +683,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sormrq_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void sormrq_(char *side, char *trans, integer *m, integer *n, integer *k, real *a, integer *lda, real *tau, real *c__, integer *ldc, real *work, integer *lwork, integer *info) { @@ -698,16 +698,17 @@ f"> */ integer i__; extern logical lsame_(char *, char *); integer nbmin, iinfo, i1, i2, i3, ib, nb; - extern /* Subroutine */ int sormr2_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormr2_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *); integer mi, ni, nq, nw; - extern /* Subroutine */ int slarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void slarfb_(char *, char *, char *, char *, integer *, integer *, integer *, real *, integer *, real *, - integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void slarft_(char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *); logical notran; integer ldwork; @@ -794,15 +795,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORMRQ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ 
if (*m == 0 || *n == 0) { - return 0; + return; } nbmin = 2; @@ -888,7 +889,7 @@ f"> */ } } work[1] = (real) lwkopt; - return 0; + return; /* End of SORMRQ */ diff --git a/lapack-netlib/SRC/sormrz.c b/lapack-netlib/SRC/sormrz.c index af254f9af1..e5614ac38c 100644 --- a/lapack-netlib/SRC/sormrz.c +++ b/lapack-netlib/SRC/sormrz.c @@ -702,7 +702,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sormrz_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void sormrz_(char *side, char *trans, integer *m, integer *n, integer *k, integer *l, real *a, integer *lda, real *tau, real *c__, integer *ldc, real *work, integer *lwork, integer *info) { @@ -717,20 +717,20 @@ f"> */ integer i__; extern logical lsame_(char *, char *); integer nbmin, iinfo, i1, i2, i3, ib, ic, ja, jc, nb; - extern /* Subroutine */ int sormr3_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormr3_(char *, char *, integer *, integer *, integer *, integer *, real *, integer *, real *, real *, integer * , real *, integer *); integer mi, ni, nq, nw; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slarzb_(char *, char *, char *, char *, + extern /* Subroutine */ void slarzb_(char *, char *, char *, char *, integer *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *); logical notran; integer ldwork; char transt[1]; - extern /* Subroutine */ int slarzt_(char *, char *, integer *, integer *, + extern /* Subroutine */ void slarzt_(char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *); integer lwkopt; logical lquery; @@ -816,15 +816,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SORMRZ", &i__1, (ftnlen)6); - return 0; + return; } else if 
(lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } nbmin = 2; @@ -919,7 +919,7 @@ f"> */ work[1] = (real) lwkopt; - return 0; + return; /* End of SORMRZ */ diff --git a/lapack-netlib/SRC/sormtr.c b/lapack-netlib/SRC/sormtr.c index 1ae186e518..d1a19ffa25 100644 --- a/lapack-netlib/SRC/sormtr.c +++ b/lapack-netlib/SRC/sormtr.c @@ -686,7 +686,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sormtr_(char *side, char *uplo, char *trans, integer *m, +/* Subroutine */ void sormtr_(char *side, char *uplo, char *trans, integer *m, integer *n, real *a, integer *lda, real *tau, real *c__, integer *ldc, real *work, integer *lwork, integer *info) { @@ -704,12 +704,12 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int sormql_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormql_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); @@ -818,16 +818,16 @@ f"> */ if (*info != 0) { i__2 = -(*info); xerbla_("SORMTR", &i__2, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || nq == 1) { work[1] = 1.f; - return 0; + return; } if (left) { @@ -861,7 +861,7 @@ f"> */ c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &iinfo); } work[1] = (real) lwkopt; - return 0; + return; /* End of SORMTR */ diff --git 
a/lapack-netlib/SRC/spbcon.c b/lapack-netlib/SRC/spbcon.c index df9f526931..3725448a33 100644 --- a/lapack-netlib/SRC/spbcon.c +++ b/lapack-netlib/SRC/spbcon.c @@ -644,7 +644,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int spbcon_(char *uplo, integer *n, integer *kd, real *ab, +/* Subroutine */ void spbcon_(char *uplo, integer *n, integer *kd, real *ab, integer *ldab, real *anorm, real *rcond, real *work, integer *iwork, integer *info) { @@ -657,9 +657,9 @@ f"> */ real scale; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int srscl_(integer *, real *, real *, integer *); + extern /* Subroutine */ void srscl_(integer *, real *, real *, integer *); logical upper; - extern /* Subroutine */ int slacn2_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slacn2_(integer *, real *, real *, integer *, real *, integer *, integer *); integer ix; real scalel; @@ -668,7 +668,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer isamax_(integer *, real *, integer *); real ainvnm; - extern /* Subroutine */ int slatbs_(char *, char *, char *, char *, + extern /* Subroutine */ void slatbs_(char *, char *, char *, char *, integer *, integer *, real *, integer *, real *, real *, real *, integer *); char normin[1]; @@ -710,7 +710,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPBCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -718,9 +718,9 @@ f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm == 0.f) { - return 0; + return; } smlnum = slamch_("Safe minimum"); @@ -784,7 +784,7 @@ f"> */ L20: - return 0; + return; /* End of SPBCON */ diff --git a/lapack-netlib/SRC/spbequ.c b/lapack-netlib/SRC/spbequ.c index 9bd16a4848..80458825e0 100644 --- a/lapack-netlib/SRC/spbequ.c +++ b/lapack-netlib/SRC/spbequ.c @@ -638,7 
+638,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int spbequ_(char *uplo, integer *n, integer *kd, real *ab, +/* Subroutine */ void spbequ_(char *uplo, integer *n, integer *kd, real *ab, integer *ldab, real *s, real *scond, real *amax, integer *info) { /* System generated locals */ @@ -685,7 +685,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPBEQU", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -693,7 +693,7 @@ f"> */ if (*n == 0) { *scond = 1.f; *amax = 0.f; - return 0; + return; } if (upper) { @@ -730,7 +730,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { if (s[i__] <= 0.f) { *info = i__; - return 0; + return; } /* L20: */ } @@ -749,7 +749,7 @@ f"> */ *scond = sqrt(smin) / sqrt(*amax); } - return 0; + return; /* End of SPBEQU */ diff --git a/lapack-netlib/SRC/spbrfs.c b/lapack-netlib/SRC/spbrfs.c index b878f2fa61..fce4df080b 100644 --- a/lapack-netlib/SRC/spbrfs.c +++ b/lapack-netlib/SRC/spbrfs.c @@ -703,7 +703,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int spbrfs_(char *uplo, integer *n, integer *kd, integer * +/* Subroutine */ void spbrfs_(char *uplo, integer *n, integer *kd, integer * nrhs, real *ab, integer *ldab, real *afb, integer *ldafb, real *b, integer *ldb, real *x, integer *ldx, real *ferr, real *berr, real * work, integer *iwork, integer *info) @@ -720,10 +720,10 @@ f"> */ real s; extern logical lsame_(char *, char *); integer isave[3], count; - extern /* Subroutine */ int ssbmv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void ssbmv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); logical upper; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer 
*, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *), slacn2_(integer *, real *, real *, integer *, real *, integer *, integer *); @@ -733,7 +733,7 @@ f"> */ real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real lstres; - extern /* Subroutine */ int spbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void spbtrs_(char *, integer *, integer *, integer *, real *, integer *, real *, integer *, integer *); real eps; @@ -790,7 +790,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPBRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -802,7 +802,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -1009,7 +1009,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of SPBRFS */ diff --git a/lapack-netlib/SRC/spbstf.c b/lapack-netlib/SRC/spbstf.c index 089c688140..f4636978b4 100644 --- a/lapack-netlib/SRC/spbstf.c +++ b/lapack-netlib/SRC/spbstf.c @@ -666,7 +666,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int spbstf_(char *uplo, integer *n, integer *kd, real *ab, +/* Subroutine */ void spbstf_(char *uplo, integer *n, integer *kd, real *ab, integer *ldab, integer *info) { /* System generated locals */ @@ -674,11 +674,11 @@ f"> */ real r__1; /* Local variables */ - extern /* Subroutine */ int ssyr_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssyr_(char *, integer *, real *, real *, integer *, real *, integer *); integer j, m; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); logical upper; integer km; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -717,13 +717,13 @@ f"> */ if (*info != 0) { i__1 = 
-(*info); xerbla_("SPBSTF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Computing MAX */ @@ -849,11 +849,11 @@ f"> */ /* L40: */ } } - return 0; + return; L50: *info = j; - return 0; + return; /* End of SPBSTF */ diff --git a/lapack-netlib/SRC/spbsv.c b/lapack-netlib/SRC/spbsv.c index 88dba3b752..1d76f9fe0d 100644 --- a/lapack-netlib/SRC/spbsv.c +++ b/lapack-netlib/SRC/spbsv.c @@ -673,7 +673,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int spbsv_(char *uplo, integer *n, integer *kd, integer * +/* Subroutine */ void spbsv_(char *uplo, integer *n, integer *kd, integer * nrhs, real *ab, integer *ldab, real *b, integer *ldb, integer *info) { /* System generated locals */ @@ -681,7 +681,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), spbtrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void spbtrf_( char *, integer *, integer *, real *, integer *, integer *), spbtrs_(char *, integer *, integer *, integer *, real *, integer *, real *, integer *, integer *); @@ -723,7 +724,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("SPBSV ", &i__1, (ftnlen)5); - return 0; + return; } /* Compute the Cholesky factorization A = U**T*U or A = L*L**T. 
*/ @@ -737,7 +738,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ info); } - return 0; + return; /* End of SPBSV */ diff --git a/lapack-netlib/SRC/spbsvx.c b/lapack-netlib/SRC/spbsvx.c index 179565cac1..1574cd6ba3 100644 --- a/lapack-netlib/SRC/spbsvx.c +++ b/lapack-netlib/SRC/spbsvx.c @@ -854,7 +854,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int spbsvx_(char *fact, char *uplo, integer *n, integer *kd, +/* Subroutine */ void spbsvx_(char *fact, char *uplo, integer *n, integer *kd, integer *nrhs, real *ab, integer *ldab, real *afb, integer *ldafb, char *equed, real *s, real *b, integer *ldb, real *x, integer *ldx, real *rcond, real *ferr, real *berr, real *work, integer *iwork, @@ -872,7 +872,7 @@ f"> */ real scond, anorm; logical equil, rcequ, upper; integer j1, j2; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); extern real slamch_(char *); logical nofact; @@ -880,19 +880,19 @@ f"> */ real bignum; extern real slansb_(char *, char *, integer *, integer *, real *, integer *, real *); - extern /* Subroutine */ int spbcon_(char *, integer *, integer *, real *, + extern /* Subroutine */ void spbcon_(char *, integer *, integer *, real *, integer *, real *, real *, real *, integer *, integer *), slaqsb_(char *, integer *, integer *, real *, integer *, real *, real *, real *, char *); integer infequ; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), spbequ_(char *, integer *, integer *, real *, integer *, real *, real *, real *, integer *), spbrfs_(char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer * , real *, real *, real *, integer *, 
integer *), spbtrf_( char *, integer *, integer *, real *, integer *, integer *); real smlnum; - extern /* Subroutine */ int spbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void spbtrs_(char *, integer *, integer *, integer *, real *, integer *, real *, integer *, integer *); @@ -991,7 +991,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPBSVX", &i__1, (ftnlen)6); - return 0; + return; } if (equil) { @@ -1058,7 +1058,7 @@ f"> */ if (*info > 0) { *rcond = 0.f; - return 0; + return; } } @@ -1110,7 +1110,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of SPBSVX */ diff --git a/lapack-netlib/SRC/spbtf2.c b/lapack-netlib/SRC/spbtf2.c index 778eec4d21..011fd91d0f 100644 --- a/lapack-netlib/SRC/spbtf2.c +++ b/lapack-netlib/SRC/spbtf2.c @@ -657,7 +657,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int spbtf2_(char *uplo, integer *n, integer *kd, real *ab, +/* Subroutine */ void spbtf2_(char *uplo, integer *n, integer *kd, real *ab, integer *ldab, integer *info) { /* System generated locals */ @@ -665,11 +665,11 @@ f"> */ real r__1; /* Local variables */ - extern /* Subroutine */ int ssyr_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssyr_(char *, integer *, real *, real *, integer *, real *, integer *); integer j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); logical upper; integer kn; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -708,13 +708,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPBTF2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Computing MAX */ @@ -782,11 +782,11 @@ f"> */ /* L20: */ } } - return 0; + return; L30: *info = j; - return 0; + return; /* End of SPBTF2 
*/ diff --git a/lapack-netlib/SRC/spbtrf.c b/lapack-netlib/SRC/spbtrf.c index b6ee3b3459..cd4b615228 100644 --- a/lapack-netlib/SRC/spbtrf.c +++ b/lapack-netlib/SRC/spbtrf.c @@ -659,7 +659,7 @@ f"> */ /* > Peter Mayes and Giuseppe Radicati, IBM ECSEC, Rome, March 23, 1989 */ /* ===================================================================== */ -/* Subroutine */ int spbtrf_(char *uplo, integer *n, integer *kd, real *ab, +/* Subroutine */ void spbtrf_(char *uplo, integer *n, integer *kd, real *ab, integer *ldab, integer *info) { /* System generated locals */ @@ -669,11 +669,11 @@ f"> */ real work[1056] /* was [33][32] */; integer i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer i2, i3; - extern /* Subroutine */ int strsm_(char *, char *, char *, char *, + extern /* Subroutine */ void strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ), ssyrk_(char *, char *, integer *, integer *, real *, real *, integer *, real *, real *, integer * @@ -718,13 +718,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPBTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine the block size for this environment */ @@ -1016,10 +1016,10 @@ f"> */ } } } - return 0; + return; L150: - return 0; + return; /* End of SPBTRF */ diff --git a/lapack-netlib/SRC/spbtrs.c b/lapack-netlib/SRC/spbtrs.c index dc6e51f990..1dda793147 100644 --- a/lapack-netlib/SRC/spbtrs.c +++ b/lapack-netlib/SRC/spbtrs.c @@ -634,7 +634,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int spbtrs_(char *uplo, integer *n, integer *kd, integer * +/* 
Subroutine */ void spbtrs_(char *uplo, integer *n, integer *kd, integer * nrhs, real *ab, integer *ldab, real *b, integer *ldb, integer *info) { /* System generated locals */ @@ -644,8 +644,9 @@ f"> */ integer j; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int stbsv_(char *, char *, char *, integer *, - integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void stbsv_(char *, char *, char *, integer *, + integer *, real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -686,13 +687,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPBTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -733,7 +734,7 @@ f"> */ } } - return 0; + return; /* End of SPBTRS */ diff --git a/lapack-netlib/SRC/spftrf.c b/lapack-netlib/SRC/spftrf.c index c82254f073..464d480833 100644 --- a/lapack-netlib/SRC/spftrf.c +++ b/lapack-netlib/SRC/spftrf.c @@ -713,7 +713,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int spftrf_(char *transr, char *uplo, integer *n, real *a, +/* Subroutine */ void spftrf_(char *transr, char *uplo, integer *n, real *a, integer *info) { /* System generated locals */ @@ -725,11 +725,12 @@ f"> */ extern logical lsame_(char *, char *); logical lower; integer n1, n2; - extern /* Subroutine */ int strsm_(char *, char *, char *, char *, + extern /* Subroutine */ void strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ), ssyrk_(char *, char *, integer *, integer *, real *, real *, integer *, real *, real *, integer * - ), xerbla_(char *, integer *, ftnlen); + ); + extern int xerbla_(char *, integer *, ftnlen); logical nisodd; extern /* Subroutine */ 
int spotrf_(char *, integer *, real *, integer *, integer *); @@ -759,13 +760,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPFTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* If N is odd, set NISODD = .TRUE. */ @@ -806,7 +807,7 @@ f"> */ spotrf_("L", &n1, a, n, info); if (*info > 0) { - return 0; + return; } strsm_("R", "L", "T", "N", &n2, &n1, &c_b12, a, n, &a[n1], n); ssyrk_("U", "N", &n2, &n1, &c_b15, &a[n1], n, &c_b12, &a[*n], @@ -824,7 +825,7 @@ f"> */ spotrf_("L", &n1, &a[n2], n, info); if (*info > 0) { - return 0; + return; } strsm_("L", "L", "N", "N", &n1, &n2, &c_b12, &a[n2], n, a, n); ssyrk_("U", "T", &n2, &n1, &c_b15, a, n, &c_b12, &a[n1], n); @@ -847,7 +848,7 @@ f"> */ spotrf_("U", &n1, a, &n1, info); if (*info > 0) { - return 0; + return; } strsm_("L", "U", "T", "N", &n1, &n2, &c_b12, a, &n1, &a[n1 * n1], &n1); @@ -866,7 +867,7 @@ f"> */ spotrf_("U", &n1, &a[n2 * n2], &n2, info); if (*info > 0) { - return 0; + return; } strsm_("R", "U", "N", "N", &n2, &n1, &c_b12, &a[n2 * n2], &n2, a, &n2); @@ -898,7 +899,7 @@ f"> */ i__1 = *n + 1; spotrf_("L", &k, &a[1], &i__1, info); if (*info > 0) { - return 0; + return; } i__1 = *n + 1; i__2 = *n + 1; @@ -923,7 +924,7 @@ f"> */ i__1 = *n + 1; spotrf_("L", &k, &a[k + 1], &i__1, info); if (*info > 0) { - return 0; + return; } i__1 = *n + 1; i__2 = *n + 1; @@ -953,7 +954,7 @@ f"> */ spotrf_("U", &k, &a[k], &k, info); if (*info > 0) { - return 0; + return; } strsm_("L", "U", "T", "N", &k, &k, &c_b12, &a[k], &n1, &a[k * (k + 1)], &k); @@ -972,7 +973,7 @@ f"> */ spotrf_("U", &k, &a[k * (k + 1)], &k, info); if (*info > 0) { - return 0; + return; } strsm_("R", "U", "N", "N", &k, &k, &c_b12, &a[k * (k + 1)], & k, a, &k); @@ -988,7 +989,7 @@ f"> */ } - return 0; + return; /* End of SPFTRF */ diff --git a/lapack-netlib/SRC/spftri.c b/lapack-netlib/SRC/spftri.c index daf9f28349..44a295333d 100644 --- a/lapack-netlib/SRC/spftri.c +++ 
b/lapack-netlib/SRC/spftri.c @@ -705,7 +705,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int spftri_(char *transr, char *uplo, integer *n, real *a, +/* Subroutine */ void spftri_(char *transr, char *uplo, integer *n, real *a, integer *info) { /* System generated locals */ @@ -717,14 +717,16 @@ f"> */ extern logical lsame_(char *, char *); logical lower; integer n1, n2; - extern /* Subroutine */ int strmm_(char *, char *, char *, char *, + extern /* Subroutine */ void strmm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ), ssyrk_(char *, char *, integer *, integer *, real *, real *, integer *, real *, real *, integer * - ), xerbla_(char *, integer *, ftnlen); + ); + extern int xerbla_(char *, integer *, ftnlen); logical nisodd; extern /* Subroutine */ int slauum_(char *, integer *, real *, integer *, - integer *), stftri_(char *, char *, char *, integer *, + integer *); + extern void stftri_(char *, char *, char *, integer *, real *, integer *); @@ -752,20 +754,20 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPFTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Invert the triangular Cholesky factor U or L. */ stftri_(transr, uplo, "N", n, a, info); if (*info > 0) { - return 0; + return; } /* If N is odd, set NISODD = .TRUE. 
*/ @@ -940,7 +942,7 @@ f"> */ } - return 0; + return; /* End of SPFTRI */ diff --git a/lapack-netlib/SRC/spftrs.c b/lapack-netlib/SRC/spftrs.c index 47b690b056..b7cbf6c325 100644 --- a/lapack-netlib/SRC/spftrs.c +++ b/lapack-netlib/SRC/spftrs.c @@ -712,7 +712,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int spftrs_(char *transr, char *uplo, integer *n, integer * +/* Subroutine */ void spftrs_(char *transr, char *uplo, integer *n, integer * nrhs, real *a, real *b, integer *ldb, integer *info) { /* System generated locals */ @@ -722,8 +722,9 @@ f"> */ logical normaltransr; extern logical lsame_(char *, char *); logical lower; - extern /* Subroutine */ int stfsm_(char *, char *, char *, char *, char *, - integer *, integer *, real *, real *, real *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void stfsm_(char *, char *, char *, char *, char *, + integer *, integer *, real *, real *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -760,13 +761,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPFTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* start execution: there are two triangular solves */ @@ -783,7 +784,7 @@ f"> */ ldb); } - return 0; + return; /* End of SPFTRS */ diff --git a/lapack-netlib/SRC/spocon.c b/lapack-netlib/SRC/spocon.c index aa58125817..4f4fc12cdd 100644 --- a/lapack-netlib/SRC/spocon.c +++ b/lapack-netlib/SRC/spocon.c @@ -633,7 +633,7 @@ f"> */ /* > \ingroup realPOcomputational */ /* ===================================================================== */ -/* Subroutine */ int spocon_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void spocon_(char *uplo, integer *n, real *a, integer *lda, real *anorm, real *rcond, real *work, integer *iwork, 
integer *info) { /* System generated locals */ @@ -645,9 +645,9 @@ f"> */ real scale; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int srscl_(integer *, real *, real *, integer *); + extern /* Subroutine */ void srscl_(integer *, real *, real *, integer *); logical upper; - extern /* Subroutine */ int slacn2_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slacn2_(integer *, real *, real *, integer *, real *, integer *, integer *); integer ix; real scalel; @@ -657,7 +657,7 @@ f"> */ extern integer isamax_(integer *, real *, integer *); real ainvnm; char normin[1]; - extern /* Subroutine */ int slatrs_(char *, char *, char *, char *, + extern /* Subroutine */ void slatrs_(char *, char *, char *, char *, integer *, real *, integer *, real *, real *, real *, integer *); real smlnum; @@ -695,7 +695,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPOCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -703,9 +703,9 @@ f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm == 0.f) { - return 0; + return; } smlnum = slamch_("Safe minimum"); @@ -766,7 +766,7 @@ f"> */ } L20: - return 0; + return; /* End of SPOCON */ diff --git a/lapack-netlib/SRC/spoequ.c b/lapack-netlib/SRC/spoequ.c index 7c93dacf03..bb80ce8485 100644 --- a/lapack-netlib/SRC/spoequ.c +++ b/lapack-netlib/SRC/spoequ.c @@ -621,7 +621,7 @@ f"> */ /* > \ingroup realPOcomputational */ /* ===================================================================== */ -/* Subroutine */ int spoequ_(integer *n, real *a, integer *lda, real *s, real +/* Subroutine */ void spoequ_(integer *n, real *a, integer *lda, real *s, real *scond, real *amax, integer *info) { /* System generated locals */ @@ -661,7 +661,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPOEQU", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -669,7 +669,7 @@ f"> */ if (*n == 0) { *scond = 
1.f; *amax = 0.f; - return 0; + return; } /* Find the minimum and maximum diagonal elements. */ @@ -697,7 +697,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { if (s[i__] <= 0.f) { *info = i__; - return 0; + return; } /* L20: */ } @@ -716,7 +716,7 @@ f"> */ *scond = sqrt(smin) / sqrt(*amax); } - return 0; + return; /* End of SPOEQU */ diff --git a/lapack-netlib/SRC/spoequb.c b/lapack-netlib/SRC/spoequb.c index aabb0e938e..3e616fa4a6 100644 --- a/lapack-netlib/SRC/spoequb.c +++ b/lapack-netlib/SRC/spoequb.c @@ -627,7 +627,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup realPOcomputational */ /* ===================================================================== */ -/* Subroutine */ int spoequb_(integer *n, real *a, integer *lda, real *s, +/* Subroutine */ void spoequb_(integer *n, real *a, integer *lda, real *s, real *scond, real *amax, integer *info) { /* System generated locals */ @@ -671,7 +671,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("SPOEQUB", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible. 
*/ @@ -679,7 +679,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*n == 0) { *scond = 1.f; *amax = 0.f; - return 0; + return; } base = slamch_("B"); tmp = -.5f / log(base); @@ -709,7 +709,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (i__ = 1; i__ <= i__1; ++i__) { if (s[i__] <= 0.f) { *info = i__; - return 0; + return; } /* L20: */ } @@ -730,7 +730,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *scond = sqrt(smin) / sqrt(*amax); } - return 0; + return; /* End of SPOEQUB */ diff --git a/lapack-netlib/SRC/sporfs.c b/lapack-netlib/SRC/sporfs.c index 76b7697dc8..6719762914 100644 --- a/lapack-netlib/SRC/sporfs.c +++ b/lapack-netlib/SRC/sporfs.c @@ -697,7 +697,7 @@ f"> */ /* > \ingroup realPOcomputational */ /* ===================================================================== */ -/* Subroutine */ int sporfs_(char *uplo, integer *n, integer *nrhs, real *a, +/* Subroutine */ void sporfs_(char *uplo, integer *n, integer *nrhs, real *a, integer *lda, real *af, integer *ldaf, real *b, integer *ldb, real *x, integer *ldx, real *ferr, real *berr, real *work, integer *iwork, integer *info) @@ -715,7 +715,7 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3], count; logical upper; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *), ssymv_(char *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), slacn2_( @@ -727,7 +727,7 @@ f"> */ real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real lstres; - extern /* Subroutine */ int spotrs_(char *, integer *, integer *, real *, + extern /* Subroutine */ void spotrs_(char *, integer *, integer *, real *, integer *, real *, integer *, integer *); real eps; @@ -782,7 
+782,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPORFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -794,7 +794,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -991,7 +991,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of SPORFS */ diff --git a/lapack-netlib/SRC/sporfsx.c b/lapack-netlib/SRC/sporfsx.c index 95dc57e845..adc3a90c59 100644 --- a/lapack-netlib/SRC/sporfsx.c +++ b/lapack-netlib/SRC/sporfsx.c @@ -803,7 +803,7 @@ static integer c__1 = 1; /* > \ingroup realPOcomputational */ /* ===================================================================== */ -/* Subroutine */ int sporfsx_(char *uplo, char *equed, integer *n, integer * +/* Subroutine */ void sporfsx_(char *uplo, char *equed, integer *n, integer * nrhs, real *a, integer *lda, real *af, integer *ldaf, real *s, real * b, integer *ldb, real *x, integer *ldx, real *rcond, real *berr, integer *n_err_bnds__, real *err_bnds_norm__, real *err_bnds_comp__, @@ -818,7 +818,7 @@ static integer c__1 = 1; /* Local variables */ real illrcond_thresh__, unstable_thresh__; - extern /* Subroutine */ int sla_porfsx_extended_(integer *, char *, + extern /* Subroutine */ void sla_porfsx_extended_(integer *, char *, integer *, integer *, real *, integer *, real *, integer *, logical *, real *, real *, integer *, real *, integer *, real *, integer *, real *, real *, real *, real *, real *, real *, real *, @@ -834,7 +834,8 @@ static integer c__1 = 1; real rcond_tmp__; integer prec_type__; extern real slamch_(char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), spocon_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void spocon_( char *, integer *, real *, integer *, real *, real *, real *, integer *, integer *); extern real slansy_(char *, char *, integer *, real *, integer *, real *); @@ -951,7 +952,7 @@ static integer c__1 = 1; if 
(*info != 0) { i__1 = -(*info); xerbla_("SPORFSX", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible. */ @@ -974,7 +975,7 @@ static integer c__1 = 1; err_bnds_comp__[j + err_bnds_comp_dim1 * 3] = 1.f; } } - return 0; + return; } /* Default to failure. */ @@ -1113,7 +1114,7 @@ static integer c__1 = 1; } } - return 0; + return; /* End of SPORFSX */ diff --git a/lapack-netlib/SRC/sposv.c b/lapack-netlib/SRC/sposv.c index b5f80d5a9c..5feebe7366 100644 --- a/lapack-netlib/SRC/sposv.c +++ b/lapack-netlib/SRC/sposv.c @@ -639,7 +639,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup realPOsolve */ /* ===================================================================== */ -/* Subroutine */ int sposv_(char *uplo, integer *n, integer *nrhs, real *a, +/* Subroutine */ void sposv_(char *uplo, integer *n, integer *nrhs, real *a, integer *lda, real *b, integer *ldb, integer *info) { /* System generated locals */ @@ -647,8 +647,10 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), spotrf_( - char *, integer *, real *, integer *, integer *), spotrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int spotrf_( + char *, integer *, real *, integer *, integer *); + extern void spotrs_( char *, integer *, integer *, real *, integer *, real *, integer * , integer *); @@ -688,7 +690,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("SPOSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the Cholesky factorization A = U**T*U or A = L*L**T. 
*/ @@ -701,7 +703,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ spotrs_(uplo, n, nrhs, &a[a_offset], lda, &b[b_offset], ldb, info); } - return 0; + return; /* End of SPOSV */ diff --git a/lapack-netlib/SRC/sposvx.c b/lapack-netlib/SRC/sposvx.c index 18ce544de5..5e845528bd 100644 --- a/lapack-netlib/SRC/sposvx.c +++ b/lapack-netlib/SRC/sposvx.c @@ -814,7 +814,7 @@ f"> */ /* > \ingroup realPOsolve */ /* ===================================================================== */ -/* Subroutine */ int sposvx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void sposvx_(char *fact, char *uplo, integer *n, integer * nrhs, real *a, integer *lda, real *af, integer *ldaf, char *equed, real *s, real *b, integer *ldb, real *x, integer *ldx, real *rcond, real *ferr, real *berr, real *work, integer *iwork, integer *info) @@ -835,18 +835,20 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; integer infequ; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), spocon_(char *, integer *, real *, integer *, real *, real *, real *, integer *, integer *); extern real slansy_(char *, char *, integer *, real *, integer *, real *); real smlnum; - extern /* Subroutine */ int slaqsy_(char *, integer *, real *, integer *, + extern /* Subroutine */ void slaqsy_(char *, integer *, real *, integer *, real *, real *, real *, char *), spoequ_(integer * , real *, integer *, real *, real *, real *, integer *), sporfs_( char *, integer *, integer *, real *, integer *, real *, integer * , real *, integer *, real *, integer *, real *, real *, real *, - integer *, integer *), spotrf_(char *, integer *, real *, - integer *, integer *), spotrs_(char *, integer *, integer + integer *, integer *); + extern int spotrf_(char *, integer *, real *, + integer *, integer *); + extern void 
spotrs_(char *, integer *, integer *, real *, integer *, real *, integer *, integer *); @@ -943,7 +945,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPOSVX", &i__1, (ftnlen)6); - return 0; + return; } if (equil) { @@ -985,7 +987,7 @@ f"> */ if (*info > 0) { *rcond = 0.f; - return 0; + return; } } @@ -1036,7 +1038,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of SPOSVX */ diff --git a/lapack-netlib/SRC/sposvxx.c b/lapack-netlib/SRC/sposvxx.c index 4955b41ade..cea6dd3072 100644 --- a/lapack-netlib/SRC/sposvxx.c +++ b/lapack-netlib/SRC/sposvxx.c @@ -1003,7 +1003,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup realPOsolve */ /* ===================================================================== */ -/* Subroutine */ int sposvxx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void sposvxx_(char *fact, char *uplo, integer *n, integer * nrhs, real *a, integer *lda, real *af, integer *ldaf, char *equed, real *s, real *b, integer *ldb, real *x, integer *ldx, real *rcond, real *rpvgrw, real *berr, integer *n_err_bnds__, real * @@ -1029,10 +1029,10 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; integer infequ; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); real smlnum; - extern /* Subroutine */ int slaqsy_(char *, integer *, real *, integer *, + extern /* Subroutine */ void slaqsy_(char *, integer *, real *, integer *, real *, real *, real *, char *), spotrf_(char *, integer *, real *, integer *, integer *), spotrs_(char *, integer *, integer *, real *, integer *, real *, integer *, @@ -1149,7 +1149,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("SPOSVXX", 
&i__1, (ftnlen)7); - return 0; + return; } if (equil) { @@ -1189,7 +1189,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rpvgrw = sla_porpvgrw_(uplo, info, &a[a_offset], lda, &af[ af_offset], ldaf, &work[1]); - return 0; + return; } } @@ -1218,7 +1218,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ slascl2_(n, nrhs, &s[1], &x[x_offset], ldx); } - return 0; + return; /* End of SPOSVXX */ diff --git a/lapack-netlib/SRC/spotf2.c b/lapack-netlib/SRC/spotf2.c index 420af52aac..f0f1f12f9a 100644 --- a/lapack-netlib/SRC/spotf2.c +++ b/lapack-netlib/SRC/spotf2.c @@ -625,7 +625,7 @@ f"> */ /* > \ingroup realPOcomputational */ /* ===================================================================== */ -/* Subroutine */ int spotf2_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void spotf2_(char *uplo, integer *n, real *a, integer *lda, integer *info) { /* System generated locals */ @@ -636,7 +636,7 @@ f"> */ extern real sdot_(integer *, real *, integer *, real *, integer *); integer j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); logical upper; @@ -674,13 +674,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPOTF2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -756,7 +756,7 @@ f"> */ *info = j; L40: - return 0; + return; /* End of SPOTF2 */ diff --git a/lapack-netlib/SRC/spotrf.c b/lapack-netlib/SRC/spotrf.c index 6253877e15..3b4933918b 100644 --- a/lapack-netlib/SRC/spotrf.c +++ b/lapack-netlib/SRC/spotrf.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup realPOcomputational */ /* 
===================================================================== */ -/* Subroutine */ int spotrf_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void spotrf_(char *uplo, integer *n, real *a, integer *lda, integer *info) { /* System generated locals */ @@ -632,11 +632,11 @@ f"> */ /* Local variables */ integer j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); logical upper; - extern /* Subroutine */ int strsm_(char *, char *, char *, char *, + extern /* Subroutine */ void strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ), ssyrk_(char *, char *, integer *, integer *, real *, real *, integer *, real *, real *, integer * @@ -645,7 +645,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int spotrf2_(char *, integer *, real *, integer *, + extern /* Subroutine */ void spotrf2_(char *, integer *, real *, integer *, integer *); @@ -678,13 +678,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPOTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine the block size for this environment. 
*/ @@ -784,7 +784,7 @@ f"> */ *info = *info + j - 1; L40: - return 0; + return; /* End of SPOTRF */ diff --git a/lapack-netlib/SRC/spotrf2.c b/lapack-netlib/SRC/spotrf2.c index c32dcd7bee..b81ff99715 100644 --- a/lapack-netlib/SRC/spotrf2.c +++ b/lapack-netlib/SRC/spotrf2.c @@ -617,7 +617,7 @@ static real c_b11 = -1.f; /* > \ingroup realPOcomputational */ /* ===================================================================== */ -/* Subroutine */ int spotrf2_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void spotrf2_(char *uplo, integer *n, real *a, integer *lda, integer *info) { /* System generated locals */ @@ -628,11 +628,12 @@ static real c_b11 = -1.f; integer iinfo; logical upper; integer n1, n2; - extern /* Subroutine */ int strsm_(char *, char *, char *, char *, + extern /* Subroutine */ void strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ), ssyrk_(char *, char *, integer *, integer *, real *, real *, integer *, real *, real *, integer * - ), xerbla_(char *, integer *, ftnlen); + ); + extern int xerbla_(char *, integer *, ftnlen); extern logical sisnan_(real *); @@ -665,13 +666,13 @@ static real c_b11 = -1.f; if (*info != 0) { i__1 = -(*info); xerbla_("SPOTRF2", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* N=1 case */ @@ -682,7 +683,7 @@ static real c_b11 = -1.f; if (a[a_dim1 + 1] <= 0.f || sisnan_(&a[a_dim1 + 1])) { *info = 1; - return 0; + return; } /* Factor */ @@ -700,7 +701,7 @@ static real c_b11 = -1.f; spotrf2_(uplo, &n1, &a[a_dim1 + 1], lda, &iinfo); if (iinfo != 0) { *info = iinfo; - return 0; + return; } /* Compute the Cholesky factorization A = U**T*U */ @@ -719,7 +720,7 @@ static real c_b11 = -1.f; spotrf2_(uplo, &n2, &a[n1 + 1 + (n1 + 1) * a_dim1], lda, &iinfo); if (iinfo != 0) { *info = iinfo + n1; - return 0; + return; } /* Compute the Cholesky factorization A = L*L**T */ @@ -738,11 +739,11 @@ 
static real c_b11 = -1.f; spotrf2_(uplo, &n2, &a[n1 + 1 + (n1 + 1) * a_dim1], lda, &iinfo); if (iinfo != 0) { *info = iinfo + n1; - return 0; + return; } } } - return 0; + return; /* End of SPOTRF2 */ diff --git a/lapack-netlib/SRC/spotri.c b/lapack-netlib/SRC/spotri.c index 7011b0683d..d32d013d32 100644 --- a/lapack-netlib/SRC/spotri.c +++ b/lapack-netlib/SRC/spotri.c @@ -604,7 +604,7 @@ f"> */ /* > \ingroup realPOcomputational */ /* ===================================================================== */ -/* Subroutine */ int spotri_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void spotri_(char *uplo, integer *n, real *a, integer *lda, integer *info) { /* System generated locals */ @@ -612,8 +612,10 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slauum_( - char *, integer *, real *, integer *, integer *), strtri_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int slauum_( + char *, integer *, real *, integer *, integer *); + extern int strtri_( char *, char *, integer *, real *, integer *, integer *); @@ -645,27 +647,27 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPOTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Invert the triangular Cholesky factor U or L. */ strtri_(uplo, "Non-unit", n, &a[a_offset], lda, info); if (*info > 0) { - return 0; + return; } /* Form inv(U) * inv(U)**T or inv(L)**T * inv(L). 
*/ slauum_(uplo, n, &a[a_offset], lda, info); - return 0; + return; /* End of SPOTRI */ diff --git a/lapack-netlib/SRC/spotrs.c b/lapack-netlib/SRC/spotrs.c index 6b5f349b17..796eab2115 100644 --- a/lapack-netlib/SRC/spotrs.c +++ b/lapack-netlib/SRC/spotrs.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup realPOcomputational */ /* ===================================================================== */ -/* Subroutine */ int spotrs_(char *uplo, integer *n, integer *nrhs, real *a, +/* Subroutine */ void spotrs_(char *uplo, integer *n, integer *nrhs, real *a, integer *lda, real *b, integer *ldb, integer *info) { /* System generated locals */ @@ -632,9 +632,10 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int strsm_(char *, char *, char *, char *, + extern /* Subroutine */ void strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * - ), xerbla_(char *, integer *, ftnlen); + ); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -673,13 +674,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPOTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -710,7 +711,7 @@ f"> */ a_offset], lda, &b[b_offset], ldb); } - return 0; + return; /* End of SPOTRS */ diff --git a/lapack-netlib/SRC/sppcon.c b/lapack-netlib/SRC/sppcon.c index c48fc9d4d6..07bd6a83df 100644 --- a/lapack-netlib/SRC/sppcon.c +++ b/lapack-netlib/SRC/sppcon.c @@ -631,7 +631,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sppcon_(char *uplo, integer *n, real *ap, real *anorm, +/* Subroutine */ void sppcon_(char *uplo, integer *n, real *ap, real *anorm, real *rcond, real *work, integer *iwork, integer *info) { /* System generated locals */ @@ -643,9 
+643,9 @@ f"> */ real scale; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int srscl_(integer *, real *, real *, integer *); + extern /* Subroutine */ void srscl_(integer *, real *, real *, integer *); logical upper; - extern /* Subroutine */ int slacn2_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slacn2_(integer *, real *, real *, integer *, real *, integer *, integer *); integer ix; real scalel; @@ -655,7 +655,7 @@ f"> */ extern integer isamax_(integer *, real *, integer *); real ainvnm; char normin[1]; - extern /* Subroutine */ int slatps_(char *, char *, char *, char *, + extern /* Subroutine */ void slatps_(char *, char *, char *, char *, integer *, real *, real *, real *, real *, integer *); real smlnum; @@ -689,7 +689,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPPCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -697,9 +697,9 @@ f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm == 0.f) { - return 0; + return; } smlnum = slamch_("Safe minimum"); @@ -758,7 +758,7 @@ f"> */ } L20: - return 0; + return; /* End of SPPCON */ diff --git a/lapack-netlib/SRC/sppequ.c b/lapack-netlib/SRC/sppequ.c index 0523dadc16..102678c913 100644 --- a/lapack-netlib/SRC/sppequ.c +++ b/lapack-netlib/SRC/sppequ.c @@ -625,7 +625,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sppequ_(char *uplo, integer *n, real *ap, real *s, real * +/* Subroutine */ void sppequ_(char *uplo, integer *n, real *ap, real *s, real * scond, real *amax, integer *info) { /* System generated locals */ @@ -667,7 +667,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPPEQU", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -675,7 +675,7 @@ f"> */ if (*n == 0) { *scond = 1.f; *amax = 0.f; - return 0; + return; } /* Initialize SMIN 
and AMAX. */ @@ -731,7 +731,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { if (s[i__] <= 0.f) { *info = i__; - return 0; + return; } /* L30: */ } @@ -750,7 +750,7 @@ f"> */ *scond = sqrt(smin) / sqrt(*amax); } - return 0; + return; /* End of SPPEQU */ diff --git a/lapack-netlib/SRC/spprfs.c b/lapack-netlib/SRC/spprfs.c index 39f408fb4c..962506818c 100644 --- a/lapack-netlib/SRC/spprfs.c +++ b/lapack-netlib/SRC/spprfs.c @@ -685,7 +685,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int spprfs_(char *uplo, integer *n, integer *nrhs, real *ap, +/* Subroutine */ void spprfs_(char *uplo, integer *n, integer *nrhs, real *ap, real *afp, real *b, integer *ldb, real *x, integer *ldx, real *ferr, real *berr, real *work, integer *iwork, integer *info) { @@ -701,7 +701,7 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3], count; logical upper; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *), sspmv_(char *, integer *, real *, real *, real *, integer *, real *, real *, integer *), slacn2_(integer *, @@ -713,7 +713,7 @@ f"> */ real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real lstres; - extern /* Subroutine */ int spptrs_(char *, integer *, integer *, real *, + extern /* Subroutine */ void spptrs_(char *, integer *, integer *, real *, real *, integer *, integer *); real eps; @@ -760,7 +760,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPPRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -772,7 +772,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -973,7 +973,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of SPPRFS */ diff 
--git a/lapack-netlib/SRC/sppsv.c b/lapack-netlib/SRC/sppsv.c index 5ab64f20c9..724f8ed877 100644 --- a/lapack-netlib/SRC/sppsv.c +++ b/lapack-netlib/SRC/sppsv.c @@ -653,7 +653,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sppsv_(char *uplo, integer *n, integer *nrhs, real *ap, +/* Subroutine */ void sppsv_(char *uplo, integer *n, integer *nrhs, real *ap, real *b, integer *ldb, integer *info) { /* System generated locals */ @@ -661,7 +661,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), spptrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void spptrf_( char *, integer *, real *, integer *), spptrs_(char *, integer *, integer *, real *, real *, integer *, integer *); @@ -697,7 +698,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("SPPSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the Cholesky factorization A = U**T*U or A = L*L**T. 
*/ @@ -710,7 +711,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ spptrs_(uplo, n, nrhs, &ap[1], &b[b_offset], ldb, info); } - return 0; + return; /* End of SPPSV */ diff --git a/lapack-netlib/SRC/sppsvx.c b/lapack-netlib/SRC/sppsvx.c index 2aacbd3f96..6a6eb27a39 100644 --- a/lapack-netlib/SRC/sppsvx.c +++ b/lapack-netlib/SRC/sppsvx.c @@ -823,7 +823,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sppsvx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void sppsvx_(char *fact, char *uplo, integer *n, integer * nrhs, real *ap, real *afp, char *equed, real *s, real *b, integer * ldb, real *x, integer *ldx, real *rcond, real *ferr, real *berr, real *work, integer *iwork, integer *info) @@ -838,22 +838,22 @@ f"> */ extern logical lsame_(char *, char *); real scond, anorm; logical equil, rcequ; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); extern real slamch_(char *); logical nofact; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; integer infequ; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); extern real slansp_(char *, char *, integer *, real *, real *); - extern /* Subroutine */ int sppcon_(char *, integer *, real *, real *, + extern /* Subroutine */ void sppcon_(char *, integer *, real *, real *, real *, real *, integer *, integer *), slaqsp_(char *, integer *, real *, real *, real *, real *, char *) ; real smlnum; - extern /* Subroutine */ int sppequ_(char *, integer *, real *, real *, + extern /* Subroutine */ void sppequ_(char *, integer *, real *, real *, real *, real *, integer *), spprfs_(char *, integer *, integer *, real *, real *, real *, 
integer *, real *, integer *, real *, real *, real *, integer *, integer *), spptrf_( @@ -946,7 +946,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPPSVX", &i__1, (ftnlen)6); - return 0; + return; } if (equil) { @@ -989,7 +989,7 @@ f"> */ if (*info > 0) { *rcond = 0.f; - return 0; + return; } } @@ -1038,7 +1038,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of SPPSVX */ diff --git a/lapack-netlib/SRC/spptrf.c b/lapack-netlib/SRC/spptrf.c index 9f5013ec06..a135b56338 100644 --- a/lapack-netlib/SRC/spptrf.c +++ b/lapack-netlib/SRC/spptrf.c @@ -633,7 +633,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int spptrf_(char *uplo, integer *n, real *ap, integer *info) +/* Subroutine */ void spptrf_(char *uplo, integer *n, real *ap, integer *info) { /* System generated locals */ integer i__1, i__2; @@ -641,13 +641,13 @@ f"> */ /* Local variables */ extern real sdot_(integer *, real *, integer *, real *, integer *); - extern /* Subroutine */ int sspr_(char *, integer *, real *, real *, + extern /* Subroutine */ void sspr_(char *, integer *, real *, real *, integer *, real *); integer j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); logical upper; - extern /* Subroutine */ int stpsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void stpsv_(char *, char *, char *, integer *, real *, real *, integer *); integer jc, jj; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -679,13 +679,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPPTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -756,7 +756,7 @@ f"> */ *info = j; L40: - return 0; + return; /* End of SPPTRF */ diff --git a/lapack-netlib/SRC/spptri.c 
b/lapack-netlib/SRC/spptri.c index 36e80f105d..28e57e7eb5 100644 --- a/lapack-netlib/SRC/spptri.c +++ b/lapack-netlib/SRC/spptri.c @@ -607,23 +607,24 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int spptri_(char *uplo, integer *n, real *ap, integer *info) +/* Subroutine */ void spptri_(char *uplo, integer *n, real *ap, integer *info) { /* System generated locals */ integer i__1, i__2; /* Local variables */ extern real sdot_(integer *, real *, integer *, real *, integer *); - extern /* Subroutine */ int sspr_(char *, integer *, real *, real *, + extern /* Subroutine */ void sspr_(char *, integer *, real *, real *, integer *, real *); integer j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); logical upper; - extern /* Subroutine */ int stpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void stpmv_(char *, char *, char *, integer *, real *, real *, integer *); integer jc, jj; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), stptri_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void stptri_( char *, char *, integer *, real *, integer *); real ajj; integer jjn; @@ -654,20 +655,20 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPPTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Invert the triangular Cholesky factor U or L. 
*/ stptri_(uplo, "Non-unit", n, &ap[1], info); if (*info > 0) { - return 0; + return; } if (upper) { @@ -708,7 +709,7 @@ f"> */ } } - return 0; + return; /* End of SPPTRI */ diff --git a/lapack-netlib/SRC/spptrs.c b/lapack-netlib/SRC/spptrs.c index 19acfa4d5a..37ceb99c35 100644 --- a/lapack-netlib/SRC/spptrs.c +++ b/lapack-netlib/SRC/spptrs.c @@ -621,7 +621,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int spptrs_(char *uplo, integer *n, integer *nrhs, real *ap, +/* Subroutine */ void spptrs_(char *uplo, integer *n, integer *nrhs, real *ap, real *b, integer *ldb, integer *info) { /* System generated locals */ @@ -631,8 +631,9 @@ f"> */ integer i__; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int stpsv_(char *, char *, char *, integer *, - real *, real *, integer *), xerbla_(char * , integer *, ftnlen); + extern /* Subroutine */ void stpsv_(char *, char *, char *, integer *, + real *, real *, integer *); + extern int xerbla_(char * , integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -667,13 +668,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPPTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -714,7 +715,7 @@ f"> */ } } - return 0; + return; /* End of SPPTRS */ diff --git a/lapack-netlib/SRC/spstf2.c b/lapack-netlib/SRC/spstf2.c index c6ad41eb62..f408568b9a 100644 --- a/lapack-netlib/SRC/spstf2.c +++ b/lapack-netlib/SRC/spstf2.c @@ -658,7 +658,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int spstf2_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void spstf2_(char *uplo, integer *n, real *a, integer *lda, integer *piv, integer *rank, real *tol, real *work, integer *info) { /* 
System generated locals */ @@ -669,13 +669,13 @@ f"> */ integer i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer itemp; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); real stemp; logical upper; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *); real sstop; extern real slamch_(char *); @@ -716,13 +716,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPSTF2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Initialize PIV */ @@ -945,7 +945,7 @@ f"> */ *info = 1; L170: - return 0; + return; /* End of SPSTF2 */ diff --git a/lapack-netlib/SRC/spstrf.c b/lapack-netlib/SRC/spstrf.c index 13d2c041fa..5d446aa95d 100644 --- a/lapack-netlib/SRC/spstrf.c +++ b/lapack-netlib/SRC/spstrf.c @@ -659,7 +659,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int spstrf_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void spstrf_(char *uplo, integer *n, real *a, integer *lda, integer *piv, integer *rank, real *tol, real *work, integer *info) { /* System generated locals */ @@ -670,19 +670,19 @@ f"> */ integer i__, j, k; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer itemp; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real 
*, real *, integer *, real *, integer *, real *, real *, integer *); real stemp; logical upper; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *); real sstop; - extern /* Subroutine */ int ssyrk_(char *, char *, integer *, integer *, + extern /* Subroutine */ void ssyrk_(char *, char *, integer *, integer *, real *, real *, integer *, real *, real *, integer *); integer jb, nb; - extern /* Subroutine */ int spstf2_(char *, integer *, real *, integer *, + extern /* Subroutine */ void spstf2_(char *, integer *, real *, integer *, integer *, integer *, real *, real *, integer *); extern real slamch_(char *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -724,13 +724,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPSTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get block size */ @@ -1022,7 +1022,7 @@ f"> */ *info = 1; L200: - return 0; + return; /* End of SPSTRF */ diff --git a/lapack-netlib/SRC/sptcon.c b/lapack-netlib/SRC/sptcon.c index 3bc0862961..c8cb73217e 100644 --- a/lapack-netlib/SRC/sptcon.c +++ b/lapack-netlib/SRC/sptcon.c @@ -631,7 +631,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sptcon_(integer *n, real *d__, real *e, real *anorm, +/* Subroutine */ void sptcon_(integer *n, real *d__, real *e, real *anorm, real *rcond, real *work, integer *info) { /* System generated locals */ @@ -671,7 +671,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPTCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -679,9 +679,9 @@ f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm == 0.f) { - return 0; + return; } /* Check that D(1:N) is positive. 
*/ @@ -689,7 +689,7 @@ f"> */ i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { if (d__[i__] <= 0.f) { - return 0; + return; } /* L10: */ } @@ -730,7 +730,7 @@ f"> */ *rcond = 1.f / ainvnm / *anorm; } - return 0; + return; /* End of SPTCON */ diff --git a/lapack-netlib/SRC/spteqr.c b/lapack-netlib/SRC/spteqr.c index 5453f9d684..9c56d6d88e 100644 --- a/lapack-netlib/SRC/spteqr.c +++ b/lapack-netlib/SRC/spteqr.c @@ -661,7 +661,7 @@ f"> */ /* > \ingroup realPTcomputational */ /* ===================================================================== */ -/* Subroutine */ int spteqr_(char *compz, integer *n, real *d__, real *e, +/* Subroutine */ void spteqr_(char *compz, integer *n, real *d__, real *e, real *z__, integer *ldz, real *work, integer *info) { /* System generated locals */ @@ -672,12 +672,13 @@ f"> */ integer i__; extern logical lsame_(char *, char *); real vt[1] /* was [1][1] */; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slaset_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slaset_( char *, integer *, integer *, real *, real *, real *, integer *), sbdsqr_(char *, integer *, integer *, integer *, integer *, real *, real *, real *, integer *, real *, integer *, real *, integer *, real *, integer *); integer icompz; - extern /* Subroutine */ int spttrf_(integer *, real *, real *, integer *); + extern /* Subroutine */ void spttrf_(integer *, real *, real *, integer *); integer nru; @@ -722,20 +723,20 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPTEQR", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { if (icompz > 0) { z__[z_dim1 + 1] = 1.f; } - return 0; + return; } if (icompz == 2) { slaset_("Full", n, n, &c_b7, &c_b8, &z__[z_offset], ldz); @@ -745,7 +746,7 @@ f"> */ spttrf_(n, &d__[1], &e[1], info); if (*info != 0) { - return 0; + return; } i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { @@ -781,7 +782,7 @@ f"> */ *info 
= *n + *info; } - return 0; + return; /* End of SPTEQR */ diff --git a/lapack-netlib/SRC/sptrfs.c b/lapack-netlib/SRC/sptrfs.c index 434c240ce5..46d142c807 100644 --- a/lapack-netlib/SRC/sptrfs.c +++ b/lapack-netlib/SRC/sptrfs.c @@ -676,7 +676,7 @@ f"> */ /* > \ingroup realPTcomputational */ /* ===================================================================== */ -/* Subroutine */ int sptrfs_(integer *n, integer *nrhs, real *d__, real *e, +/* Subroutine */ void sptrfs_(integer *n, integer *nrhs, real *d__, real *e, real *df, real *ef, real *b, integer *ldb, real *x, integer *ldx, real *ferr, real *berr, real *work, integer *info) { @@ -689,7 +689,7 @@ f"> */ integer i__, j; real s; integer count; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + extern /* Subroutine */ void saxpy_(integer *, real *, real *, integer *, real *, integer *); real bi, cx, dx, ex; integer ix; @@ -699,7 +699,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer isamax_(integer *, real *, integer *); real lstres; - extern /* Subroutine */ int spttrs_(integer *, integer *, real *, real *, + extern /* Subroutine */ void spttrs_(integer *, integer *, real *, real *, real *, integer *, integer *); real eps; @@ -744,7 +744,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPTRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -756,7 +756,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -935,7 +935,7 @@ f"> */ /* L90: */ } - return 0; + return; /* End of SPTRFS */ diff --git a/lapack-netlib/SRC/sptsv.c b/lapack-netlib/SRC/sptsv.c index 8a5b346a57..36c454a938 100644 --- a/lapack-netlib/SRC/sptsv.c +++ b/lapack-netlib/SRC/sptsv.c @@ -623,14 +623,15 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup realPTsolve */ /* 
===================================================================== */ -/* Subroutine */ int sptsv_(integer *n, integer *nrhs, real *d__, real *e, +/* Subroutine */ void sptsv_(integer *n, integer *nrhs, real *d__, real *e, real *b, integer *ldb, integer *info) { /* System generated locals */ integer b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), spttrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void spttrf_( integer *, real *, real *, integer *), spttrs_(integer *, integer *, real *, real *, real *, integer *, integer *); @@ -665,7 +666,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("SPTSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the L*D*L**T (or U**T*D*U) factorization of A. */ @@ -677,7 +678,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ spttrs_(n, nrhs, &d__[1], &e[1], &b[b_offset], ldb, info); } - return 0; + return; /* End of SPTSV */ diff --git a/lapack-netlib/SRC/sptsvx.c b/lapack-netlib/SRC/sptsvx.c index 5e32eb0adc..d2ab81b5ce 100644 --- a/lapack-netlib/SRC/sptsvx.c +++ b/lapack-netlib/SRC/sptsvx.c @@ -740,7 +740,7 @@ f"> */ /* > \ingroup realPTsolve */ /* ===================================================================== */ -/* Subroutine */ int sptsvx_(char *fact, integer *n, integer *nrhs, real *d__, +/* Subroutine */ void sptsvx_(char *fact, integer *n, integer *nrhs, real *d__, real *e, real *df, real *ef, real *b, integer *ldb, real *x, integer *ldx, real *rcond, real *ferr, real *berr, real *work, integer *info) { @@ -750,15 +750,16 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); real anorm; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); extern real slamch_(char *); logical 
nofact; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slacpy_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slacpy_( char *, integer *, integer *, real *, integer *, real *, integer * ); extern real slanst_(char *, integer *, real *, real *); - extern /* Subroutine */ int sptcon_(integer *, real *, real *, real *, + extern /* Subroutine */ void sptcon_(integer *, real *, real *, real *, real *, real *, integer *), sptrfs_(integer *, integer *, real *, real *, real *, real *, real *, integer *, real *, integer *, real *, real *, real *, integer *), spttrf_(integer *, real *, @@ -809,7 +810,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPTSVX", &i__1, (ftnlen)6); - return 0; + return; } if (nofact) { @@ -827,7 +828,7 @@ f"> */ if (*info > 0) { *rcond = 0.f; - return 0; + return; } } @@ -856,7 +857,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of SPTSVX */ diff --git a/lapack-netlib/SRC/spttrf.c b/lapack-netlib/SRC/spttrf.c index b657aafd62..916eb113c9 100644 --- a/lapack-netlib/SRC/spttrf.c +++ b/lapack-netlib/SRC/spttrf.c @@ -600,7 +600,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int spttrf_(integer *n, real *d__, real *e, integer *info) +/* Subroutine */ void spttrf_(integer *n, real *d__, real *e, integer *info) { /* System generated locals */ integer i__1; @@ -632,13 +632,13 @@ f"> */ *info = -1; i__1 = -(*info); xerbla_("SPTTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Compute the L*D*L**T (or U**T*D*U) factorization of A. 
*/ @@ -715,7 +715,7 @@ f"> */ } L30: - return 0; + return; /* End of SPTTRF */ diff --git a/lapack-netlib/SRC/spttrs.c b/lapack-netlib/SRC/spttrs.c index 9c703e1ab9..cbd6424462 100644 --- a/lapack-netlib/SRC/spttrs.c +++ b/lapack-netlib/SRC/spttrs.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup realPTcomputational */ /* ===================================================================== */ -/* Subroutine */ int spttrs_(integer *n, integer *nrhs, real *d__, real *e, +/* Subroutine */ void spttrs_(integer *n, integer *nrhs, real *d__, real *e, real *b, integer *ldb, integer *info) { /* System generated locals */ @@ -631,8 +631,9 @@ f"> */ /* Local variables */ integer j, jb, nb; - extern /* Subroutine */ int sptts2_(integer *, integer *, real *, real *, - real *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void sptts2_(integer *, integer *, real *, real *, + real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); @@ -667,13 +668,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SPTTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* Determine the number of right-hand sides to solve at a time. 
*/ @@ -701,7 +702,7 @@ f"> */ } } - return 0; + return; /* End of SPTTRS */ diff --git a/lapack-netlib/SRC/sptts2.c b/lapack-netlib/SRC/sptts2.c index 38e2dde20e..be6b38ebed 100644 --- a/lapack-netlib/SRC/sptts2.c +++ b/lapack-netlib/SRC/sptts2.c @@ -612,7 +612,7 @@ f"> */ /* > \ingroup realPTcomputational */ /* ===================================================================== */ -/* Subroutine */ int sptts2_(integer *n, integer *nrhs, real *d__, real *e, +/* Subroutine */ void sptts2_(integer *n, integer *nrhs, real *d__, real *e, real *b, integer *ldb) { /* System generated locals */ @@ -621,7 +621,7 @@ f"> */ /* Local variables */ integer i__, j; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -648,7 +648,7 @@ f"> */ r__1 = 1.f / d__[1]; sscal_(nrhs, &r__1, &b[b_offset], ldb); } - return 0; + return; } /* Solve A * X = B using the factorization A = L*D*L**T, */ @@ -676,7 +676,7 @@ f"> */ /* L30: */ } - return 0; + return; /* End of SPTTS2 */ diff --git a/lapack-netlib/SRC/srscl.c b/lapack-netlib/SRC/srscl.c index e91a646f7f..0ac2e4c367 100644 --- a/lapack-netlib/SRC/srscl.c +++ b/lapack-netlib/SRC/srscl.c @@ -593,12 +593,12 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup realOTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int srscl_(integer *n, real *sa, real *sx, integer *incx) +/* Subroutine */ void srscl_(integer *n, real *sa, real *sx, integer *incx) { real cden; logical done; real cnum, cden1, cnum1; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), slabad_(real *, real *); extern real slamch_(char *); real bignum, smlnum, mul; @@ -620,7 +620,7 @@ static inline void 
zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Function Body */ if (*n <= 0) { - return 0; + return; } /* Get machine parameters */ @@ -667,7 +667,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ goto L10; } - return 0; + return; /* End of SRSCL */ diff --git a/lapack-netlib/SRC/ssb2st_kernels.c b/lapack-netlib/SRC/ssb2st_kernels.c index 96333110f5..b043b3cf66 100644 --- a/lapack-netlib/SRC/ssb2st_kernels.c +++ b/lapack-netlib/SRC/ssb2st_kernels.c @@ -683,7 +683,7 @@ kernels.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ssb2st_kernels_(char *uplo, logical *wantz, integer * +/* Subroutine */ void ssb2st_kernels_(char *uplo, logical *wantz, integer * ttype, integer *st, integer *ed, integer *sweep, integer *n, integer * nb, integer *ib, real *a, integer *lda, real *v, real *tau, integer * ldvt, real *work) @@ -698,10 +698,10 @@ kernels.f"> */ extern logical lsame_(char *, char *); logical upper; integer j1, j2, lm, ln, ajeter; - extern /* Subroutine */ int slarfg_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slarfg_(integer *, real *, real *, integer *, real *); integer ofdpos; - extern /* Subroutine */ int slarfx_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slarfx_(char *, integer *, integer *, real *, real *, real *, integer *, real *), slarfy_(char *, integer *, real *, integer *, real *, real *, integer *, real *); integer taupos; @@ -899,7 +899,7 @@ kernels.f"> */ } } - return 0; + return; /* END OF SSB2ST_KERNELS */ diff --git a/lapack-netlib/SRC/ssbev.c b/lapack-netlib/SRC/ssbev.c index dd3b39f5e2..f5f9dda6be 100644 --- a/lapack-netlib/SRC/ssbev.c +++ b/lapack-netlib/SRC/ssbev.c @@ -660,7 +660,7 @@ atrices */ /* > \ingroup realOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int ssbev_(char *jobz, char *uplo, 
integer *n, integer *kd, +/* Subroutine */ void ssbev_(char *jobz, char *uplo, integer *n, integer *kd, real *ab, integer *ldab, real *w, real *z__, integer *ldz, real *work, integer *info) { @@ -675,7 +675,7 @@ atrices */ real rmin, rmax, sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); logical lower, wantz; integer iscale; extern real slamch_(char *); @@ -684,15 +684,15 @@ atrices */ real bignum; extern real slansb_(char *, char *, integer *, integer *, real *, integer *, real *); - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); integer indwrk; - extern /* Subroutine */ int ssbtrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void ssbtrd_(char *, char *, integer *, integer *, real *, integer *, real *, real *, real *, integer *, real *, integer *), ssterf_(integer *, real *, real *, integer *); real smlnum; - extern /* Subroutine */ int ssteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssteqr_(char *, integer *, real *, real *, real *, integer *, real *, integer *); real eps; @@ -740,13 +740,13 @@ atrices */ if (*info != 0) { i__1 = -(*info); xerbla_("SSBEV ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -758,7 +758,7 @@ atrices */ if (wantz) { z__[z_dim1 + 1] = 1.f; } - return 0; + return; } /* Get machine constants. 
*/ @@ -819,7 +819,7 @@ atrices */ sscal_(&imax, &r__1, &w[1], &c__1); } - return 0; + return; /* End of SSBEV */ diff --git a/lapack-netlib/SRC/ssbev_2stage.c b/lapack-netlib/SRC/ssbev_2stage.c index c8cf4a0e27..4d9f67a778 100644 --- a/lapack-netlib/SRC/ssbev_2stage.c +++ b/lapack-netlib/SRC/ssbev_2stage.c @@ -722,7 +722,7 @@ stage.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssbev_2stage_(char *jobz, char *uplo, integer *n, +/* Subroutine */ void ssbev_2stage_(char *jobz, char *uplo, integer *n, integer *kd, real *ab, integer *ldab, real *w, real *z__, integer * ldz, real *work, integer *lwork, integer *info) { @@ -737,13 +737,13 @@ stage.f"> */ real anrm; integer imax; real rmin, rmax; - extern /* Subroutine */ int ssytrd_sb2st_(char *, char *, char *, + extern /* Subroutine */ void ssytrd_sb2st_(char *, char *, char *, integer *, integer *, real *, integer *, real *, real *, real *, integer *, real *, integer *, integer *); real sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer lhtrd, lwmin; logical lower; integer lwtrd; @@ -755,14 +755,14 @@ stage.f"> */ real bignum; extern real slansb_(char *, char *, integer *, integer *, real *, integer *, real *); - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); integer indwrk; - extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *); + extern /* Subroutine */ void ssterf_(integer *, real *, real *, integer *); integer llwork; real smlnum; logical lquery; - extern /* Subroutine */ int ssteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssteqr_(char *, integer *, real *, 
real *, real *, integer *, real *, integer *); real eps; integer indhous; @@ -833,15 +833,15 @@ stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSBEV_2STAGE ", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -853,7 +853,7 @@ stage.f"> */ if (wantz) { z__[z_dim1 + 1] = 1.f; } - return 0; + return; } /* Get machine constants. */ @@ -921,7 +921,7 @@ stage.f"> */ work[1] = (real) lwmin; - return 0; + return; /* End of SSBEV_2STAGE */ diff --git a/lapack-netlib/SRC/ssbevd.c b/lapack-netlib/SRC/ssbevd.c index 2ef771e343..7f4bff87b3 100644 --- a/lapack-netlib/SRC/ssbevd.c +++ b/lapack-netlib/SRC/ssbevd.c @@ -708,7 +708,7 @@ f"> */ /* > \ingroup realOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int ssbevd_(char *jobz, char *uplo, integer *n, integer *kd, +/* Subroutine */ void ssbevd_(char *jobz, char *uplo, integer *n, integer *kd, real *ab, integer *ldab, real *w, real *z__, integer *ldz, real *work, integer *lwork, integer *iwork, integer *liwork, integer *info) { @@ -721,7 +721,7 @@ f"> */ real anrm, rmin, rmax, sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer lwmin; @@ -733,12 +733,12 @@ f"> */ real bignum; extern real slansb_(char *, char *, integer *, integer *, real *, integer *, real *); - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *), sstedc_(char *, integer *, real *, real *, real *, integer *, real *, 
integer *, integer *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); integer indwrk, liwmin; - extern /* Subroutine */ int ssbtrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void ssbtrd_(char *, char *, integer *, integer *, real *, integer *, real *, real *, real *, integer *, real *, integer *), ssterf_(integer *, real *, real *, integer *); @@ -817,15 +817,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSBEVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -833,7 +833,7 @@ f"> */ if (wantz) { z__[z_dim1 + 1] = 1.f; } - return 0; + return; } /* Get machine constants. */ @@ -896,7 +896,7 @@ f"> */ work[1] = (real) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of SSBEVD */ diff --git a/lapack-netlib/SRC/ssbevd_2stage.c b/lapack-netlib/SRC/ssbevd_2stage.c index 2dc23d188f..af09f95d26 100644 --- a/lapack-netlib/SRC/ssbevd_2stage.c +++ b/lapack-netlib/SRC/ssbevd_2stage.c @@ -753,7 +753,7 @@ static integer c__1 = 1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssbevd_2stage_(char *jobz, char *uplo, integer *n, +/* Subroutine */ void ssbevd_2stage_(char *jobz, char *uplo, integer *n, integer *kd, real *ab, integer *ldab, real *w, real *z__, integer * ldz, real *work, integer *lwork, integer *iwork, integer *liwork, integer *info) @@ -767,13 +767,13 @@ static integer c__1 = 1; extern integer ilaenv2stage_(integer *, char *, char *, integer *, integer *, integer *, integer *); real anrm, rmin, rmax; - extern /* Subroutine */ int ssytrd_sb2st_(char *, char *, char *, + extern /* Subroutine */ void ssytrd_sb2st_(char *, char *, char *, integer *, integer *, real *, integer *, real *, real *, real *, integer *, real *, integer *, integer *); real sigma; extern logical lsame_(char *, 
char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer lhtrd, lwmin; @@ -787,12 +787,12 @@ static integer c__1 = 1; real bignum; extern real slansb_(char *, char *, integer *, integer *, real *, integer *, real *); - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *), sstedc_(char *, integer *, real *, real *, real *, integer *, real *, integer *, integer *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); integer indwrk, liwmin; - extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *); + extern /* Subroutine */ void ssterf_(integer *, real *, real *, integer *); integer llwork; real smlnum; logical lquery; @@ -876,15 +876,15 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("SSBEVD_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -892,7 +892,7 @@ static integer c__1 = 1; if (wantz) { z__[z_dim1 + 1] = 1.f; } - return 0; + return; } /* Get machine constants. 
*/ @@ -958,7 +958,7 @@ static integer c__1 = 1; work[1] = (real) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of SSBEVD_2STAGE */ diff --git a/lapack-netlib/SRC/ssbevx.c b/lapack-netlib/SRC/ssbevx.c index 057da2e2c4..d16816e56d 100644 --- a/lapack-netlib/SRC/ssbevx.c +++ b/lapack-netlib/SRC/ssbevx.c @@ -779,7 +779,7 @@ f"> */ /* > \ingroup realOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int ssbevx_(char *jobz, char *range, char *uplo, integer *n, +/* Subroutine */ void ssbevx_(char *jobz, char *range, char *uplo, integer *n, integer *kd, real *ab, integer *ldab, real *q, integer *ldq, real *vl, real *vu, integer *il, integer *iu, real *abstol, integer *m, real * w, real *z__, integer *ldz, real *work, integer *iwork, integer * @@ -800,12 +800,12 @@ f"> */ real sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); char order[1]; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); logical lower; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ); logical wantz; @@ -819,13 +819,13 @@ f"> */ real abstll, bignum; extern real slansb_(char *, char *, integer *, integer *, real *, integer *, real *); - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); integer indisp, indiwo; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + 
extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); integer indwrk; - extern /* Subroutine */ int ssbtrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void ssbtrd_(char *, char *, integer *, integer *, real *, integer *, real *, real *, real *, integer *, real *, integer *), sstein_(integer *, real *, real *, integer *, real *, integer *, integer *, real *, integer *, real * @@ -833,7 +833,7 @@ f"> */ real *, integer *); integer nsplit; real smlnum; - extern /* Subroutine */ int sstebz_(char *, char *, integer *, real *, + extern /* Subroutine */ void sstebz_(char *, char *, integer *, real *, real *, integer *, integer *, real *, real *, real *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *), ssteqr_(char *, integer *, real *, @@ -911,14 +911,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSBEVX", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -939,7 +939,7 @@ f"> */ z__[z_dim1 + 1] = 1.f; } } - return 0; + return; } /* Get machine constants. 
*/ @@ -1114,7 +1114,7 @@ f"> */ } } - return 0; + return; /* End of SSBEVX */ diff --git a/lapack-netlib/SRC/ssbevx_2stage.c b/lapack-netlib/SRC/ssbevx_2stage.c index 0fafba3a73..e93f636502 100644 --- a/lapack-netlib/SRC/ssbevx_2stage.c +++ b/lapack-netlib/SRC/ssbevx_2stage.c @@ -840,7 +840,7 @@ static real c_b45 = 0.f; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssbevx_2stage_(char *jobz, char *range, char *uplo, +/* Subroutine */ void ssbevx_2stage_(char *jobz, char *range, char *uplo, integer *n, integer *kd, real *ab, integer *ldab, real *q, integer * ldq, real *vl, real *vu, integer *il, integer *iu, real *abstol, integer *m, real *w, real *z__, integer *ldz, real *work, integer * @@ -859,22 +859,22 @@ static real c_b45 = 0.f; integer imax; real rmin, rmax; logical test; - extern /* Subroutine */ int ssytrd_sb2st_(char *, char *, char *, + extern /* Subroutine */ void ssytrd_sb2st_(char *, char *, char *, integer *, integer *, real *, integer *, real *, real *, real *, integer *, real *, integer *, integer *); integer itmp1, i__, j, indee; real sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); char order[1]; integer lhtrd; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer lwmin; logical lower; integer lwtrd; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ); logical wantz; @@ -888,24 +888,24 @@ static real c_b45 = 0.f; real abstll, bignum; extern real slansb_(char *, char *, integer *, 
integer *, real *, integer *, real *); - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); integer indisp, indiwo; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); integer indwrk; - extern /* Subroutine */ int sstein_(integer *, real *, real *, integer *, + extern /* Subroutine */ void sstein_(integer *, real *, real *, integer *, real *, integer *, integer *, real *, integer *, real *, integer * , integer *, integer *), ssterf_(integer *, real *, real *, integer *); integer nsplit, llwork; real smlnum; - extern /* Subroutine */ int sstebz_(char *, char *, integer *, real *, + extern /* Subroutine */ void sstebz_(char *, char *, integer *, real *, real *, integer *, integer *, real *, real *, real *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); logical lquery; - extern /* Subroutine */ int ssteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssteqr_(char *, integer *, real *, real *, real *, integer *, real *, integer *); real eps, vll, vuu; integer indhous; @@ -1004,16 +1004,16 @@ static real c_b45 = 0.f; if (*info != 0) { i__1 = -(*info); xerbla_("SSBEVX_2STAGE ", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -1034,7 +1034,7 @@ static real c_b45 = 0.f; z__[z_dim1 + 1] = 1.f; } } - return 0; + return; } /* Get machine constants. 
*/ @@ -1217,7 +1217,7 @@ static real c_b45 = 0.f; work[1] = (real) lwmin; - return 0; + return; /* End of SSBEVX_2STAGE */ diff --git a/lapack-netlib/SRC/ssbgst.c b/lapack-netlib/SRC/ssbgst.c index da4529a116..c1dea63dbf 100644 --- a/lapack-netlib/SRC/ssbgst.c +++ b/lapack-netlib/SRC/ssbgst.c @@ -674,7 +674,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ssbgst_(char *vect, char *uplo, integer *n, integer *ka, +/* Subroutine */ void ssbgst_(char *vect, char *uplo, integer *n, integer *ka, integer *kb, real *ab, integer *ldab, real *bb, integer *ldbb, real * x, integer *ldx, real *work, integer *info) { @@ -685,31 +685,31 @@ f"> */ /* Local variables */ integer inca; - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *), srot_(integer *, real *, integer *, real *, integer *, real *, real *); integer i__, j, k, l, m; real t; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer i0, i1; logical upper; integer i2, j1, j2; logical wantx; - extern /* Subroutine */ int slar2v_(integer *, real *, real *, real *, + extern /* Subroutine */ void slar2v_(integer *, real *, real *, real *, integer *, real *, real *, integer *); real ra; integer nr, nx; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical update; - extern /* Subroutine */ int slaset_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slaset_(char *, integer *, integer *, real *, real *, real *, integer *), slartg_(real *, real *, real * , real *, real *); integer ka1, kb1; - extern /* Subroutine */ int slargv_(integer *, real *, integer *, real *, + extern /* Subroutine */ void 
slargv_(integer *, real *, integer *, real *, integer *, real *, integer *); real ra1; - extern /* Subroutine */ int slartv_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slartv_(integer *, real *, integer *, real *, integer *, real *, real *, integer *); integer j1t, j2t; real bii; @@ -765,13 +765,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSBGST", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } inca = *ldab * ka1; @@ -1609,14 +1609,14 @@ f"> */ --i__; i0 = m + 1; if (*ka == 0) { - return 0; + return; } goto L490; } } else { i__ -= *ka; if (i__ < 2) { - return 0; + return; } } diff --git a/lapack-netlib/SRC/ssbgv.c b/lapack-netlib/SRC/ssbgv.c index eb9c965f91..d8d3b2b351 100644 --- a/lapack-netlib/SRC/ssbgv.c +++ b/lapack-netlib/SRC/ssbgv.c @@ -685,7 +685,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup realOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int ssbgv_(char *jobz, char *uplo, integer *n, integer *ka, +/* Subroutine */ void ssbgv_(char *jobz, char *uplo, integer *n, integer *ka, integer *kb, real *ab, integer *ldab, real *bb, integer *ldbb, real * w, real *z__, integer *ldz, real *work, integer *info) { @@ -700,7 +700,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ logical upper, wantz; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer indwrk; - extern /* Subroutine */ int spbstf_(char *, integer *, integer *, real *, + extern /* Subroutine */ void spbstf_(char *, integer *, integer *, real *, integer *, integer *), ssbtrd_(char *, char *, integer *, integer *, real *, integer *, real *, real *, real *, integer *, real *, integer *), ssbgst_(char *, char *, @@ -759,13 +759,13 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = 
-(*info); xerbla_("SSBGV ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a split Cholesky factorization of B. */ @@ -773,7 +773,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ spbstf_(uplo, n, kb, &bb[bb_offset], ldbb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem. */ @@ -802,7 +802,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ ssteqr_(jobz, n, &w[1], &work[inde], &z__[z_offset], ldz, &work[ indwrk], info); } - return 0; + return; /* End of SSBGV */ diff --git a/lapack-netlib/SRC/ssbgvd.c b/lapack-netlib/SRC/ssbgvd.c index 0c3ea7b459..cd73f479b5 100644 --- a/lapack-netlib/SRC/ssbgvd.c +++ b/lapack-netlib/SRC/ssbgvd.c @@ -740,7 +740,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int ssbgvd_(char *jobz, char *uplo, integer *n, integer *ka, +/* Subroutine */ void ssbgvd_(char *jobz, char *uplo, integer *n, integer *ka, integer *kb, real *ab, integer *ldab, real *bb, integer *ldbb, real * w, real *z__, integer *ldz, real *work, integer *lwork, integer * iwork, integer *liwork, integer *info) @@ -753,18 +753,19 @@ f"> */ char vect[1]; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer lwmin; logical upper, wantz; integer indwk2, llwrk2; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), sstedc_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void sstedc_( char *, integer *, real *, real *, real *, integer *, real *, integer *, integer 
*, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); integer indwrk, liwmin; - extern /* Subroutine */ int spbstf_(char *, integer *, integer *, real *, + extern /* Subroutine */ void spbstf_(char *, integer *, integer *, real *, integer *, integer *), ssbtrd_(char *, char *, integer *, integer *, real *, integer *, real *, real *, real *, integer *, real *, integer *), ssbgst_(char *, char *, @@ -850,15 +851,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSBGVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a split Cholesky factorization of B. */ @@ -866,7 +867,7 @@ f"> */ spbstf_(uplo, n, kb, &bb[bb_offset], ldbb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem. */ @@ -903,7 +904,7 @@ f"> */ work[1] = (real) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of SSBGVD */ diff --git a/lapack-netlib/SRC/ssbgvx.c b/lapack-netlib/SRC/ssbgvx.c index 6d33191944..b30fe2b826 100644 --- a/lapack-netlib/SRC/ssbgvx.c +++ b/lapack-netlib/SRC/ssbgvx.c @@ -807,7 +807,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. 
of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int ssbgvx_(char *jobz, char *range, char *uplo, integer *n, +/* Subroutine */ void ssbgvx_(char *jobz, char *range, char *uplo, integer *n, integer *ka, integer *kb, real *ab, integer *ldab, real *bb, integer * ldbb, real *q, integer *ldq, real *vl, real *vu, integer *il, integer *iu, real *abstol, integer *m, real *w, real *z__, integer *ldz, real @@ -825,10 +825,10 @@ f"> */ extern logical lsame_(char *, char *); integer iinfo; char order[1]; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); logical upper; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ); logical wantz; @@ -838,10 +838,10 @@ f"> */ logical valeig; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer indisp, indiwo; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); integer indwrk; - extern /* Subroutine */ int spbstf_(char *, integer *, integer *, real *, + extern /* Subroutine */ void spbstf_(char *, integer *, integer *, real *, integer *, integer *), ssbtrd_(char *, char *, integer *, integer *, real *, integer *, real *, real *, real *, integer *, real *, integer *), ssbgst_(char *, char *, @@ -851,7 +851,7 @@ f"> */ integer *, real *, integer *, real *, integer *, integer *, integer *), ssterf_(integer *, real *, real *, integer *); integer nsplit; - extern /* Subroutine */ int sstebz_(char *, char *, integer *, real *, + extern /* Subroutine */ void sstebz_(char *, char *, integer *, real *, real 
*, integer *, integer *, real *, real *, real *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *), ssteqr_(char *, integer *, real *, @@ -936,14 +936,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSBGVX", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } /* Form a split Cholesky factorization of B. */ @@ -951,7 +951,7 @@ f"> */ spbstf_(uplo, n, kb, &bb[bb_offset], ldbb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem. */ @@ -1077,7 +1077,7 @@ f"> */ } } - return 0; + return; /* End of SSBGVX */ diff --git a/lapack-netlib/SRC/ssbgvx.f b/lapack-netlib/SRC/ssbgvx.f index 64a67534e1..271f359641 100644 --- a/lapack-netlib/SRC/ssbgvx.f +++ b/lapack-netlib/SRC/ssbgvx.f @@ -317,7 +317,7 @@ SUBROUTINE SSBGVX( JOBZ, RANGE, UPLO, N, KA, KB, AB, LDAB, BB, * .. Local Scalars .. LOGICAL ALLEIG, INDEIG, TEST, UPPER, VALEIG, WANTZ CHARACTER ORDER, VECT - INTEGER I, IINFO, INDD, INDE, INDEE, INDIBL, INDISP, + INTEGER I, IINFO, INDD, INDE, INDEE, INDISP, $ INDIWO, INDWRK, ITMP1, J, JJ, NSPLIT REAL TMP1 * .. 
@@ -457,17 +457,16 @@ SUBROUTINE SSBGVX( JOBZ, RANGE, UPLO, N, KA, KB, AB, LDAB, BB, ELSE ORDER = 'E' END IF - INDIBL = 1 - INDISP = INDIBL + N + INDISP = 1 + N INDIWO = INDISP + N CALL SSTEBZ( RANGE, ORDER, N, VL, VU, IL, IU, ABSTOL, $ WORK( INDD ), WORK( INDE ), M, NSPLIT, W, - $ IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWRK ), + $ IWORK( 1 ), IWORK( INDISP ), WORK( INDWRK ), $ IWORK( INDIWO ), INFO ) * IF( WANTZ ) THEN CALL SSTEIN( N, WORK( INDD ), WORK( INDE ), M, W, - $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ, + $ IWORK( 1 ), IWORK( INDISP ), Z, LDZ, $ WORK( INDWRK ), IWORK( INDIWO ), IFAIL, INFO ) * * Apply transformation matrix used in reduction to tridiagonal @@ -497,11 +496,11 @@ SUBROUTINE SSBGVX( JOBZ, RANGE, UPLO, N, KA, KB, AB, LDAB, BB, 40 CONTINUE * IF( I.NE.0 ) THEN - ITMP1 = IWORK( INDIBL+I-1 ) + ITMP1 = IWORK( 1 + I-1 ) W( I ) = W( J ) - IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 ) + IWORK( 1 + I-1 ) = IWORK( 1 + J-1 ) W( J ) = TMP1 - IWORK( INDIBL+J-1 ) = ITMP1 + IWORK( 1 + J-1 ) = ITMP1 CALL SSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) IF( INFO.NE.0 ) THEN ITMP1 = IFAIL( I ) diff --git a/lapack-netlib/SRC/ssbtrd.c b/lapack-netlib/SRC/ssbtrd.c index 5a9c788fcb..2664bf8b80 100644 --- a/lapack-netlib/SRC/ssbtrd.c +++ b/lapack-netlib/SRC/ssbtrd.c @@ -677,7 +677,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ssbtrd_(char *vect, char *uplo, integer *n, integer *kd, +/* Subroutine */ void ssbtrd_(char *vect, char *uplo, integer *n, integer *kd, real *ab, integer *ldab, real *d__, real *e, real *q, integer *ldq, real *work, integer *info) { @@ -688,21 +688,22 @@ f"> */ /* Local variables */ integer inca, jend, lend, jinc, incx, last; real temp; - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); integer j1end, j1inc, i__, j, k, l, iqend; extern 
logical lsame_(char *, char *); logical initq, wantq, upper; integer i2, j1, j2; - extern /* Subroutine */ int slar2v_(integer *, real *, real *, real *, + extern /* Subroutine */ void slar2v_(integer *, real *, real *, real *, integer *, real *, real *, integer *); integer nq, nr, iqaend; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slaset_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slaset_( char *, integer *, integer *, real *, real *, real *, integer *), slartg_(real *, real *, real *, real *, real *), slargv_( integer *, real *, integer *, real *, integer *, real *, integer * ); integer kd1; - extern /* Subroutine */ int slartv_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slartv_(integer *, real *, integer *, real *, integer *, real *, real *, integer *); integer ibl, iqb, kdn, jin, nrt, kdm1; @@ -755,13 +756,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSBTRD", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Initialize Q to the unit matrix, if needed */ @@ -1276,7 +1277,7 @@ f"> */ } } - return 0; + return; /* End of SSBTRD */ diff --git a/lapack-netlib/SRC/ssfrk.c b/lapack-netlib/SRC/ssfrk.c index 9f340381c5..21b52b0bb4 100644 --- a/lapack-netlib/SRC/ssfrk.c +++ b/lapack-netlib/SRC/ssfrk.c @@ -674,7 +674,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ssfrk_(char *transr, char *uplo, char *trans, integer *n, +/* Subroutine */ void ssfrk_(char *transr, char *uplo, char *trans, integer *n, integer *k, real *alpha, real *a, integer *lda, real *beta, real * c__) { @@ -685,13 +685,13 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ integer info, j; logical normaltransr; extern logical lsame_(char 
*, char *); - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer nrowa; logical lower; integer n1, n2; - extern /* Subroutine */ int ssyrk_(char *, char *, integer *, integer *, + extern /* Subroutine */ void ssyrk_(char *, char *, integer *, integer *, real *, real *, integer *, real *, real *, integer *); integer nk; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -743,7 +743,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (info != 0) { i__1 = -info; xerbla_("SSFRK ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ @@ -752,7 +752,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* done (it is in SSYRK for example) and left in the general case. */ if (*n == 0 || (*alpha == 0.f || *k == 0) && *beta == 1.f) { - return 0; + return; } if (*alpha == 0.f && *beta == 0.f) { @@ -760,7 +760,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (j = 1; j <= i__1; ++j) { c__[j] = 0.f; } - return 0; + return; } /* C is N-by-N. 
*/ @@ -1066,7 +1066,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of SSFRK */ diff --git a/lapack-netlib/SRC/sspcon.c b/lapack-netlib/SRC/sspcon.c index 7172bf6d16..bac0aea5fe 100644 --- a/lapack-netlib/SRC/sspcon.c +++ b/lapack-netlib/SRC/sspcon.c @@ -637,7 +637,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sspcon_(char *uplo, integer *n, real *ap, integer *ipiv, +/* Subroutine */ void sspcon_(char *uplo, integer *n, real *ap, integer *ipiv, real *anorm, real *rcond, real *work, integer *iwork, integer *info) { /* System generated locals */ @@ -648,12 +648,12 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int slacn2_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slacn2_(integer *, real *, real *, integer *, real *, integer *, integer *); integer ip; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int ssptrs_(char *, integer *, integer *, real *, + extern /* Subroutine */ void ssptrs_(char *, integer *, integer *, real *, integer *, real *, integer *, integer *); @@ -687,7 +687,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSPCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -695,9 +695,9 @@ f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm <= 0.f) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. 
*/ @@ -709,7 +709,7 @@ f"> */ ip = *n * (*n + 1) / 2; for (i__ = *n; i__ >= 1; --i__) { if (ipiv[i__] > 0 && ap[ip] == 0.f) { - return 0; + return; } ip -= i__; /* L10: */ @@ -722,7 +722,7 @@ f"> */ i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { if (ipiv[i__] > 0 && ap[ip] == 0.f) { - return 0; + return; } ip = ip + *n - i__ + 1; /* L20: */ @@ -748,7 +748,7 @@ f"> */ *rcond = 1.f / ainvnm / *anorm; } - return 0; + return; /* End of SSPCON */ diff --git a/lapack-netlib/SRC/sspev.c b/lapack-netlib/SRC/sspev.c index 4afd4d2143..12d8f9842b 100644 --- a/lapack-netlib/SRC/sspev.c +++ b/lapack-netlib/SRC/sspev.c @@ -644,7 +644,7 @@ atrices */ /* > \ingroup realOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int sspev_(char *jobz, char *uplo, integer *n, real *ap, +/* Subroutine */ void sspev_(char *jobz, char *uplo, integer *n, real *ap, real *w, real *z__, integer *ldz, real *work, integer *info) { /* System generated locals */ @@ -658,7 +658,7 @@ atrices */ real rmin, rmax, sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); logical wantz; integer iscale; extern real slamch_(char *); @@ -667,9 +667,9 @@ atrices */ real bignum; integer indtau, indwrk; extern real slansp_(char *, char *, integer *, real *, real *); - extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *); + extern /* Subroutine */ void ssterf_(integer *, real *, real *, integer *); real smlnum; - extern /* Subroutine */ int sopgtr_(char *, integer *, real *, real *, + extern /* Subroutine */ void sopgtr_(char *, integer *, real *, real *, real *, integer *, real *, integer *), ssptrd_(char *, integer *, real *, real *, real *, real *, integer *), ssteqr_(char *, integer *, real *, real *, real *, integer *, @@ -714,13 +714,13 @@ atrices */ if (*info != 0) { i__1 = 
-(*info); xerbla_("SSPEV ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -728,7 +728,7 @@ atrices */ if (wantz) { z__[z_dim1 + 1] = 1.f; } - return 0; + return; } /* Get machine constants. */ @@ -787,7 +787,7 @@ atrices */ sscal_(&imax, &r__1, &w[1], &c__1); } - return 0; + return; /* End of SSPEV */ diff --git a/lapack-netlib/SRC/sspevd.c b/lapack-netlib/SRC/sspevd.c index 2fb736bdc3..6bf7d19a81 100644 --- a/lapack-netlib/SRC/sspevd.c +++ b/lapack-netlib/SRC/sspevd.c @@ -691,7 +691,7 @@ f"> */ /* > \ingroup realOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int sspevd_(char *jobz, char *uplo, integer *n, real *ap, +/* Subroutine */ void sspevd_(char *jobz, char *uplo, integer *n, real *ap, real *w, real *z__, integer *ldz, real *work, integer *lwork, integer *iwork, integer *liwork, integer *info) { @@ -704,7 +704,7 @@ f"> */ real anrm, rmin, rmax, sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer lwmin; logical wantz; integer iscale; @@ -713,18 +713,18 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; integer indtau; - extern /* Subroutine */ int sstedc_(char *, integer *, real *, real *, + extern /* Subroutine */ void sstedc_(char *, integer *, real *, real *, real *, integer *, real *, integer *, integer *, integer *, integer *); integer indwrk, liwmin; extern real slansp_(char *, char *, integer *, real *, real *); - extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *); + extern /* Subroutine */ void ssterf_(integer *, real *, real *, integer *); integer llwork; real smlnum; - extern /* Subroutine */ int ssptrd_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssptrd_(char 
*, integer *, real *, real *, real *, real *, integer *); logical lquery; - extern /* Subroutine */ int sopmtr_(char *, char *, char *, integer *, + extern /* Subroutine */ void sopmtr_(char *, char *, char *, integer *, integer *, real *, real *, real *, integer *, real *, integer *); real eps; @@ -793,15 +793,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSPEVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -809,7 +809,7 @@ f"> */ if (wantz) { z__[z_dim1 + 1] = 1.f; } - return 0; + return; } /* Get machine constants. */ @@ -868,7 +868,7 @@ f"> */ work[1] = (real) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of SSPEVD */ diff --git a/lapack-netlib/SRC/sspevx.c b/lapack-netlib/SRC/sspevx.c index 13c3230347..67ece8dfa1 100644 --- a/lapack-netlib/SRC/sspevx.c +++ b/lapack-netlib/SRC/sspevx.c @@ -746,7 +746,7 @@ f"> */ /* > \ingroup realOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int sspevx_(char *jobz, char *range, char *uplo, integer *n, +/* Subroutine */ void sspevx_(char *jobz, char *range, char *uplo, integer *n, real *ap, real *vl, real *vu, integer *il, integer *iu, real *abstol, integer *m, real *w, real *z__, integer *ldz, real *work, integer * iwork, integer *ifail, integer *info) @@ -765,9 +765,9 @@ f"> */ real sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); char order[1]; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ); logical wantz; @@ -781,17 +781,17 @@ f"> */ real abstll, bignum; integer indtau, indisp, indiwo, 
indwrk; extern real slansp_(char *, char *, integer *, real *, real *); - extern /* Subroutine */ int sstein_(integer *, real *, real *, integer *, + extern /* Subroutine */ void sstein_(integer *, real *, real *, integer *, real *, integer *, integer *, real *, integer *, real *, integer * , integer *, integer *), ssterf_(integer *, real *, real *, integer *); integer nsplit; - extern /* Subroutine */ int sstebz_(char *, char *, integer *, real *, + extern /* Subroutine */ void sstebz_(char *, char *, integer *, real *, real *, integer *, integer *, real *, real *, real *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); real smlnum; - extern /* Subroutine */ int sopgtr_(char *, integer *, real *, real *, + extern /* Subroutine */ void sopgtr_(char *, integer *, real *, real *, real *, integer *, real *, integer *), ssptrd_(char *, integer *, real *, real *, real *, real *, integer *), ssteqr_(char *, integer *, real *, real *, real *, integer *, @@ -860,14 +860,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSPEVX", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -883,7 +883,7 @@ f"> */ if (wantz) { z__[z_dim1 + 1] = 1.f; } - return 0; + return; } /* Get machine constants. */ @@ -1049,7 +1049,7 @@ f"> */ } } - return 0; + return; /* End of SSPEVX */ diff --git a/lapack-netlib/SRC/sspevx.f b/lapack-netlib/SRC/sspevx.f index e33712d58f..6d60ed7aca 100644 --- a/lapack-netlib/SRC/sspevx.f +++ b/lapack-netlib/SRC/sspevx.f @@ -255,7 +255,7 @@ SUBROUTINE SSPEVX( JOBZ, RANGE, UPLO, N, AP, VL, VU, IL, IU, * .. Local Scalars .. 
LOGICAL ALLEIG, INDEIG, TEST, VALEIG, WANTZ CHARACTER ORDER - INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL, + INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, $ INDISP, INDIWO, INDTAU, INDWRK, ISCALE, ITMP1, $ J, JJ, NSPLIT REAL ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, @@ -424,17 +424,16 @@ SUBROUTINE SSPEVX( JOBZ, RANGE, UPLO, N, AP, VL, VU, IL, IU, ELSE ORDER = 'E' END IF - INDIBL = 1 - INDISP = INDIBL + N + INDISP = 1 + N INDIWO = INDISP + N CALL SSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL, $ WORK( INDD ), WORK( INDE ), M, NSPLIT, W, - $ IWORK( INDIBL ), IWORK( INDISP ), WORK( INDWRK ), + $ IWORK( 1 ), IWORK( INDISP ), WORK( INDWRK ), $ IWORK( INDIWO ), INFO ) * IF( WANTZ ) THEN CALL SSTEIN( N, WORK( INDD ), WORK( INDE ), M, W, - $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ, + $ IWORK( 1 ), IWORK( INDISP ), Z, LDZ, $ WORK( INDWRK ), IWORK( INDIWO ), IFAIL, INFO ) * * Apply orthogonal matrix used in reduction to tridiagonal @@ -471,11 +470,11 @@ SUBROUTINE SSPEVX( JOBZ, RANGE, UPLO, N, AP, VL, VU, IL, IU, 30 CONTINUE * IF( I.NE.0 ) THEN - ITMP1 = IWORK( INDIBL+I-1 ) + ITMP1 = IWORK( 1 + I-1 ) W( I ) = W( J ) - IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 ) + IWORK( 1 + I-1 ) = IWORK( 1 + J-1 ) W( J ) = TMP1 - IWORK( INDIBL+J-1 ) = ITMP1 + IWORK( 1 + J-1 ) = ITMP1 CALL SSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) IF( INFO.NE.0 ) THEN ITMP1 = IFAIL( I ) diff --git a/lapack-netlib/SRC/sspgst.c b/lapack-netlib/SRC/sspgst.c index cc70b24371..07b9eb822c 100644 --- a/lapack-netlib/SRC/sspgst.c +++ b/lapack-netlib/SRC/sspgst.c @@ -628,7 +628,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sspgst_(integer *itype, char *uplo, integer *n, real *ap, +/* Subroutine */ void sspgst_(integer *itype, char *uplo, integer *n, real *ap, real *bp, integer *info) { /* System generated locals */ @@ -637,14 +637,14 @@ f"> */ /* Local variables */ extern real sdot_(integer *, real *, 
integer *, real *, integer *); - extern /* Subroutine */ int sspr2_(char *, integer *, real *, real *, + extern /* Subroutine */ void sspr2_(char *, integer *, real *, real *, integer *, real *, integer *, real *); integer j, k; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); logical upper; integer j1, k1; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + extern /* Subroutine */ void saxpy_(integer *, real *, real *, integer *, real *, integer *), sspmv_(char *, integer *, real *, real *, real *, integer *, real *, real *, integer *), stpmv_( char *, char *, char *, integer *, real *, real *, integer *), stpsv_(char *, char *, char *, integer *, @@ -687,7 +687,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSPGST", &i__1, (ftnlen)6); - return 0; + return; } if (*itype == 1) { @@ -826,7 +826,7 @@ f"> */ } } } - return 0; + return; /* End of SSPGST */ diff --git a/lapack-netlib/SRC/sspgv.c b/lapack-netlib/SRC/sspgv.c index 6ca0b3a4b6..5e9baa716e 100644 --- a/lapack-netlib/SRC/sspgv.c +++ b/lapack-netlib/SRC/sspgv.c @@ -672,7 +672,7 @@ static integer c__1 = 1; /* > \ingroup realOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int sspgv_(integer *itype, char *jobz, char *uplo, integer * +/* Subroutine */ void sspgv_(integer *itype, char *jobz, char *uplo, integer * n, real *ap, real *bp, real *w, real *z__, integer *ldz, real *work, integer *info) { @@ -684,12 +684,14 @@ static integer c__1 = 1; extern logical lsame_(char *, char *); char trans[1]; logical upper; - extern /* Subroutine */ int sspev_(char *, char *, integer *, real *, + extern /* Subroutine */ void sspev_(char *, char *, integer *, real *, real *, real *, integer *, real *, integer *); logical wantz; - extern /* Subroutine */ int stpmv_(char *, char *, char *, integer 
*, + extern /* Subroutine */ void stpmv_(char *, char *, char *, integer *, real *, real *, integer *), stpsv_(char *, - char *, char *, integer *, real *, real *, integer *), xerbla_(char *, integer *, ftnlen), spptrf_(char + char *, char *, integer *, real *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void spptrf_(char *, integer *, real *, integer *), sspgst_(integer *, char *, integer *, real *, real *, integer *); @@ -733,13 +735,13 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("SSPGV ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. */ @@ -747,7 +749,7 @@ static integer c__1 = 1; spptrf_(uplo, n, &bp[1], info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. */ @@ -800,7 +802,7 @@ static integer c__1 = 1; } } } - return 0; + return; /* End of SSPGV */ diff --git a/lapack-netlib/SRC/sspgvd.c b/lapack-netlib/SRC/sspgvd.c index b2e0c7e8c1..f63aaaf182 100644 --- a/lapack-netlib/SRC/sspgvd.c +++ b/lapack-netlib/SRC/sspgvd.c @@ -722,7 +722,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. 
of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int sspgvd_(integer *itype, char *jobz, char *uplo, integer * +/* Subroutine */ void sspgvd_(integer *itype, char *jobz, char *uplo, integer * n, real *ap, real *bp, real *w, real *z__, integer *ldz, real *work, integer *lwork, integer *iwork, integer *liwork, integer *info) { @@ -736,16 +736,17 @@ f"> */ integer lwmin; char trans[1]; logical upper, wantz; - extern /* Subroutine */ int stpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void stpmv_(char *, char *, char *, integer *, real *, real *, integer *), stpsv_(char *, - char *, char *, integer *, real *, real *, integer *), xerbla_(char *, integer *, ftnlen); + char *, char *, integer *, real *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer liwmin; - extern /* Subroutine */ int sspevd_(char *, char *, integer *, real *, + extern /* Subroutine */ void sspevd_(char *, char *, integer *, real *, real *, real *, integer *, real *, integer *, integer *, integer * , integer *), spptrf_(char *, integer *, real *, integer *); logical lquery; - extern /* Subroutine */ int sspgst_(integer *, char *, integer *, real *, + extern /* Subroutine */ void sspgst_(integer *, char *, integer *, real *, real *, integer *); @@ -815,15 +816,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSPGVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of BP. */ @@ -831,7 +832,7 @@ f"> */ spptrf_(uplo, n, &bp[1], info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. 
*/ @@ -895,7 +896,7 @@ f"> */ work[1] = (real) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of SSPGVD */ diff --git a/lapack-netlib/SRC/sspgvd.f b/lapack-netlib/SRC/sspgvd.f index 9db8de08c9..73862ed1b8 100644 --- a/lapack-netlib/SRC/sspgvd.f +++ b/lapack-netlib/SRC/sspgvd.f @@ -307,8 +307,8 @@ SUBROUTINE SSPGVD( ITYPE, JOBZ, UPLO, N, AP, BP, W, Z, LDZ, WORK, CALL SSPGST( ITYPE, UPLO, N, AP, BP, INFO ) CALL SSPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK, IWORK, $ LIWORK, INFO ) - LWMIN = MAX( REAL( LWMIN ), REAL( WORK( 1 ) ) ) - LIWMIN = MAX( REAL( LIWMIN ), REAL( IWORK( 1 ) ) ) + LWMIN = INT( MAX( REAL( LWMIN ), REAL( WORK( 1 ) ) ) ) + LIWMIN = INT( MAX( REAL( LIWMIN ), REAL( IWORK( 1 ) ) ) ) * IF( WANTZ ) THEN * diff --git a/lapack-netlib/SRC/sspgvx.c b/lapack-netlib/SRC/sspgvx.c index 241c34e371..93913d3c76 100644 --- a/lapack-netlib/SRC/sspgvx.c +++ b/lapack-netlib/SRC/sspgvx.c @@ -783,7 +783,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int sspgvx_(integer *itype, char *jobz, char *range, char * +/* Subroutine */ void sspgvx_(integer *itype, char *jobz, char *range, char * uplo, integer *n, real *ap, real *bp, real *vl, real *vu, integer *il, integer *iu, real *abstol, integer *m, real *w, real *z__, integer * ldz, real *work, integer *iwork, integer *ifail, integer *info) @@ -796,11 +796,12 @@ f"> */ extern logical lsame_(char *, char *); char trans[1]; logical upper, wantz; - extern /* Subroutine */ int stpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void stpmv_(char *, char *, char *, integer *, real *, real *, integer *), stpsv_(char *, char *, char *, integer *, real *, real *, integer *); logical alleig, indeig, valeig; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), spptrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void spptrf_( char *, 
integer *, real *, integer *), sspgst_(integer *, char *, integer *, real *, real *, integer *), sspevx_( char *, char *, char *, integer *, real *, real *, real *, @@ -871,14 +872,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSPGVX", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. */ @@ -886,7 +887,7 @@ f"> */ spptrf_(uplo, n, &bp[1], info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. */ @@ -940,7 +941,7 @@ f"> */ } } - return 0; + return; /* End of SSPGVX */ diff --git a/lapack-netlib/SRC/ssprfs.c b/lapack-netlib/SRC/ssprfs.c index a1a16fb764..e2f5ee6288 100644 --- a/lapack-netlib/SRC/ssprfs.c +++ b/lapack-netlib/SRC/ssprfs.c @@ -693,7 +693,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ssprfs_(char *uplo, integer *n, integer *nrhs, real *ap, +/* Subroutine */ void ssprfs_(char *uplo, integer *n, integer *nrhs, real *ap, real *afp, integer *ipiv, real *b, integer *ldb, real *x, integer * ldx, real *ferr, real *berr, real *work, integer *iwork, integer * info) @@ -710,7 +710,7 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3], count; logical upper; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *), sspmv_(char *, integer *, real *, real *, real *, integer *, real *, real *, integer *), slacn2_(integer *, @@ -722,7 +722,7 @@ f"> */ real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real lstres; - extern /* Subroutine */ int ssptrs_(char *, integer *, integer *, real *, + extern /* Subroutine */ void ssptrs_(char *, integer *, integer *, real *, 
integer *, real *, integer *, integer *); real eps; @@ -770,7 +770,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSPRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -782,7 +782,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -985,7 +985,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of SSPRFS */ diff --git a/lapack-netlib/SRC/sspsv.c b/lapack-netlib/SRC/sspsv.c index 8d592942be..885c89c307 100644 --- a/lapack-netlib/SRC/sspsv.c +++ b/lapack-netlib/SRC/sspsv.c @@ -671,7 +671,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sspsv_(char *uplo, integer *n, integer *nrhs, real *ap, +/* Subroutine */ void sspsv_(char *uplo, integer *n, integer *nrhs, real *ap, integer *ipiv, real *b, integer *ldb, integer *info) { /* System generated locals */ @@ -679,7 +679,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), ssptrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void ssptrf_( char *, integer *, real *, integer *, integer *), ssptrs_( char *, integer *, integer *, real *, integer *, real *, integer * , integer *); @@ -717,7 +718,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("SSPSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the factorization A = U*D*U**T or A = L*D*L**T. 
*/ @@ -730,7 +731,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ ssptrs_(uplo, n, nrhs, &ap[1], &ipiv[1], &b[b_offset], ldb, info); } - return 0; + return; /* End of SSPSV */ diff --git a/lapack-netlib/SRC/sspsvx.c b/lapack-netlib/SRC/sspsvx.c index c831a6a9f3..52e97e7517 100644 --- a/lapack-netlib/SRC/sspsvx.c +++ b/lapack-netlib/SRC/sspsvx.c @@ -788,7 +788,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sspsvx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void sspsvx_(char *fact, char *uplo, integer *n, integer * nrhs, real *ap, real *afp, integer *ipiv, real *b, integer *ldb, real *x, integer *ldx, real *rcond, real *ferr, real *berr, real *work, integer *iwork, integer *info) @@ -799,15 +799,16 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); real anorm; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); extern real slamch_(char *); logical nofact; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slacpy_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slacpy_( char *, integer *, integer *, real *, integer *, real *, integer * ); extern real slansp_(char *, char *, integer *, real *, real *); - extern /* Subroutine */ int sspcon_(char *, integer *, real *, integer *, + extern /* Subroutine */ void sspcon_(char *, integer *, real *, integer *, real *, real *, real *, integer *, integer *), ssprfs_( char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *, real *, real *, real *, integer *, @@ -862,7 +863,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSPSVX", &i__1, (ftnlen)6); - return 0; + return; } if (nofact) { @@ -877,7 +878,7 @@ f"> */ if (*info > 0) { *rcond = 0.f; - return 0; + 
return; } } @@ -907,7 +908,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of SSPSVX */ diff --git a/lapack-netlib/SRC/ssptrd.c b/lapack-netlib/SRC/ssptrd.c index f8976db3af..df599e7821 100644 --- a/lapack-netlib/SRC/ssptrd.c +++ b/lapack-netlib/SRC/ssptrd.c @@ -665,7 +665,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ssptrd_(char *uplo, integer *n, real *ap, real *d__, +/* Subroutine */ void ssptrd_(char *uplo, integer *n, real *ap, real *d__, real *e, real *tau, integer *info) { /* System generated locals */ @@ -675,17 +675,18 @@ f"> */ real taui; extern real sdot_(integer *, real *, integer *, real *, integer *); integer i__; - extern /* Subroutine */ int sspr2_(char *, integer *, real *, real *, + extern /* Subroutine */ void sspr2_(char *, integer *, real *, real *, integer *, real *, integer *, real *); real alpha; extern logical lsame_(char *, char *); integer i1; logical upper; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + extern /* Subroutine */ void saxpy_(integer *, real *, real *, integer *, real *, integer *), sspmv_(char *, integer *, real *, real *, real *, integer *, real *, real *, integer *); integer ii; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slarfg_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slarfg_( integer *, real *, real *, integer *, real *); integer i1i1; @@ -718,13 +719,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSPTRD", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } if (upper) { @@ -826,7 +827,7 @@ f"> */ d__[*n] = ap[ii]; } - return 0; + return; /* End of SSPTRD */ diff --git a/lapack-netlib/SRC/ssptrf.c b/lapack-netlib/SRC/ssptrf.c index fad8532f7b..89d5414d97 100644 --- a/lapack-netlib/SRC/ssptrf.c +++ b/lapack-netlib/SRC/ssptrf.c @@ -670,7 +670,7 @@ f"> */ /* > 
\endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ssptrf_(char *uplo, integer *n, real *ap, integer *ipiv, +/* Subroutine */ void ssptrf_(char *uplo, integer *n, real *ap, integer *ipiv, integer *info) { /* System generated locals */ @@ -679,15 +679,15 @@ f"> */ /* Local variables */ integer imax, jmax; - extern /* Subroutine */ int sspr_(char *, integer *, real *, real *, + extern /* Subroutine */ void sspr_(char *, integer *, real *, real *, integer *, real *); integer i__, j, k; real t, alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer kstep; logical upper; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *); real r1, d11, d12, d21, d22; integer kc, kk, kp; @@ -726,7 +726,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSPTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. 
*/ @@ -1165,7 +1165,7 @@ f"> */ } L110: - return 0; + return; /* End of SSPTRF */ diff --git a/lapack-netlib/SRC/ssptri.c b/lapack-netlib/SRC/ssptri.c index 79e24f37bf..ee09a21a4c 100644 --- a/lapack-netlib/SRC/ssptri.c +++ b/lapack-netlib/SRC/ssptri.c @@ -624,7 +624,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ssptri_(char *uplo, integer *n, real *ap, integer *ipiv, +/* Subroutine */ void ssptri_(char *uplo, integer *n, real *ap, integer *ipiv, real *work, integer *info) { /* System generated locals */ @@ -640,7 +640,7 @@ f"> */ extern logical lsame_(char *, char *); integer kstep; logical upper; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ), sspmv_(char *, integer *, real *, real *, real *, integer *, real *, real *, integer *); @@ -678,13 +678,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSPTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. 
*/ @@ -696,7 +696,7 @@ f"> */ kp = *n * (*n + 1) / 2; for (*info = *n; *info >= 1; --(*info)) { if (ipiv[*info] > 0 && ap[kp] == 0.f) { - return 0; + return; } kp -= *info; /* L10: */ @@ -709,7 +709,7 @@ f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (ipiv[*info] > 0 && ap[kp] == 0.f) { - return 0; + return; } kp = kp + *n - *info + 1; /* L20: */ @@ -950,7 +950,7 @@ f"> */ ; } - return 0; + return; /* End of SSPTRI */ diff --git a/lapack-netlib/SRC/ssptrs.c b/lapack-netlib/SRC/ssptrs.c index c7b5c05ee9..f6f2cb403b 100644 --- a/lapack-netlib/SRC/ssptrs.c +++ b/lapack-netlib/SRC/ssptrs.c @@ -630,7 +630,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ssptrs_(char *uplo, integer *n, integer *nrhs, real *ap, +/* Subroutine */ void ssptrs_(char *uplo, integer *n, integer *nrhs, real *ap, integer *ipiv, real *b, integer *ldb, integer *info) { /* System generated locals */ @@ -638,17 +638,17 @@ f"> */ real r__1; /* Local variables */ - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); real akm1k; integer j, k; extern logical lsame_(char *, char *); real denom; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); logical upper; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *); real ak, bk; integer kc, kp; @@ -687,13 +687,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSPTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - 
return 0; + return; } if (upper) { @@ -1000,7 +1000,7 @@ f"> */ ; } - return 0; + return; /* End of SSPTRS */ diff --git a/lapack-netlib/SRC/sstebz.c b/lapack-netlib/SRC/sstebz.c index e73f9ce9c7..ebe4d0c96a 100644 --- a/lapack-netlib/SRC/sstebz.c +++ b/lapack-netlib/SRC/sstebz.c @@ -789,7 +789,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sstebz_(char *range, char *order, integer *n, real *vl, +/* Subroutine */ void sstebz_(char *range, char *order, integer *n, real *vl, real *vu, integer *il, integer *iu, real *abstol, real *d__, real *e, integer *m, integer *nsplit, real *w, integer *iblock, integer * isplit, real *work, integer *iwork, integer *info) @@ -821,7 +821,7 @@ f"> */ integer *, integer *, ftnlen, ftnlen); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer idiscu; - extern /* Subroutine */ int slaebz_(integer *, integer *, integer *, + extern /* Subroutine */ void slaebz_(integer *, integer *, integer *, integer *, integer *, integer *, real *, real *, real *, real *, real *, real *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); @@ -899,7 +899,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSTEBZ", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize error flags */ @@ -912,7 +912,7 @@ f"> */ *m = 0; if (*n == 0) { - return 0; + return; } /* Simplifications: */ @@ -946,7 +946,7 @@ f"> */ iblock[1] = 1; *m = 1; } - return 0; + return; } /* Compute Splitting Points */ @@ -1060,7 +1060,7 @@ f"> */ if (nwl < 0 || nwl >= *n || nwu < 1 || nwu > *n) { *info = 4; - return 0; + return; } } else { @@ -1363,7 +1363,7 @@ f"> */ if (toofew) { *info += 2; } - return 0; + return; /* End of SSTEBZ */ diff --git a/lapack-netlib/SRC/sstedc.c b/lapack-netlib/SRC/sstedc.c index 46ed15a1af..10ea1249fe 100644 --- a/lapack-netlib/SRC/sstedc.c +++ b/lapack-netlib/SRC/sstedc.c @@ -705,7 +705,7 @@ f"> */ /* > 
Modified by Francoise Tisseur, University of Tennessee */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sstedc_(char *compz, integer *n, real *d__, real *e, +/* Subroutine */ void sstedc_(char *compz, integer *n, real *d__, real *e, real *z__, integer *ldz, real *work, integer *lwork, integer *iwork, integer *liwork, integer *info) { @@ -718,11 +718,11 @@ f"> */ integer i__, j, k, m; real p; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer lwmin, start; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *), slaed0_(integer *, integer *, integer *, real *, real *, real *, integer *, real *, integer *, real *, integer *, integer *); @@ -732,18 +732,18 @@ f"> */ extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer finish; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *); integer liwmin, icompz; real orgnrm; extern real slanst_(char *, integer *, real *, real *); - extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *), + extern /* Subroutine */ void ssterf_(integer *, real *, real *, integer *), slasrt_(char *, integer *, real *, integer *); logical lquery; integer smlsiz; - extern /* Subroutine */ int ssteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void 
ssteqr_(char *, integer *, real *, real *, real *, integer *, real *, integer *); integer storez, strtrw, lgn; real eps; @@ -804,10 +804,10 @@ f"> */ lwmin = *n - 1 << 1; } else { lgn = (integer) (log((real) (*n)) / log(2.f)); - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } if (icompz == 1) { @@ -835,21 +835,21 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSTEDC", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { if (icompz != 0) { z__[z_dim1 + 1] = 1.f; } - return 0; + return; } /* If the following conditional clause is removed, then the routine */ @@ -1031,7 +1031,7 @@ f"> */ work[1] = (real) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of SSTEDC */ diff --git a/lapack-netlib/SRC/sstegr.c b/lapack-netlib/SRC/sstegr.c index 28c581d075..3971609e49 100644 --- a/lapack-netlib/SRC/sstegr.c +++ b/lapack-netlib/SRC/sstegr.c @@ -772,7 +772,7 @@ f"> */ /* > Christof Voemel, LBNL/NERSC, USA \n */ /* ===================================================================== */ -/* Subroutine */ int sstegr_(char *jobz, char *range, integer *n, real *d__, +/* Subroutine */ void sstegr_(char *jobz, char *range, integer *n, real *d__, real *e, real *vl, real *vu, integer *il, integer *iu, real *abstol, integer *m, real *w, real *z__, integer *ldz, integer *isuppz, real * work, integer *lwork, integer *iwork, integer *liwork, integer *info) @@ -782,7 +782,7 @@ f"> */ /* Local variables */ logical tryrac; - extern /* Subroutine */ int sstemr_(char *, char *, integer *, real *, + extern /* Subroutine */ void sstemr_(char *, char *, integer *, real *, real *, real *, real *, integer *, integer *, integer *, real *, real *, integer *, integer *, integer *, logical *, real *, integer *, integer *, integer *, integer *); @@ -816,6 +816,6 @@ f"> */ /* 
End of SSTEGR */ - return 0; + return; } /* sstegr_ */ diff --git a/lapack-netlib/SRC/sstein.c b/lapack-netlib/SRC/sstein.c index 4eeb3d19b8..42c85fa5b2 100644 --- a/lapack-netlib/SRC/sstein.c +++ b/lapack-netlib/SRC/sstein.c @@ -688,7 +688,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int sstein_(integer *n, real *d__, real *e, integer *m, real +/* Subroutine */ void sstein_(integer *n, real *d__, real *e, integer *m, real *w, integer *iblock, integer *isplit, real *z__, integer *ldz, real * work, integer *iwork, integer *ifail, integer *info) { @@ -701,26 +701,27 @@ f"> */ extern real sdot_(integer *, real *, integer *, real *, integer *), snrm2_(integer *, real *, integer *); integer i__, j, iseed[4], gpind, iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer b1, j1; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); real ortol; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + extern /* Subroutine */ void saxpy_(integer *, real *, real *, integer *, real *, integer *); integer indrv1, indrv2, indrv3, indrv4, indrv5, bn; real xj; extern real slamch_(char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slagtf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slagtf_( integer *, real *, real *, real *, real *, real *, real *, integer *, integer *); integer nrmchk; extern integer isamax_(integer *, real *, integer *); - extern /* Subroutine */ int slagts_(integer *, integer *, real *, real *, + extern /* Subroutine */ void slagts_(integer *, integer *, real *, real *, real *, real *, integer *, real *, real *, integer *); integer blksiz; real onenrm, pertol; - 
extern /* Subroutine */ int slarnv_(integer *, integer *, integer *, real + extern /* Subroutine */ void slarnv_(integer *, integer *, integer *, real *); real stpcrt, scl, eps, ctr, sep, nrm, tol; integer its; @@ -785,16 +786,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSTEIN", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *m == 0) { - return 0; + return; } else if (*n == 1) { z__[z_dim1 + 1] = 1.f; - return 0; + return; } /* Get machine constants. */ @@ -1009,7 +1010,7 @@ f"> */ ; } - return 0; + return; /* End of SSTEIN */ diff --git a/lapack-netlib/SRC/sstemr.c b/lapack-netlib/SRC/sstemr.c index b531916892..f008b70459 100644 --- a/lapack-netlib/SRC/sstemr.c +++ b/lapack-netlib/SRC/sstemr.c @@ -833,7 +833,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int sstemr_(char *jobz, char *range, integer *n, real *d__, +/* Subroutine */ void sstemr_(char *jobz, char *range, integer *n, real *d__, real *e, real *vl, real *vu, integer *il, integer *iu, integer *m, real *w, real *z__, integer *ldz, integer *nzc, integer *isuppz, logical *tryrac, real *work, integer *lwork, integer *iwork, integer * @@ -849,7 +849,7 @@ f"> */ integer itmp; real tnrm; integer inde2; - extern /* Subroutine */ int slae2_(real *, real *, real *, real *, real *) + extern /* Subroutine */ void slae2_(real *, real *, real *, real *, real *) ; integer itmp2; real rtol1, rtol2; @@ -858,14 +858,14 @@ f"> */ integer indgp; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer iindw, ilast, lwmin; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), 
sswap_(integer *, real *, integer *, real *, integer * ); logical wantz; real r1, r2; - extern /* Subroutine */ int slaev2_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slaev2_(real *, real *, real *, real *, real * , real *, real *); integer jj; real cs; @@ -880,7 +880,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; integer inderr, iindwk, indgrs, offset; - extern /* Subroutine */ int slarrc_(char *, integer *, real *, real *, + extern /* Subroutine */ void slarrc_(char *, integer *, real *, real *, real *, real *, real *, integer *, integer *, integer *, integer * ), slarre_(char *, integer *, real *, real *, integer *, integer *, real *, real *, real *, real *, real *, real *, @@ -891,18 +891,18 @@ f"> */ integer iinspl, indwrk, ifirst, liwmin, nzcmin; real pivmin; extern real slanst_(char *, integer *, real *, real *); - extern /* Subroutine */ int slarrj_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slarrj_(integer *, real *, real *, integer *, integer *, real *, integer *, real *, real *, real *, integer *, real *, real *, integer *), slarrr_(integer *, real *, real *, integer *); integer nsplit; - extern /* Subroutine */ int slarrv_(integer *, real *, real *, real *, + extern /* Subroutine */ void slarrv_(integer *, real *, real *, real *, real *, real *, integer *, integer *, integer *, integer *, real * , real *, real *, real *, real *, real *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer * ); real smlnum; - extern /* Subroutine */ int slasrt_(char *, integer *, real *, integer *); + extern /* Subroutine */ void slasrt_(char *, integer *, real *, integer *); logical lquery, zquery; integer iil, iiu; real eps, tmp; @@ -1024,16 +1024,16 @@ f"> */ i__1 = -(*info); xerbla_("SSTEMR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery || zquery) { - return 0; + return; } /* Handle N = 0, 1, and 2 cases immediately */ *m = 0; 
if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -1051,7 +1051,7 @@ f"> */ isuppz[1] = 1; isuppz[2] = 1; } - return 0; + return; } if (*n == 2) { @@ -1201,7 +1201,7 @@ f"> */ work[indgrs], &pivmin, &work[indwrk], &iwork[iindwk], &iinfo); if (iinfo != 0) { *info = abs(iinfo) + 10; - return 0; + return; } /* Note that if RANGE .NE. 'V', SLARRE computes bounds on the desired */ /* part of the spectrum. All desired eigenvalues are contained in */ @@ -1218,7 +1218,7 @@ f"> */ iindwk], &iinfo); if (iinfo != 0) { *info = abs(iinfo) + 20; - return 0; + return; } } else { /* SLARRE computes eigenvalues of the (shifted) root representation */ @@ -1287,7 +1287,7 @@ f"> */ slasrt_("I", m, &w[1], &iinfo); if (iinfo != 0) { *info = 3; - return 0; + return; } } else { i__1 = *m - 1; @@ -1324,7 +1324,7 @@ f"> */ work[1] = (real) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of SSTEMR */ diff --git a/lapack-netlib/SRC/ssteqr.c b/lapack-netlib/SRC/ssteqr.c index 02ca293c7f..8a43b9425d 100644 --- a/lapack-netlib/SRC/ssteqr.c +++ b/lapack-netlib/SRC/ssteqr.c @@ -648,7 +648,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ssteqr_(char *compz, integer *n, real *d__, real *e, +/* Subroutine */ void ssteqr_(char *compz, integer *n, real *d__, real *e, real *z__, integer *ldz, real *work, integer *info) { /* System generated locals */ @@ -657,20 +657,20 @@ f"> */ /* Local variables */ integer lend, jtot; - extern /* Subroutine */ int slae2_(real *, real *, real *, real *, real *) + extern /* Subroutine */ void slae2_(real *, real *, real *, real *, real *) ; real b, c__, f, g; integer i__, j, k, l, m; real p, r__, s; extern logical lsame_(char *, char *); real anorm; - extern /* Subroutine */ int slasr_(char *, char *, char *, integer *, + extern /* Subroutine */ void slasr_(char *, char *, char *, integer *, integer *, real *, real *, real *, integer *); integer l1; - 
extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *); integer lendm1, lendp1; - extern /* Subroutine */ int slaev2_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slaev2_(real *, real *, real *, real *, real * , real *, real *); extern real slapy2_(real *, real *); integer ii, mm, iscale; @@ -678,17 +678,17 @@ f"> */ real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real safmax; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); integer lendsv; - extern /* Subroutine */ int slartg_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slartg_(real *, real *, real *, real *, real * ), slaset_(char *, integer *, integer *, real *, real *, real *, integer *); real ssfmin; integer nmaxit, icompz; real ssfmax; extern real slanst_(char *, integer *, real *, real *); - extern /* Subroutine */ int slasrt_(char *, integer *, real *, integer *); + extern /* Subroutine */ void slasrt_(char *, integer *, real *, integer *); integer lm1, mm1, nm1; real rt1, rt2, eps; integer lsv; @@ -736,20 +736,20 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSTEQR", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { if (icompz == 2) { z__[z_dim1 + 1] = 1.f; } - return 0; + return; } /* Determine the unit roundoff and over/underflow thresholds. 
*/ @@ -1163,7 +1163,7 @@ f"> */ } L190: - return 0; + return; /* End of SSTEQR */ diff --git a/lapack-netlib/SRC/ssterf.c b/lapack-netlib/SRC/ssterf.c index 4581f3cfe0..3e815cfc96 100644 --- a/lapack-netlib/SRC/ssterf.c +++ b/lapack-netlib/SRC/ssterf.c @@ -601,7 +601,7 @@ f"> */ /* > \ingroup auxOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ssterf_(integer *n, real *d__, real *e, integer *info) +/* Subroutine */ void ssterf_(integer *n, real *d__, real *e, integer *info) { /* System generated locals */ integer i__1; @@ -610,7 +610,7 @@ f"> */ /* Local variables */ real oldc; integer lend, jtot; - extern /* Subroutine */ int slae2_(real *, real *, real *, real *, real *) + extern /* Subroutine */ void slae2_(real *, real *, real *, real *, real *) ; real c__; integer i__, l, m; @@ -624,14 +624,14 @@ f"> */ real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real safmax; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); integer lendsv; real ssfmin; integer nmaxit; real ssfmax; extern real slanst_(char *, integer *, real *, real *); - extern /* Subroutine */ int slasrt_(char *, integer *, real *, integer *); + extern /* Subroutine */ void slasrt_(char *, integer *, real *, integer *); real rt1, rt2, eps, rte; integer lsv; real eps2; @@ -661,10 +661,10 @@ f"> */ *info = -1; i__1 = -(*info); xerbla_("SSTERF", &i__1, (ftnlen)6); - return 0; + return; } if (*n <= 1) { - return 0; + return; } /* Determine the unit roundoff for this environment. 
*/ @@ -994,7 +994,7 @@ f"> */ slasrt_("I", n, &d__[1], info); L180: - return 0; + return; /* End of SSTERF */ diff --git a/lapack-netlib/SRC/sstev.c b/lapack-netlib/SRC/sstev.c index 0a38320b39..fc6a681481 100644 --- a/lapack-netlib/SRC/sstev.c +++ b/lapack-netlib/SRC/sstev.c @@ -630,7 +630,7 @@ atrices */ /* > \ingroup realOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int sstev_(char *jobz, integer *n, real *d__, real *e, real * +/* Subroutine */ void sstev_(char *jobz, integer *n, real *d__, real *e, real * z__, integer *ldz, real *work, integer *info) { /* System generated locals */ @@ -641,7 +641,7 @@ atrices */ integer imax; real rmin, rmax, tnrm, sigma; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); logical wantz; integer iscale; extern real slamch_(char *); @@ -649,9 +649,9 @@ atrices */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; extern real slanst_(char *, integer *, real *, real *); - extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *); + extern /* Subroutine */ void ssterf_(integer *, real *, real *, integer *); real smlnum; - extern /* Subroutine */ int ssteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssteqr_(char *, integer *, real *, real *, real *, integer *, real *, integer *); real eps; @@ -690,20 +690,20 @@ atrices */ if (*info != 0) { i__1 = -(*info); xerbla_("SSTEV ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { if (wantz) { z__[z_dim1 + 1] = 1.f; } - return 0; + return; } /* Get machine constants. 
*/ @@ -753,7 +753,7 @@ atrices */ sscal_(&imax, &r__1, &d__[1], &c__1); } - return 0; + return; /* End of SSTEV */ diff --git a/lapack-netlib/SRC/sstevd.c b/lapack-netlib/SRC/sstevd.c index fc6d24f4eb..dfb1f5d741 100644 --- a/lapack-netlib/SRC/sstevd.c +++ b/lapack-netlib/SRC/sstevd.c @@ -676,7 +676,7 @@ f"> */ /* > \ingroup realOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int sstevd_(char *jobz, integer *n, real *d__, real *e, real +/* Subroutine */ void sstevd_(char *jobz, integer *n, real *d__, real *e, real *z__, integer *ldz, real *work, integer *lwork, integer *iwork, integer *liwork, integer *info) { @@ -687,7 +687,7 @@ f"> */ /* Local variables */ real rmin, rmax, tnrm, sigma; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer lwmin; logical wantz; integer iscale; @@ -695,12 +695,12 @@ f"> */ real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; - extern /* Subroutine */ int sstedc_(char *, integer *, real *, real *, + extern /* Subroutine */ void sstedc_(char *, integer *, real *, real *, real *, integer *, real *, integer *, integer *, integer *, integer *); integer liwmin; extern real slanst_(char *, integer *, real *, real *); - extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *); + extern /* Subroutine */ void ssterf_(integer *, real *, real *, integer *); real smlnum; logical lquery; real eps; @@ -762,22 +762,22 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSTEVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { if (wantz) { z__[z_dim1 + 1] = 1.f; } - return 0; + return; } /* Get machine constants. 
*/ @@ -826,7 +826,7 @@ f"> */ work[1] = (real) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of SSTEVD */ diff --git a/lapack-netlib/SRC/sstevr.c b/lapack-netlib/SRC/sstevr.c index 3733acf249..13bcca0e2c 100644 --- a/lapack-netlib/SRC/sstevr.c +++ b/lapack-netlib/SRC/sstevr.c @@ -822,7 +822,7 @@ f"> */ /* > California at Berkeley, USA \n */ /* > */ /* ===================================================================== */ -/* Subroutine */ int sstevr_(char *jobz, char *range, integer *n, real *d__, +/* Subroutine */ void sstevr_(char *jobz, char *range, integer *n, real *d__, real *e, real *vl, real *vu, integer *il, integer *iu, real *abstol, integer *m, real *w, real *z__, integer *ldz, integer *isuppz, real * work, integer *lwork, integer *iwork, integer *liwork, integer *info) @@ -839,10 +839,10 @@ f"> */ integer i__, j; real sigma; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); char order[1]; integer lwmin; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ); logical wantz; @@ -859,17 +859,17 @@ f"> */ integer indisp, indiwo, liwmin; logical tryrac; extern real slanst_(char *, integer *, real *, real *); - extern /* Subroutine */ int sstein_(integer *, real *, real *, integer *, + extern /* Subroutine */ void sstein_(integer *, real *, real *, integer *, real *, integer *, integer *, real *, integer *, real *, integer * , integer *, integer *), ssterf_(integer *, real *, real *, integer *); integer nsplit; - extern /* Subroutine */ int sstebz_(char *, char *, integer *, real *, + extern /* Subroutine */ void sstebz_(char *, char *, integer *, real *, real *, integer *, integer *, real *, real *, real *, integer *, integer *, real *, integer *, 
integer *, real *, integer *, integer *); real smlnum; - extern /* Subroutine */ int sstemr_(char *, char *, integer *, real *, + extern /* Subroutine */ void sstemr_(char *, char *, integer *, real *, real *, real *, real *, integer *, integer *, integer *, real *, real *, integer *, integer *, integer *, logical *, real *, integer *, integer *, integer *, integer *); @@ -958,16 +958,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSTEVR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -983,7 +983,7 @@ f"> */ if (wantz) { z__[z_dim1 + 1] = 1.f; } - return 0; + return; } /* Get machine constants. */ @@ -1139,7 +1139,7 @@ f"> */ work[1] = (real) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of SSTEVR */ diff --git a/lapack-netlib/SRC/sstevx.c b/lapack-netlib/SRC/sstevx.c index 2d8c50481c..d6ce747d6a 100644 --- a/lapack-netlib/SRC/sstevx.c +++ b/lapack-netlib/SRC/sstevx.c @@ -740,7 +740,7 @@ f"> */ /* > \ingroup realOTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int sstevx_(char *jobz, char *range, integer *n, real *d__, +/* Subroutine */ void sstevx_(char *jobz, char *range, integer *n, real *d__, real *e, real *vl, real *vu, integer *il, integer *iu, real *abstol, integer *m, real *w, real *z__, integer *ldz, real *work, integer * iwork, integer *ifail, integer *info) @@ -757,9 +757,9 @@ f"> */ integer itmp1, i__, j; real sigma; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); char order[1]; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ); 
logical wantz; @@ -773,17 +773,17 @@ f"> */ real bignum; integer indisp, indiwo, indwrk; extern real slanst_(char *, integer *, real *, real *); - extern /* Subroutine */ int sstein_(integer *, real *, real *, integer *, + extern /* Subroutine */ void sstein_(integer *, real *, real *, integer *, real *, integer *, integer *, real *, integer *, real *, integer * , integer *, integer *), ssterf_(integer *, real *, real *, integer *); integer nsplit; - extern /* Subroutine */ int sstebz_(char *, char *, integer *, real *, + extern /* Subroutine */ void sstebz_(char *, char *, integer *, real *, real *, integer *, integer *, real *, real *, real *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); real smlnum; - extern /* Subroutine */ int ssteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssteqr_(char *, integer *, real *, real *, real *, integer *, real *, integer *); real eps, vll, vuu, tmp1; @@ -845,14 +845,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSTEVX", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -868,7 +868,7 @@ f"> */ if (wantz) { z__[z_dim1 + 1] = 1.f; } - return 0; + return; } /* Get machine constants. */ @@ -1014,7 +1014,7 @@ f"> */ } } - return 0; + return; /* End of SSTEVX */ diff --git a/lapack-netlib/SRC/sstevx.f b/lapack-netlib/SRC/sstevx.f index 570864e6e0..6d8c8e5cab 100644 --- a/lapack-netlib/SRC/sstevx.f +++ b/lapack-netlib/SRC/sstevx.f @@ -248,7 +248,7 @@ SUBROUTINE SSTEVX( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, ABSTOL, * .. Local Scalars .. 
LOGICAL ALLEIG, INDEIG, TEST, VALEIG, WANTZ CHARACTER ORDER - INTEGER I, IMAX, INDIBL, INDISP, INDIWO, INDWRK, + INTEGER I, IMAX, INDISP, INDIWO, INDWRK, $ ISCALE, ITMP1, J, JJ, NSPLIT REAL BIGNUM, EPS, RMAX, RMIN, SAFMIN, SIGMA, SMLNUM, $ TMP1, TNRM, VLL, VUU @@ -399,15 +399,14 @@ SUBROUTINE SSTEVX( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, ABSTOL, ORDER = 'E' END IF INDWRK = 1 - INDIBL = 1 - INDISP = INDIBL + N + INDISP = 1 + N INDIWO = INDISP + N CALL SSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTOL, D, E, M, - $ NSPLIT, W, IWORK( INDIBL ), IWORK( INDISP ), + $ NSPLIT, W, IWORK( 1 ), IWORK( INDISP ), $ WORK( INDWRK ), IWORK( INDIWO ), INFO ) * IF( WANTZ ) THEN - CALL SSTEIN( N, D, E, M, W, IWORK( INDIBL ), IWORK( INDISP ), + CALL SSTEIN( N, D, E, M, W, IWORK( 1 ), IWORK( INDISP ), $ Z, LDZ, WORK( INDWRK ), IWORK( INDIWO ), IFAIL, $ INFO ) END IF @@ -439,11 +438,11 @@ SUBROUTINE SSTEVX( JOBZ, RANGE, N, D, E, VL, VU, IL, IU, ABSTOL, 30 CONTINUE * IF( I.NE.0 ) THEN - ITMP1 = IWORK( INDIBL+I-1 ) + ITMP1 = IWORK( 1 + I-1 ) W( I ) = W( J ) - IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 ) + IWORK( 1 + I-1 ) = IWORK( 1 + J-1 ) W( J ) = TMP1 - IWORK( INDIBL+J-1 ) = ITMP1 + IWORK( 1 + J-1 ) = ITMP1 CALL SSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) IF( INFO.NE.0 ) THEN ITMP1 = IFAIL( I ) diff --git a/lapack-netlib/SRC/ssycon.c b/lapack-netlib/SRC/ssycon.c index 6d15c1e5a3..794ba94b3e 100644 --- a/lapack-netlib/SRC/ssycon.c +++ b/lapack-netlib/SRC/ssycon.c @@ -642,7 +642,7 @@ f"> */ /* > \ingroup realSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int ssycon_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void ssycon_(char *uplo, integer *n, real *a, integer *lda, integer *ipiv, real *anorm, real *rcond, real *work, integer *iwork, integer *info) { @@ -654,10 +654,11 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int slacn2_(integer *, real *, 
real *, integer *, - real *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void slacn2_(integer *, real *, real *, integer *, + real *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real ainvnm; - extern /* Subroutine */ int ssytrs_(char *, integer *, integer *, real *, + extern /* Subroutine */ void ssytrs_(char *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); @@ -695,7 +696,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -703,9 +704,9 @@ f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm <= 0.f) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -716,7 +717,7 @@ f"> */ for (i__ = *n; i__ >= 1; --i__) { if (ipiv[i__] > 0 && a[i__ + i__ * a_dim1] == 0.f) { - return 0; + return; } /* L10: */ } @@ -727,7 +728,7 @@ f"> */ i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { if (ipiv[i__] > 0 && a[i__ + i__ * a_dim1] == 0.f) { - return 0; + return; } /* L20: */ } @@ -753,7 +754,7 @@ f"> */ *rcond = 1.f / ainvnm / *anorm; } - return 0; + return; /* End of SSYCON */ diff --git a/lapack-netlib/SRC/ssycon_3.c b/lapack-netlib/SRC/ssycon_3.c index 4d399327d6..0352febd02 100644 --- a/lapack-netlib/SRC/ssycon_3.c +++ b/lapack-netlib/SRC/ssycon_3.c @@ -683,7 +683,7 @@ static integer c__1 = 1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssycon_3_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void ssycon_3_(char *uplo, integer *n, real *a, integer *lda, real *e, integer *ipiv, real *anorm, real *rcond, real *work, integer *iwork, integer *info) { @@ -692,14 +692,15 @@ static integer c__1 = 1; /* Local variables */ integer kase; - extern /* Subroutine */ int ssytrs_3_(char *, integer *, integer *, real + extern /* Subroutine */ 
void ssytrs_3_(char *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, integer *); integer i__; extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int slacn2_(integer *, real *, real *, integer *, - real *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void slacn2_(integer *, real *, real *, integer *, + real *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real ainvnm; @@ -738,7 +739,7 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("SSYCON_3", &i__1, (ftnlen)8); - return 0; + return; } /* Quick return if possible */ @@ -746,9 +747,9 @@ static integer c__1 = 1; *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm <= 0.f) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -759,7 +760,7 @@ static integer c__1 = 1; for (i__ = *n; i__ >= 1; --i__) { if (ipiv[i__] > 0 && a[i__ + i__ * a_dim1] == 0.f) { - return 0; + return; } } } else { @@ -769,7 +770,7 @@ static integer c__1 = 1; i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { if (ipiv[i__] > 0 && a[i__ + i__ * a_dim1] == 0.f) { - return 0; + return; } } } @@ -794,7 +795,7 @@ static integer c__1 = 1; *rcond = 1.f / ainvnm / *anorm; } - return 0; + return; /* End of DSYCON_3 */ diff --git a/lapack-netlib/SRC/ssycon_rook.c b/lapack-netlib/SRC/ssycon_rook.c index 7bdd83ff2a..d6bde6e4f7 100644 --- a/lapack-netlib/SRC/ssycon_rook.c +++ b/lapack-netlib/SRC/ssycon_rook.c @@ -656,7 +656,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssycon_rook_(char *uplo, integer *n, real *a, integer * +/* Subroutine */ void ssycon_rook_(char *uplo, integer *n, real *a, integer * lda, integer *ipiv, real *anorm, real *rcond, real *work, integer * iwork, integer *info) { @@ -665,14 +665,15 @@ rook.f"> */ /* Local variables */ 
integer kase; - extern /* Subroutine */ int ssytrs_rook_(char *, integer *, integer *, + extern /* Subroutine */ void ssytrs_rook_(char *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); integer i__; extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int slacn2_(integer *, real *, real *, integer *, - real *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void slacn2_(integer *, real *, real *, integer *, + real *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real ainvnm; @@ -710,7 +711,7 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYCON_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Quick return if possible */ @@ -718,9 +719,9 @@ rook.f"> */ *rcond = 0.f; if (*n == 0) { *rcond = 1.f; - return 0; + return; } else if (*anorm <= 0.f) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -731,7 +732,7 @@ rook.f"> */ for (i__ = *n; i__ >= 1; --i__) { if (ipiv[i__] > 0 && a[i__ + i__ * a_dim1] == 0.f) { - return 0; + return; } /* L10: */ } @@ -742,7 +743,7 @@ rook.f"> */ i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { if (ipiv[i__] > 0 && a[i__ + i__ * a_dim1] == 0.f) { - return 0; + return; } /* L20: */ } @@ -768,7 +769,7 @@ rook.f"> */ *rcond = 1.f / ainvnm / *anorm; } - return 0; + return; /* End of SSYCON_ROOK */ diff --git a/lapack-netlib/SRC/ssyconv.c b/lapack-netlib/SRC/ssyconv.c index d17701c6cc..a69acae8ea 100644 --- a/lapack-netlib/SRC/ssyconv.c +++ b/lapack-netlib/SRC/ssyconv.c @@ -623,7 +623,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup realSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int ssyconv_(char *uplo, char *way, integer *n, real *a, +/* Subroutine */ void ssyconv_(char *uplo, char *way, integer *n, real *a, integer *lda, integer *ipiv, 
real *e, integer *info) { /* System generated locals */ @@ -672,13 +672,13 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("SSYCONV", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -880,7 +880,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } } - return 0; + return; /* End of SSYCONV */ diff --git a/lapack-netlib/SRC/ssyconvf.c b/lapack-netlib/SRC/ssyconvf.c index 69034dc925..d6a8a52501 100644 --- a/lapack-netlib/SRC/ssyconvf.c +++ b/lapack-netlib/SRC/ssyconvf.c @@ -715,7 +715,7 @@ f.f"> */ /* > */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssyconvf_(char *uplo, char *way, integer *n, real *a, +/* Subroutine */ void ssyconvf_(char *uplo, char *way, integer *n, real *a, integer *lda, real *e, integer *ipiv, integer *info) { /* System generated locals */ @@ -725,7 +725,7 @@ f.f"> */ integer i__; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *); integer ip; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -765,13 +765,13 @@ f.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYCONVF", &i__1, (ftnlen)8); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -1072,7 +1072,7 @@ f.f"> */ /* End A is LOWER */ } - return 0; + return; /* End of SSYCONVF */ diff --git a/lapack-netlib/SRC/ssyconvf_rook.c b/lapack-netlib/SRC/ssyconvf_rook.c index 7aa85940cc..4c2475fc2f 100644 --- a/lapack-netlib/SRC/ssyconvf_rook.c +++ b/lapack-netlib/SRC/ssyconvf_rook.c @@ -706,7 +706,7 @@ f_rook.f"> */ /* > */ /* > \endverbatim */ /* 
===================================================================== */ -/* Subroutine */ int ssyconvf_rook_(char *uplo, char *way, integer *n, real * +/* Subroutine */ void ssyconvf_rook_(char *uplo, char *way, integer *n, real * a, integer *lda, real *e, integer *ipiv, integer *info) { /* System generated locals */ @@ -716,7 +716,7 @@ f_rook.f"> */ integer i__; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *); integer ip; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -757,13 +757,13 @@ f_rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYCONVF_ROOK", &i__1, (ftnlen)13); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -1062,7 +1062,7 @@ f_rook.f"> */ /* End A is LOWER */ } - return 0; + return; /* End of SSYCONVF_ROOK */ diff --git a/lapack-netlib/SRC/ssyequb.c b/lapack-netlib/SRC/ssyequb.c index 31e4564076..5e4a09c739 100644 --- a/lapack-netlib/SRC/ssyequb.c +++ b/lapack-netlib/SRC/ssyequb.c @@ -644,7 +644,7 @@ static integer c__1 = 1; /* > Tech report version: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.3.1679 */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ssyequb_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void ssyequb_(char *uplo, integer *n, real *a, integer *lda, real *s, real *scond, real *amax, real *work, integer *info) { /* System generated locals */ @@ -663,7 +663,7 @@ static integer c__1 = 1; extern real slamch_(char *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; - extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slassq_(integer *, real *, integer *, real *, real *); real smlnum, avg, std, tol; @@ -698,7 +698,7 @@ 
static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("SSYEQUB", &i__1, (ftnlen)7); - return 0; + return; } up = lsame_(uplo, "U"); *amax = 0.f; @@ -707,7 +707,7 @@ static integer c__1 = 1; if (*n == 0) { *scond = 1.f; - return 0; + return; } i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { @@ -824,7 +824,7 @@ static integer c__1 = 1; d__ = c1 * c1 - c0 * 4 * c2; if (d__ <= 0.f) { *info = -1; - return 0; + return; } si = c0 * -2 / (c1 + sqrt(d__)); d__ = si - s[i__]; @@ -881,6 +881,6 @@ static integer c__1 = 1; } *scond = f2cmax(smin,smlnum) / f2cmin(smax,bignum); - return 0; + return; } /* ssyequb_ */ diff --git a/lapack-netlib/SRC/ssyev.c b/lapack-netlib/SRC/ssyev.c index 27fc4c3182..e861c0bc33 100644 --- a/lapack-netlib/SRC/ssyev.c +++ b/lapack-netlib/SRC/ssyev.c @@ -649,7 +649,7 @@ ices */ /* > \ingroup realSYeigen */ /* ===================================================================== */ -/* Subroutine */ int ssyev_(char *jobz, char *uplo, integer *n, real *a, +/* Subroutine */ void ssyev_(char *jobz, char *uplo, integer *n, real *a, integer *lda, real *w, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -663,7 +663,7 @@ ices */ real rmin, rmax, sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); logical lower, wantz; integer nb, iscale; extern real slamch_(char *); @@ -672,16 +672,16 @@ ices */ integer *, integer *, ftnlen, ftnlen); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); integer indtau, indwrk; - extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *); + extern /* Subroutine */ void 
ssterf_(integer *, real *, real *, integer *); extern real slansy_(char *, char *, integer *, real *, integer *, real *); integer llwork; real smlnum; integer lwkopt; logical lquery; - extern /* Subroutine */ int sorgtr_(char *, integer *, real *, integer *, + extern /* Subroutine */ void sorgtr_(char *, integer *, real *, integer *, real *, real *, integer *, integer *), ssteqr_(char *, integer *, real *, real *, real *, integer *, real *, integer *), ssytrd_(char *, integer *, real *, integer *, real *, real *, real *, real *, integer *, integer *); @@ -740,15 +740,15 @@ ices */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYEV ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -757,7 +757,7 @@ ices */ if (wantz) { a[a_dim1 + 1] = 1.f; } - return 0; + return; } /* Get machine constants. */ @@ -822,7 +822,7 @@ ices */ work[1] = (real) lwkopt; - return 0; + return; /* End of SSYEV */ diff --git a/lapack-netlib/SRC/ssyev_2stage.c b/lapack-netlib/SRC/ssyev_2stage.c index a2790dd149..dea80d02e5 100644 --- a/lapack-netlib/SRC/ssyev_2stage.c +++ b/lapack-netlib/SRC/ssyev_2stage.c @@ -702,7 +702,7 @@ SY matrices */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssyev_2stage_(char *jobz, char *uplo, integer *n, real * +/* Subroutine */ void ssyev_2stage_(char *jobz, char *uplo, integer *n, real * a, integer *lda, real *w, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -718,10 +718,10 @@ SY matrices */ real rmin, rmax, sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer lhtrd, lwmin; logical lower; - extern /* Subroutine */ int ssytrd_2stage_(char *, char *, integer *, + extern /* 
Subroutine */ void ssytrd_2stage_(char *, char *, integer *, real *, integer *, real *, real *, real *, real *, integer *, real *, integer *, integer *); integer lwtrd; @@ -731,17 +731,17 @@ SY matrices */ real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); integer indtau, indwrk; - extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *); + extern /* Subroutine */ void ssterf_(integer *, real *, real *, integer *); extern real slansy_(char *, char *, integer *, real *, integer *, real *); integer llwork; real smlnum; - extern /* Subroutine */ int sorgtr_(char *, integer *, real *, integer *, + extern /* Subroutine */ void sorgtr_(char *, integer *, real *, integer *, real *, real *, integer *, integer *); logical lquery; - extern /* Subroutine */ int ssteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssteqr_(char *, integer *, real *, real *, real *, integer *, real *, integer *); real eps; integer indhous; @@ -801,15 +801,15 @@ SY matrices */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYEV_2STAGE ", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -818,7 +818,7 @@ SY matrices */ if (wantz) { a[a_dim1 + 1] = 1.f; } - return 0; + return; } /* Get machine constants. 
*/ @@ -866,7 +866,7 @@ SY matrices */ } else { /* Not available in this release, and argument checking should not */ /* let it getting here */ - return 0; + return; sorgtr_(uplo, n, &a[a_offset], lda, &work[indtau], &work[indwrk], & llwork, &iinfo); ssteqr_(jobz, n, &w[1], &work[inde], &a[a_offset], lda, &work[indtau], @@ -889,7 +889,7 @@ SY matrices */ work[1] = (real) lwmin; - return 0; + return; /* End of SSYEV_2STAGE */ diff --git a/lapack-netlib/SRC/ssyevd.c b/lapack-netlib/SRC/ssyevd.c index 6ec3bf7a40..86f3494dce 100644 --- a/lapack-netlib/SRC/ssyevd.c +++ b/lapack-netlib/SRC/ssyevd.c @@ -699,7 +699,7 @@ f"> */ /* > Modified description of INFO. Sven, 16 Feb 05. \n */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ssyevd_(char *jobz, char *uplo, integer *n, real *a, +/* Subroutine */ void ssyevd_(char *jobz, char *uplo, integer *n, real *a, integer *lda, real *w, real *work, integer *lwork, integer *iwork, integer *liwork, integer *info) { @@ -714,7 +714,7 @@ f"> */ real sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer lwmin, liopt; logical lower, wantz; integer indwk2, llwrk2, iscale; @@ -724,20 +724,20 @@ f"> */ integer *, integer *, ftnlen, ftnlen); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); integer indtau; - extern /* Subroutine */ int sstedc_(char *, integer *, real *, real *, + extern /* Subroutine */ void sstedc_(char *, integer *, real *, real *, real *, integer *, real *, integer *, integer *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *, real 
*, integer *); integer indwrk, liwmin; - extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *); + extern /* Subroutine */ void ssterf_(integer *, real *, real *, integer *); extern real slansy_(char *, char *, integer *, real *, integer *, real *); integer llwork; real smlnum; logical lquery; - extern /* Subroutine */ int sormtr_(char *, char *, char *, integer *, + extern /* Subroutine */ void sormtr_(char *, char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *), ssytrd_(char *, integer *, real *, integer *, real *, real *, real *, real *, @@ -816,15 +816,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYEVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -832,7 +832,7 @@ f"> */ if (wantz) { a[a_dim1 + 1] = 1.f; } - return 0; + return; } /* Get machine constants. */ @@ -897,7 +897,7 @@ f"> */ work[1] = (real) lopt; iwork[1] = liopt; - return 0; + return; /* End of SSYEVD */ diff --git a/lapack-netlib/SRC/ssyevd.f b/lapack-netlib/SRC/ssyevd.f index 8b90d9263c..ac0d0284d3 100644 --- a/lapack-netlib/SRC/ssyevd.f +++ b/lapack-netlib/SRC/ssyevd.f @@ -255,7 +255,7 @@ SUBROUTINE SSYEVD( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, IWORK, LWMIN = 2*N + 1 END IF LOPT = MAX( LWMIN, 2*N + - $ ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) ) + $ N*ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) ) LIOPT = LIWMIN END IF WORK( 1 ) = LOPT diff --git a/lapack-netlib/SRC/ssyevd_2stage.c b/lapack-netlib/SRC/ssyevd_2stage.c index a4146655d1..19d366874b 100644 --- a/lapack-netlib/SRC/ssyevd_2stage.c +++ b/lapack-netlib/SRC/ssyevd_2stage.c @@ -746,7 +746,7 @@ static real c_b27 = 1.f; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssyevd_2stage_(char *jobz, char *uplo, integer *n, real +/* Subroutine */ void 
ssyevd_2stage_(char *jobz, char *uplo, integer *n, real *a, integer *lda, real *w, real *work, integer *lwork, integer *iwork, integer *liwork, integer *info) { @@ -761,11 +761,11 @@ static real c_b27 = 1.f; real anrm, rmin, rmax, sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer lhtrd, lwmin; logical lower; integer lwtrd; - extern /* Subroutine */ int ssytrd_2stage_(char *, char *, integer *, + extern /* Subroutine */ void ssytrd_2stage_(char *, char *, integer *, real *, integer *, real *, real *, real *, real *, integer *, real *, integer *, integer *); logical wantz; @@ -774,20 +774,20 @@ static real c_b27 = 1.f; real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; - extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, real *, integer *, integer *, real *, integer *, integer *); integer indtau; - extern /* Subroutine */ int sstedc_(char *, integer *, real *, real *, + extern /* Subroutine */ void sstedc_(char *, integer *, real *, real *, real *, integer *, real *, integer *, integer *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); integer indwrk, liwmin; - extern /* Subroutine */ int ssterf_(integer *, real *, real *, integer *); + extern /* Subroutine */ void ssterf_(integer *, real *, real *, integer *); extern real slansy_(char *, char *, integer *, real *, integer *, real *); integer llwork; real smlnum; logical lquery; - extern /* Subroutine */ int sormtr_(char *, char *, char *, integer *, + extern /* Subroutine */ void sormtr_(char *, char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); real eps; @@ -867,15 +867,15 @@ static real c_b27 = 
1.f; if (*info != 0) { i__1 = -(*info); xerbla_("SSYEVD_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -883,7 +883,7 @@ static real c_b27 = 1.f; if (wantz) { a[a_dim1 + 1] = 1.f; } - return 0; + return; } /* Get machine constants. */ @@ -935,7 +935,7 @@ static real c_b27 = 1.f; } else { /* Not available in this release, and argument checking should not */ /* let it getting here */ - return 0; + return; sstedc_("I", n, &w[1], &work[inde], &work[indwrk], n, &work[indwk2], & llwrk2, &iwork[1], liwork, info); sormtr_("L", uplo, "N", n, n, &a[a_offset], lda, &work[indtau], &work[ @@ -953,7 +953,7 @@ static real c_b27 = 1.f; work[1] = (real) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of SSYEVD_2STAGE */ diff --git a/lapack-netlib/SRC/ssyevr.c b/lapack-netlib/SRC/ssyevr.c index 680a1a58be..287656d0d9 100644 --- a/lapack-netlib/SRC/ssyevr.c +++ b/lapack-netlib/SRC/ssyevr.c @@ -853,7 +853,7 @@ f"> */ /* > California at Berkeley, USA \n */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ssyevr_(char *jobz, char *range, char *uplo, integer *n, +/* Subroutine */ void ssyevr_(char *jobz, char *range, char *uplo, integer *n, real *a, integer *lda, real *vl, real *vu, integer *il, integer *iu, real *abstol, integer *m, real *w, real *z__, integer *ldz, integer * isuppz, real *work, integer *lwork, integer *iwork, integer *liwork, @@ -873,11 +873,11 @@ f"> */ real sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); char order[1]; integer indwk, lwmin; logical lower; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, 
integer *), sswap_(integer *, real *, integer *, real *, integer * ); logical wantz; @@ -893,14 +893,14 @@ f"> */ real abstll, bignum; integer indtau, indisp, indiwo, indwkn, liwmin; logical tryrac; - extern /* Subroutine */ int sstein_(integer *, real *, real *, integer *, + extern /* Subroutine */ void sstein_(integer *, real *, real *, integer *, real *, integer *, integer *, real *, integer *, real *, integer * , integer *, integer *), ssterf_(integer *, real *, real *, integer *); integer llwrkn, llwork, nsplit; real smlnum; extern real slansy_(char *, char *, integer *, real *, integer *, real *); - extern /* Subroutine */ int sstebz_(char *, char *, integer *, real *, + extern /* Subroutine */ void sstebz_(char *, char *, integer *, real *, real *, integer *, integer *, real *, real *, real *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *), sstemr_(char *, char *, integer *, @@ -909,7 +909,7 @@ f"> */ , integer *, integer *, integer *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int sormtr_(char *, char *, char *, integer *, + extern /* Subroutine */ void sormtr_(char *, char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *), ssytrd_(char *, integer *, real *, integer *, real *, real *, real *, real *, @@ -1012,9 +1012,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYEVR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -1022,7 +1022,7 @@ f"> */ *m = 0; if (*n == 0) { work[1] = 1.f; - return 0; + return; } if (*n == 1) { @@ -1041,7 +1041,7 @@ f"> */ isuppz[1] = 1; isuppz[2] = 1; } - return 0; + return; } /* Get machine constants. 
*/ @@ -1259,7 +1259,7 @@ f"> */ work[1] = (real) lwkopt; iwork[1] = liwmin; - return 0; + return; /* End of SSYEVR */ diff --git a/lapack-netlib/SRC/ssyevr_2stage.c b/lapack-netlib/SRC/ssyevr_2stage.c index 0e21f214cb..e61324786e 100644 --- a/lapack-netlib/SRC/ssyevr_2stage.c +++ b/lapack-netlib/SRC/ssyevr_2stage.c @@ -898,7 +898,7 @@ static integer c_n1 = -1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssyevr_2stage_(char *jobz, char *range, char *uplo, +/* Subroutine */ void ssyevr_2stage_(char *jobz, char *range, char *uplo, integer *n, real *a, integer *lda, real *vl, real *vu, integer *il, integer *iu, real *abstol, integer *m, real *w, real *z__, integer * ldz, integer *isuppz, real *work, integer *lwork, integer *iwork, @@ -920,12 +920,12 @@ static integer c_n1 = -1; real sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); char order[1]; integer indwk, lhtrd, lwmin; logical lower; integer lwtrd; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ), ssytrd_2stage_(char *, char *, integer *, real *, integer *, real *, real *, real *, real *, integer *, real *, integer *, @@ -943,14 +943,14 @@ static integer c_n1 = -1; real abstll, bignum; integer indtau, indisp, indiwo, indwkn, liwmin; logical tryrac; - extern /* Subroutine */ int sstein_(integer *, real *, real *, integer *, + extern /* Subroutine */ void sstein_(integer *, real *, real *, integer *, real *, integer *, integer *, real *, integer *, real *, integer * , integer *, integer *), ssterf_(integer *, real *, real *, integer *); integer llwrkn, llwork, nsplit; real smlnum; extern real slansy_(char *, char 
*, integer *, real *, integer *, real *); - extern /* Subroutine */ int sstebz_(char *, char *, integer *, real *, + extern /* Subroutine */ void sstebz_(char *, char *, integer *, real *, real *, integer *, integer *, real *, real *, real *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *), sstemr_(char *, char *, integer *, @@ -958,7 +958,7 @@ static integer c_n1 = -1; real *, real *, integer *, integer *, integer *, logical *, real * , integer *, integer *, integer *, integer *); logical lquery; - extern /* Subroutine */ int sormtr_(char *, char *, char *, integer *, + extern /* Subroutine */ void sormtr_(char *, char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); real eps, vll, vuu; @@ -1058,9 +1058,9 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("SSYEVR_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -1068,7 +1068,7 @@ static integer c_n1 = -1; *m = 0; if (*n == 0) { work[1] = 1.f; - return 0; + return; } if (*n == 1) { @@ -1087,7 +1087,7 @@ static integer c_n1 = -1; isuppz[1] = 1; isuppz[2] = 1; } - return 0; + return; } /* Get machine constants. 
*/ @@ -1309,7 +1309,7 @@ static integer c_n1 = -1; work[1] = (real) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of SSYEVR_2STAGE */ diff --git a/lapack-netlib/SRC/ssyevx.c b/lapack-netlib/SRC/ssyevx.c index d463ad5885..50fc2cac6d 100644 --- a/lapack-netlib/SRC/ssyevx.c +++ b/lapack-netlib/SRC/ssyevx.c @@ -766,7 +766,7 @@ f"> */ /* > \ingroup realSYeigen */ /* ===================================================================== */ -/* Subroutine */ int ssyevx_(char *jobz, char *range, char *uplo, integer *n, +/* Subroutine */ void ssyevx_(char *jobz, char *range, char *uplo, integer *n, real *a, integer *lda, real *vl, real *vu, integer *il, integer *iu, real *abstol, integer *m, real *w, real *z__, integer *ldz, real * work, integer *lwork, integer *iwork, integer *ifail, integer *info) @@ -785,10 +785,10 @@ f"> */ real sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); char order[1]; logical lower; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ); logical wantz; @@ -803,23 +803,23 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real abstll, bignum; integer indtau, indisp, indiwo, indwkn; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); integer indwrk, lwkmin; - extern /* Subroutine */ int sstein_(integer *, real *, real *, integer *, + extern /* Subroutine */ void sstein_(integer *, real *, real *, integer *, real *, integer *, integer *, real *, integer *, real *, integer * , integer *, integer *), ssterf_(integer *, real *, real *, integer *); integer llwrkn, 
llwork, nsplit; real smlnum; extern real slansy_(char *, char *, integer *, real *, integer *, real *); - extern /* Subroutine */ int sstebz_(char *, char *, integer *, real *, + extern /* Subroutine */ void sstebz_(char *, char *, integer *, real *, real *, integer *, integer *, real *, real *, real *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int sorgtr_(char *, integer *, real *, integer *, + extern /* Subroutine */ void sorgtr_(char *, integer *, real *, integer *, real *, real *, integer *, integer *), ssteqr_(char *, integer *, real *, real *, real *, integer *, real *, integer *), sormtr_(char *, char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, @@ -916,16 +916,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYEVX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -941,7 +941,7 @@ f"> */ if (wantz) { z__[z_dim1 + 1] = 1.f; } - return 0; + return; } /* Get machine constants. 
*/ @@ -1125,7 +1125,7 @@ f"> */ work[1] = (real) lwkopt; - return 0; + return; /* End of SSYEVX */ diff --git a/lapack-netlib/SRC/ssyevx_2stage.c b/lapack-netlib/SRC/ssyevx_2stage.c index 6e461b4183..81c4ab8dd1 100644 --- a/lapack-netlib/SRC/ssyevx_2stage.c +++ b/lapack-netlib/SRC/ssyevx_2stage.c @@ -816,7 +816,7 @@ static integer c__4 = 4; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssyevx_2stage_(char *jobz, char *range, char *uplo, +/* Subroutine */ void ssyevx_2stage_(char *jobz, char *range, char *uplo, integer *n, real *a, integer *lda, real *vl, real *vu, integer *il, integer *iu, real *abstol, integer *m, real *w, real *z__, integer * ldz, real *work, integer *lwork, integer *iwork, integer *ifail, @@ -838,12 +838,12 @@ static integer c__4 = 4; real sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); char order[1]; integer lhtrd, lwmin; logical lower; integer lwtrd; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ), ssytrd_2stage_(char *, char *, integer *, real *, integer *, real *, real *, real *, real *, integer *, real *, integer *, @@ -858,23 +858,23 @@ static integer c__4 = 4; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real abstll, bignum; integer indtau, indisp, indiwo, indwkn; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); integer indwrk; - extern /* Subroutine */ int sstein_(integer *, real *, real *, integer *, + extern /* Subroutine */ void sstein_(integer *, real *, real *, integer *, real 
*, integer *, integer *, real *, integer *, real *, integer * , integer *, integer *), ssterf_(integer *, real *, real *, integer *); integer llwrkn, llwork, nsplit; real smlnum; extern real slansy_(char *, char *, integer *, real *, integer *, real *); - extern /* Subroutine */ int sstebz_(char *, char *, integer *, real *, + extern /* Subroutine */ void sstebz_(char *, char *, integer *, real *, real *, integer *, integer *, real *, real *, real *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *), sorgtr_(char *, integer *, real *, integer *, real *, real *, integer *, integer *); logical lquery; - extern /* Subroutine */ int ssteqr_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssteqr_(char *, integer *, real *, real *, real *, integer *, real *, integer *), sormtr_(char *, char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); @@ -972,16 +972,16 @@ static integer c__4 = 4; if (*info != 0) { i__1 = -(*info); xerbla_("SSYEVX_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -997,7 +997,7 @@ static integer c__4 = 4; if (wantz) { z__[z_dim1 + 1] = 1.f; } - return 0; + return; } /* Get machine constants. 
*/ @@ -1184,7 +1184,7 @@ static integer c__4 = 4; work[1] = (real) lwmin; - return 0; + return; /* End of SSYEVX_2STAGE */ diff --git a/lapack-netlib/SRC/ssygs2.c b/lapack-netlib/SRC/ssygs2.c index fe4a01d8b8..dca132458b 100644 --- a/lapack-netlib/SRC/ssygs2.c +++ b/lapack-netlib/SRC/ssygs2.c @@ -643,7 +643,7 @@ f"> */ /* > \ingroup realSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int ssygs2_(integer *itype, char *uplo, integer *n, real *a, +/* Subroutine */ void ssygs2_(integer *itype, char *uplo, integer *n, real *a, integer *lda, real *b, integer *ldb, integer *info) { /* System generated locals */ @@ -652,12 +652,12 @@ f"> */ /* Local variables */ integer k; - extern /* Subroutine */ int ssyr2_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssyr2_(char *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); logical upper; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + extern /* Subroutine */ void saxpy_(integer *, real *, real *, integer *, real *, integer *), strmv_(char *, char *, char *, integer *, real *, integer *, real *, integer *), strsv_(char *, char *, char *, integer *, real *, integer *, real @@ -703,7 +703,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYGS2", &i__1, (ftnlen)6); - return 0; + return; } if (*itype == 1) { @@ -845,7 +845,7 @@ f"> */ } } } - return 0; + return; /* End of SSYGS2 */ diff --git a/lapack-netlib/SRC/ssygst.c b/lapack-netlib/SRC/ssygst.c index 7458a322f2..87d340a890 100644 --- a/lapack-netlib/SRC/ssygst.c +++ b/lapack-netlib/SRC/ssygst.c @@ -645,7 +645,7 @@ f"> */ /* > \ingroup realSYcomputational */ /* ===================================================================== */ -/* 
Subroutine */ int ssygst_(integer *itype, char *uplo, integer *n, real *a, +/* Subroutine */ void ssygst_(integer *itype, char *uplo, integer *n, real *a, integer *lda, real *b, integer *ldb, integer *info) { /* System generated locals */ @@ -655,7 +655,7 @@ f"> */ integer k; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int strmm_(char *, char *, char *, char *, + extern /* Subroutine */ void strmm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ), ssymm_(char *, char *, integer *, integer *, real *, real *, integer *, real *, integer *, real * @@ -663,10 +663,11 @@ f"> */ *, char *, integer *, integer *, real *, real *, integer *, real * , integer *); integer kb, nb; - extern /* Subroutine */ int ssygs2_(integer *, char *, integer *, real *, + extern /* Subroutine */ void ssygs2_(integer *, char *, integer *, real *, integer *, real *, integer *, integer *), ssyr2k_(char *, char *, integer *, integer *, real *, real *, integer *, real *, - integer *, real *, real *, integer *), xerbla_( char *, integer *, ftnlen); + integer *, real *, real *, integer *); + extern int xerbla_( char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); @@ -707,13 +708,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYGST", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine the block size for this environment. 
*/ @@ -891,7 +892,7 @@ f"> */ } } } - return 0; + return; /* End of SSYGST */ diff --git a/lapack-netlib/SRC/ssygv.c b/lapack-netlib/SRC/ssygv.c index 5fd2085d89..692de8d9cf 100644 --- a/lapack-netlib/SRC/ssygv.c +++ b/lapack-netlib/SRC/ssygv.c @@ -689,7 +689,7 @@ static real c_b16 = 1.f; /* > \ingroup realSYeigen */ /* ===================================================================== */ -/* Subroutine */ int ssygv_(integer *itype, char *jobz, char *uplo, integer * +/* Subroutine */ void ssygv_(integer *itype, char *jobz, char *uplo, integer * n, real *a, integer *lda, real *b, integer *ldb, real *w, real *work, integer *lwork, integer *info) { @@ -701,11 +701,11 @@ static real c_b16 = 1.f; extern logical lsame_(char *, char *); char trans[1]; logical upper; - extern /* Subroutine */ int strmm_(char *, char *, char *, char *, + extern /* Subroutine */ void strmm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ); logical wantz; - extern /* Subroutine */ int strsm_(char *, char *, char *, char *, + extern /* Subroutine */ void strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ), ssyev_(char *, char *, integer *, real *, integer *, real *, real *, integer *, integer *); @@ -718,7 +718,7 @@ static real c_b16 = 1.f; integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int ssygst_(integer *, char *, integer *, real *, + extern /* Subroutine */ void ssygst_(integer *, char *, integer *, real *, integer *, real *, integer *, integer *); @@ -782,15 +782,15 @@ static real c_b16 = 1.f; if (*info != 0) { i__1 = -(*info); xerbla_("SSYGV ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. 
*/ @@ -798,7 +798,7 @@ static real c_b16 = 1.f; spotrf_(uplo, n, &b[b_offset], ldb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. */ @@ -845,7 +845,7 @@ static real c_b16 = 1.f; } work[1] = (real) lwkopt; - return 0; + return; /* End of SSYGV */ diff --git a/lapack-netlib/SRC/ssygv_2stage.c b/lapack-netlib/SRC/ssygv_2stage.c index 251b60be9c..4ddd712006 100644 --- a/lapack-netlib/SRC/ssygv_2stage.c +++ b/lapack-netlib/SRC/ssygv_2stage.c @@ -743,7 +743,7 @@ stage.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssygv_2stage_(integer *itype, char *jobz, char *uplo, +/* Subroutine */ void ssygv_2stage_(integer *itype, char *jobz, char *uplo, integer *n, real *a, integer *lda, real *b, integer *ldb, real *w, real *work, integer *lwork, integer *info) { @@ -754,25 +754,26 @@ stage.f"> */ integer neig; extern integer ilaenv2stage_(integer *, char *, char *, integer *, integer *, integer *, integer *); - extern /* Subroutine */ int ssyev_2stage_(char *, char *, integer *, + extern /* Subroutine */ void ssyev_2stage_(char *, char *, integer *, real *, integer *, real *, real *, integer *, integer *); extern logical lsame_(char *, char *); integer lhtrd, lwmin; char trans[1]; logical upper; integer lwtrd; - extern /* Subroutine */ int strmm_(char *, char *, char *, char *, + extern /* Subroutine */ void strmm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ); logical wantz; - extern /* Subroutine */ int strsm_(char *, char *, char *, char *, + extern /* Subroutine */ void strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ); integer ib, kd; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), spotrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int spotrf_( char *, 
integer *, real *, integer *, integer *); logical lquery; - extern /* Subroutine */ int ssygst_(integer *, char *, integer *, real *, + extern /* Subroutine */ void ssygst_(integer *, char *, integer *, real *, integer *, real *, integer *, integer *); @@ -837,15 +838,15 @@ stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYGV_2STAGE ", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. */ @@ -853,7 +854,7 @@ stage.f"> */ spotrf_(uplo, n, &b[b_offset], ldb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. */ @@ -901,7 +902,7 @@ stage.f"> */ } work[1] = (real) lwmin; - return 0; + return; /* End of SSYGV_2STAGE */ diff --git a/lapack-netlib/SRC/ssygvd.c b/lapack-netlib/SRC/ssygvd.c index b5ec9ac84a..d8bbd48e76 100644 --- a/lapack-netlib/SRC/ssygvd.c +++ b/lapack-netlib/SRC/ssygvd.c @@ -739,7 +739,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. 
of Kentucky, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ssygvd_(integer *itype, char *jobz, char *uplo, integer * +/* Subroutine */ void ssygvd_(integer *itype, char *jobz, char *uplo, integer * n, real *a, integer *lda, real *b, integer *ldb, real *w, real *work, integer *lwork, integer *iwork, integer *liwork, integer *info) { @@ -754,20 +754,22 @@ f"> */ char trans[1]; integer liopt; logical upper; - extern /* Subroutine */ int strmm_(char *, char *, char *, char *, + extern /* Subroutine */ void strmm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ); logical wantz; - extern /* Subroutine */ int strsm_(char *, char *, char *, char *, + extern /* Subroutine */ void strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * - ), xerbla_(char *, integer *, ftnlen); + ); + extern int xerbla_(char *, integer *, ftnlen); integer liwmin; extern /* Subroutine */ int spotrf_(char *, integer *, real *, integer *, - integer *), ssyevd_(char *, char *, integer *, real *, + integer *); + extern void ssyevd_(char *, char *, integer *, real *, integer *, real *, real *, integer *, integer *, integer *, integer *); logical lquery; - extern /* Subroutine */ int ssygst_(integer *, char *, integer *, real *, + extern /* Subroutine */ void ssygst_(integer *, char *, integer *, real *, integer *, real *, integer *, integer *); @@ -841,15 +843,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYGVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. */ @@ -857,7 +859,7 @@ f"> */ spotrf_(uplo, n, &b[b_offset], ldb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. 
*/ @@ -909,7 +911,7 @@ f"> */ work[1] = (real) lopt; iwork[1] = liopt; - return 0; + return; /* End of SSYGVD */ diff --git a/lapack-netlib/SRC/ssygvd.f b/lapack-netlib/SRC/ssygvd.f index 9002df2374..7c7e0de016 100644 --- a/lapack-netlib/SRC/ssygvd.f +++ b/lapack-netlib/SRC/ssygvd.f @@ -330,8 +330,8 @@ SUBROUTINE SSYGVD( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W, WORK, CALL SSYGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) CALL SSYEVD( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, IWORK, LIWORK, $ INFO ) - LOPT = MAX( REAL( LOPT ), REAL( WORK( 1 ) ) ) - LIOPT = MAX( REAL( LIOPT ), REAL( IWORK( 1 ) ) ) + LOPT = INT( MAX( REAL( LOPT ), REAL( WORK( 1 ) ) ) ) + LIOPT = INT( MAX( REAL( LIOPT ), REAL( IWORK( 1 ) ) ) ) * IF( WANTZ .AND. INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/ssygvx.c b/lapack-netlib/SRC/ssygvx.c index 355846f31a..f6b45d4bdd 100644 --- a/lapack-netlib/SRC/ssygvx.c +++ b/lapack-netlib/SRC/ssygvx.c @@ -810,7 +810,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int ssygvx_(integer *itype, char *jobz, char *range, char * +/* Subroutine */ void ssygvx_(integer *itype, char *jobz, char *range, char * uplo, integer *n, real *a, integer *lda, real *b, integer *ldb, real * vl, real *vu, integer *il, integer *iu, real *abstol, integer *m, real *w, real *z__, integer *ldz, real *work, integer *lwork, integer @@ -823,11 +823,11 @@ f"> */ extern logical lsame_(char *, char *); char trans[1]; logical upper; - extern /* Subroutine */ int strmm_(char *, char *, char *, char *, + extern /* Subroutine */ void strmm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ); logical wantz; - extern /* Subroutine */ int strsm_(char *, char *, char *, char *, + extern /* Subroutine */ void strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ); integer nb; 
@@ -840,7 +840,7 @@ f"> */ integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int ssygst_(integer *, char *, integer *, real *, + extern /* Subroutine */ void ssygst_(integer *, char *, integer *, real *, integer *, real *, integer *, integer *), ssyevx_(char *, char *, char *, integer *, real *, integer *, real *, real *, integer *, integer *, real *, integer *, real *, real *, integer * @@ -934,16 +934,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYGVX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. */ @@ -951,7 +951,7 @@ f"> */ spotrf_(uplo, n, &b[b_offset], ldb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. */ @@ -1002,7 +1002,7 @@ f"> */ work[1] = (real) lwkopt; - return 0; + return; /* End of SSYGVX */ diff --git a/lapack-netlib/SRC/ssyrfs.c b/lapack-netlib/SRC/ssyrfs.c index 1fe40a1b30..cf48a77bee 100644 --- a/lapack-netlib/SRC/ssyrfs.c +++ b/lapack-netlib/SRC/ssyrfs.c @@ -705,7 +705,7 @@ f"> */ /* > \ingroup realSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int ssyrfs_(char *uplo, integer *n, integer *nrhs, real *a, +/* Subroutine */ void ssyrfs_(char *uplo, integer *n, integer *nrhs, real *a, integer *lda, real *af, integer *ldaf, integer *ipiv, real *b, integer *ldb, real *x, integer *ldx, real *ferr, real *berr, real * work, integer *iwork, integer *info) @@ -723,7 +723,7 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3], count; logical upper; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *), ssymv_(char *, integer *, real *, 
real *, integer *, real *, integer *, real *, real *, integer *), slacn2_( @@ -735,7 +735,7 @@ f"> */ real safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real lstres; - extern /* Subroutine */ int ssytrs_(char *, integer *, integer *, real *, + extern /* Subroutine */ void ssytrs_(char *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); real eps; @@ -791,7 +791,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -803,7 +803,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -1000,7 +1000,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of SSYRFS */ diff --git a/lapack-netlib/SRC/ssyrfsx.c b/lapack-netlib/SRC/ssyrfsx.c index e410b1b661..e9caf6b9ef 100644 --- a/lapack-netlib/SRC/ssyrfsx.c +++ b/lapack-netlib/SRC/ssyrfsx.c @@ -811,7 +811,7 @@ static integer c__1 = 1; /* > \ingroup realSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int ssyrfsx_(char *uplo, char *equed, integer *n, integer * +/* Subroutine */ void ssyrfsx_(char *uplo, char *equed, integer *n, integer * nrhs, real *a, integer *lda, real *af, integer *ldaf, integer *ipiv, real *s, real *b, integer *ldb, real *x, integer *ldx, real *rcond, real *berr, integer *n_err_bnds__, real *err_bnds_norm__, real * @@ -826,7 +826,7 @@ static integer c__1 = 1; /* Local variables */ real illrcond_thresh__, unstable_thresh__, err_lbnd__; - extern /* Subroutine */ int sla_syrfsx_extended_(integer *, char *, + extern /* Subroutine */ void sla_syrfsx_extended_(integer *, char *, integer *, integer *, real *, integer *, real *, integer *, integer *, logical *, real *, real *, integer *, real *, integer * , real *, integer *, real *, real *, real *, real *, real *, real @@ -843,7 +843,7 @@ static integer c__1 = 1; extern 
real slamch_(char *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern real slansy_(char *, char *, integer *, real *, integer *, real *); - extern /* Subroutine */ int ssycon_(char *, integer *, real *, integer *, + extern /* Subroutine */ void ssycon_(char *, integer *, real *, integer *, integer *, real *, real *, real *, integer *, integer *); extern integer ilaprec_(char *); integer ithresh, n_norms__; @@ -960,7 +960,7 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("SSYRFSX", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible. */ @@ -983,7 +983,7 @@ static integer c__1 = 1; err_bnds_comp__[j + err_bnds_comp_dim1 * 3] = 1.f; } } - return 0; + return; } /* Default to failure. */ @@ -1124,7 +1124,7 @@ static integer c__1 = 1; } } - return 0; + return; /* End of SSYRFSX */ diff --git a/lapack-netlib/SRC/ssysv.c b/lapack-netlib/SRC/ssysv.c index 3e70534315..92d234f51c 100644 --- a/lapack-netlib/SRC/ssysv.c +++ b/lapack-netlib/SRC/ssysv.c @@ -683,7 +683,7 @@ static integer c_n1 = -1; /* > \ingroup realSYsolve */ /* ===================================================================== */ -/* Subroutine */ int ssysv_(char *uplo, integer *n, integer *nrhs, real *a, +/* Subroutine */ void ssysv_(char *uplo, integer *n, integer *nrhs, real *a, integer *lda, integer *ipiv, real *b, integer *ldb, real *work, integer *lwork, integer *info) { @@ -695,7 +695,7 @@ static integer c_n1 = -1; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer lwkopt; logical lquery; - extern /* Subroutine */ int ssytrf_(char *, integer *, real *, integer *, + extern /* Subroutine */ void ssytrf_(char *, integer *, real *, integer *, integer *, real *, integer *, integer *), ssytrs_(char *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *), ssytrs2_(char *, integer *, @@ -755,9 +755,9 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("SSYSV ", &i__1, 
(ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = U*D*U**T or A = L*D*L**T. */ @@ -787,7 +787,7 @@ static integer c_n1 = -1; work[1] = (real) lwkopt; - return 0; + return; /* End of SSYSV */ diff --git a/lapack-netlib/SRC/ssysv.f b/lapack-netlib/SRC/ssysv.f index 5f4062e9ae..06a42dfb75 100644 --- a/lapack-netlib/SRC/ssysv.f +++ b/lapack-netlib/SRC/ssysv.f @@ -223,7 +223,7 @@ SUBROUTINE SSYSV( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, LWKOPT = 1 ELSE CALL SSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/ssysv_aa.c b/lapack-netlib/SRC/ssysv_aa.c index 4e30204d81..c0303e3f4b 100644 --- a/lapack-netlib/SRC/ssysv_aa.c +++ b/lapack-netlib/SRC/ssysv_aa.c @@ -674,7 +674,7 @@ a.f"> */ /* > \ingroup realSYsolve */ /* ===================================================================== */ -/* Subroutine */ int ssysv_aa_(char *uplo, integer *n, integer *nrhs, real * +/* Subroutine */ void ssysv_aa_(char *uplo, integer *n, integer *nrhs, real * a, integer *lda, integer *ipiv, real *b, integer *ldb, real *work, integer *lwork, integer *info) { @@ -684,10 +684,11 @@ a.f"> */ /* Local variables */ extern logical lsame_(char *, char *); integer lwkopt_sytrf__, lwkopt_sytrs__; - extern /* Subroutine */ int ssytrf_aa_(char *, integer *, real *, + extern /* Subroutine */ void ssytrf_aa_(char *, integer *, real *, integer *, integer *, real *, integer *, integer *), ssytrs_aa_(char *, integer *, integer *, real *, integer *, - integer *, real *, integer *, real *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, real *, integer *, real *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer lwkopt; logical lquery; @@ -748,9 +749,9 @@ a.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYSV_AA", &i__1, (ftnlen)8); - return 0; + return; } else if (lquery) { - return 
0; + return; } /* Compute the factorization A = U**T*T*U or A = L*T*L**T. */ @@ -767,7 +768,7 @@ a.f"> */ work[1] = (real) lwkopt; - return 0; + return; /* End of SSYSV_AA */ diff --git a/lapack-netlib/SRC/ssysv_aa_2stage.c b/lapack-netlib/SRC/ssysv_aa_2stage.c index 5113d02113..dbd8402c63 100644 --- a/lapack-netlib/SRC/ssysv_aa_2stage.c +++ b/lapack-netlib/SRC/ssysv_aa_2stage.c @@ -699,7 +699,7 @@ a_2stage.f"> */ /* > \ingroup realSYsolve */ /* ===================================================================== */ -/* Subroutine */ int ssysv_aa_2stage_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void ssysv_aa_2stage_(char *uplo, integer *n, integer *nrhs, real *a, integer *lda, real *tb, integer *ltb, integer *ipiv, integer *ipiv2, real *b, integer *ldb, real *work, integer *lwork, integer * info) @@ -708,7 +708,7 @@ a_2stage.f"> */ integer a_dim1, a_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int ssytrf_aa_2stage_(char *, integer *, real *, + extern /* Subroutine */ void ssytrf_aa_2stage_(char *, integer *, real *, integer *, real *, integer *, integer *, integer *, real *, integer *, integer *), ssytrs_aa_2stage_(char *, integer *, integer *, real *, integer *, real *, integer *, integer *, @@ -773,9 +773,9 @@ a_2stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYSV_AA_2STAGE", &i__1, (ftnlen)15); - return 0; + return; } else if (wquery || tquery) { - return 0; + return; } @@ -794,7 +794,7 @@ a_2stage.f"> */ work[1] = (real) lwkopt; - return 0; + return; /* End of SSYSV_AA_2STAGE */ diff --git a/lapack-netlib/SRC/ssysv_rk.c b/lapack-netlib/SRC/ssysv_rk.c index 25a2871836..ca99229011 100644 --- a/lapack-netlib/SRC/ssysv_rk.c +++ b/lapack-netlib/SRC/ssysv_rk.c @@ -740,7 +740,7 @@ k.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssysv_rk_(char *uplo, integer *n, integer *nrhs, real * +/* Subroutine */ void ssysv_rk_(char *uplo, 
integer *n, integer *nrhs, real * a, integer *lda, real *e, integer *ipiv, real *b, integer *ldb, real * work, integer *lwork, integer *info) { @@ -748,11 +748,12 @@ k.f"> */ integer a_dim1, a_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int ssytrs_3_(char *, integer *, integer *, real + extern /* Subroutine */ void ssytrs_3_(char *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int ssytrf_rk_(char *, integer *, real *, - integer *, real *, integer *, real *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void ssytrf_rk_(char *, integer *, real *, + integer *, real *, integer *, real *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer lwkopt; logical lquery; @@ -810,9 +811,9 @@ k.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYSV_RK ", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = P*U*D*(U**T)*(P**T) or */ @@ -832,7 +833,7 @@ k.f"> */ work[1] = (real) lwkopt; - return 0; + return; /* End of SSYSV_RK */ diff --git a/lapack-netlib/SRC/ssysv_rk.f b/lapack-netlib/SRC/ssysv_rk.f index 9e0487623c..9a7dfa4bb7 100644 --- a/lapack-netlib/SRC/ssysv_rk.f +++ b/lapack-netlib/SRC/ssysv_rk.f @@ -280,7 +280,7 @@ SUBROUTINE SSYSV_RK( UPLO, N, NRHS, A, LDA, E, IPIV, B, LDB, LWKOPT = 1 ELSE CALL SSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/ssysv_rook.c b/lapack-netlib/SRC/ssysv_rook.c index 08132197b7..a76639605b 100644 --- a/lapack-netlib/SRC/ssysv_rook.c +++ b/lapack-netlib/SRC/ssysv_rook.c @@ -717,7 +717,7 @@ ook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssysv_rook_(char *uplo, integer *n, 
integer *nrhs, real +/* Subroutine */ void ssysv_rook_(char *uplo, integer *n, integer *nrhs, real *a, integer *lda, integer *ipiv, real *b, integer *ldb, real *work, integer *lwork, integer *info) { @@ -725,7 +725,7 @@ ook.f"> */ integer a_dim1, a_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int ssytrf_rook_(char *, integer *, real *, + extern /* Subroutine */ void ssytrf_rook_(char *, integer *, real *, integer *, integer *, real *, integer *, integer *), ssytrs_rook_(char *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); @@ -787,9 +787,9 @@ ook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYSV_ROOK ", &i__1, (ftnlen)11); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = U*D*U**T or A = L*D*L**T. */ @@ -808,7 +808,7 @@ ook.f"> */ work[1] = (real) lwkopt; - return 0; + return; /* End of SSYSV_ROOK */ diff --git a/lapack-netlib/SRC/ssysv_rook.f b/lapack-netlib/SRC/ssysv_rook.f index b4da1101c8..fb7ba8c53f 100644 --- a/lapack-netlib/SRC/ssysv_rook.f +++ b/lapack-netlib/SRC/ssysv_rook.f @@ -256,7 +256,7 @@ SUBROUTINE SSYSV_ROOK( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, LWKOPT = 1 ELSE CALL SSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/ssysvx.c b/lapack-netlib/SRC/ssysvx.c index 3bbbfcb83a..a6b2d7a1cc 100644 --- a/lapack-netlib/SRC/ssysvx.c +++ b/lapack-netlib/SRC/ssysvx.c @@ -796,7 +796,7 @@ f"> */ /* > \ingroup realSYsolve */ /* ===================================================================== */ -/* Subroutine */ int ssysvx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void ssysvx_(char *fact, char *uplo, integer *n, integer * nrhs, real *a, integer *lda, real *af, integer *ldaf, integer *ipiv, real *b, integer *ldb, real *x, integer *ldx, real *rcond, real *ferr, real *berr, real *work, integer 
*lwork, integer *iwork, integer * @@ -815,14 +815,14 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); extern real slansy_(char *, char *, integer *, real *, integer *, real *); - extern /* Subroutine */ int ssycon_(char *, integer *, real *, integer *, + extern /* Subroutine */ void ssycon_(char *, integer *, real *, integer *, integer *, real *, real *, real *, integer *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int ssyrfs_(char *, integer *, integer *, real *, + extern /* Subroutine */ void ssyrfs_(char *, integer *, integer *, real *, integer *, real *, integer *, integer *, real *, integer *, real * , integer *, real *, real *, real *, integer *, integer *) , ssytrf_(char *, integer *, real *, integer *, integer *, real *, @@ -907,9 +907,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYSVX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } if (nofact) { @@ -924,7 +924,7 @@ f"> */ if (*info > 0) { *rcond = 0.f; - return 0; + return; } } @@ -958,7 +958,7 @@ f"> */ work[1] = (real) lwkopt; - return 0; + return; /* End of SSYSVX */ diff --git a/lapack-netlib/SRC/ssysvxx.c b/lapack-netlib/SRC/ssysvxx.c index 1a95aafd4d..46cfecb643 100644 --- a/lapack-netlib/SRC/ssysvxx.c +++ b/lapack-netlib/SRC/ssysvxx.c @@ -1014,7 +1014,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup realSYsolve */ /* ===================================================================== */ -/* Subroutine */ int ssysvxx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void ssysvxx_(char *fact, char *uplo, integer *n, integer * nrhs, real *a, 
integer *lda, real *af, integer *ldaf, integer *ipiv, char *equed, real *s, real *b, integer *ldb, real *x, integer *ldx, real *rcond, real *rpvgrw, real *berr, integer *n_err_bnds__, real * @@ -1028,7 +1028,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ real r__1, r__2; /* Local variables */ - extern /* Subroutine */ int ssyrfsx_(char *, char *, integer *, integer *, + extern /* Subroutine */ void ssyrfsx_(char *, char *, integer *, integer *, real *, integer *, real *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *, real *, real *, integer *, real *, real *, integer *, integer *); @@ -1044,10 +1044,10 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; integer infequ; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); real smlnum; - extern /* Subroutine */ int slaqsy_(char *, integer *, real *, integer *, + extern /* Subroutine */ void slaqsy_(char *, integer *, real *, integer *, real *, real *, real *, char *), ssytrf_(char *, integer *, real *, integer *, integer *, real *, integer *, integer *), slascl2_(integer *, integer *, real *, real *, @@ -1163,7 +1163,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("SSYSVXX", &i__1, (ftnlen)7); - return 0; + return; } if (equil) { @@ -1208,7 +1208,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rpvgrw = sla_syrpvgrw_(uplo, n, info, &a[a_offset], lda, & af[af_offset], ldaf, &ipiv[1], &work[1]); } - return 0; + return; } } @@ -1240,7 +1240,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ slascl2_(n, nrhs, &s[1], &x[x_offset], ldx); } - return 0; 
+ return; /* End of SSYSVXX */ diff --git a/lapack-netlib/SRC/ssyswapr.c b/lapack-netlib/SRC/ssyswapr.c index d613916a4c..532d2b4b53 100644 --- a/lapack-netlib/SRC/ssyswapr.c +++ b/lapack-netlib/SRC/ssyswapr.c @@ -616,7 +616,7 @@ r.f"> */ /* > \ingroup realSYauxiliary */ /* ===================================================================== */ -/* Subroutine */ int ssyswapr_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void ssyswapr_(char *uplo, integer *n, real *a, integer *lda, integer *i1, integer *i2) { /* System generated locals */ @@ -626,7 +626,7 @@ r.f"> */ integer i__; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *); real tmp; @@ -712,6 +712,6 @@ r.f"> */ } } - return 0; + return; } /* ssyswapr_ */ diff --git a/lapack-netlib/SRC/ssyswapr.f b/lapack-netlib/SRC/ssyswapr.f index 5e4265d7a8..e1ab5a22ac 100644 --- a/lapack-netlib/SRC/ssyswapr.f +++ b/lapack-netlib/SRC/ssyswapr.f @@ -57,16 +57,14 @@ *> *> \param[in,out] A *> \verbatim -*> A is REAL array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers -*> used to obtain the factor U or L as computed by SSYTRF. -*> -*> On exit, if INFO = 0, the (symmetric) inverse of the original -*> matrix. If UPLO = 'U', the upper triangular part of the -*> inverse is formed and the part of A below the diagonal is not -*> referenced; if UPLO = 'L' the lower triangular part of the -*> inverse is formed and the part of A above the diagonal is -*> not referenced. +*> A is REAL array, dimension (LDA,*) +*> On entry, the N-by-N matrix A. On exit, the permuted matrix +*> where the rows I1 and I2 and columns I1 and I2 are interchanged. 
+*> If UPLO = 'U', the interchanges are applied to the upper +*> triangular part and the strictly lower triangular part of A is +*> not referenced; if UPLO = 'L', the interchanges are applied to +*> the lower triangular part and the part of A above the diagonal +*> is not referenced. *> \endverbatim *> *> \param[in] LDA @@ -109,14 +107,13 @@ SUBROUTINE SSYSWAPR( UPLO, N, A, LDA, I1, I2) INTEGER I1, I2, LDA, N * .. * .. Array Arguments .. - REAL A( LDA, N ) + REAL A( LDA, * ) * * ===================================================================== * * .. * .. Local Scalars .. LOGICAL UPPER - INTEGER I REAL TMP * * .. External Functions .. @@ -143,19 +140,12 @@ SUBROUTINE SSYSWAPR( UPLO, N, A, LDA, I1, I2) A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1,I1+I) - A(I1,I1+I)=A(I1+I,I2) - A(I1+I,I2)=TMP - END DO + CALL SSWAP( I2-I1-1, A(I1,I1+1), LDA, A(I1+1,I2), 1 ) * * third swap * - swap row I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I1,I) - A(I1,I)=A(I2,I) - A(I2,I)=TMP - END DO + IF ( I2.LT.N ) + $ CALL SSWAP( N-I2, A(I1,I2+1), LDA, A(I2,I2+1), LDA ) * ELSE * @@ -171,19 +161,12 @@ SUBROUTINE SSYSWAPR( UPLO, N, A, LDA, I1, I2) A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1+I,I1) - A(I1+I,I1)=A(I2,I1+I) - A(I2,I1+I)=TMP - END DO + CALL SSWAP( I2-I1-1, A(I1+1,I1), 1, A(I2,I1+1), LDA ) * * third swap * - swap col I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I,I1) - A(I,I1)=A(I,I2) - A(I,I2)=TMP - END DO + IF ( I2.LT.N ) + $ CALL SSWAP( N-I2, A(I2+1,I1), 1, A(I2+1,I2), 1 ) * ENDIF END SUBROUTINE SSYSWAPR diff --git a/lapack-netlib/SRC/ssytd2.c b/lapack-netlib/SRC/ssytd2.c index 3926db4a85..bb0046680a 100644 --- a/lapack-netlib/SRC/ssytd2.c +++ b/lapack-netlib/SRC/ssytd2.c @@ -689,7 +689,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ssytd2_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void ssytd2_(char *uplo, integer *n, real 
*a, integer *lda, real *d__, real *e, real *tau, integer *info) { /* System generated locals */ @@ -699,16 +699,16 @@ f"> */ real taui; extern real sdot_(integer *, real *, integer *, real *, integer *); integer i__; - extern /* Subroutine */ int ssyr2_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssyr2_(char *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); real alpha; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + extern /* Subroutine */ void saxpy_(integer *, real *, real *, integer *, real *, integer *), ssymv_(char *, integer *, real *, real *, - integer *, real *, integer *, real *, real *, integer *), - xerbla_(char *, integer *, ftnlen), slarfg_(integer *, real *, - real *, integer *, real *); + integer *, real *, integer *, real *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void slarfg_(integer *, real *, real *, integer *, real *); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -743,13 +743,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYTD2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } if (upper) { @@ -852,7 +852,7 @@ f"> */ d__[*n] = a[*n + *n * a_dim1]; } - return 0; + return; /* End of SSYTD2 */ diff --git a/lapack-netlib/SRC/ssytf2.c b/lapack-netlib/SRC/ssytf2.c index 1026e58d91..7347525c9a 100644 --- a/lapack-netlib/SRC/ssytf2.c +++ b/lapack-netlib/SRC/ssytf2.c @@ -709,7 +709,7 @@ f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssytf2_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void ssytf2_(char *uplo, integer *n, real *a, integer *lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -718,15 +718,15 @@ f"> */ /* Local variables */ integer imax, jmax; - extern /* Subroutine */ 
int ssyr_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssyr_(char *, integer *, real *, real *, integer *, real *, integer *); integer i__, j, k; real t, alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer kstep; logical upper; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *); real r1, d11, d12, d21, d22; integer kk, kp; @@ -768,7 +768,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYTF2", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. */ @@ -1169,7 +1169,7 @@ f"> */ L70: - return 0; + return; /* End of SSYTF2 */ diff --git a/lapack-netlib/SRC/ssytf2_rk.c b/lapack-netlib/SRC/ssytf2_rk.c index 4ce53fd50e..9cd7f07d89 100644 --- a/lapack-netlib/SRC/ssytf2_rk.c +++ b/lapack-netlib/SRC/ssytf2_rk.c @@ -755,7 +755,7 @@ rk.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssytf2_rk_(char *uplo, integer *n, real *a, integer * +/* Subroutine */ void ssytf2_rk_(char *uplo, integer *n, real *a, integer * lda, real *e, integer *ipiv, integer *info) { /* System generated locals */ @@ -765,17 +765,17 @@ rk.f"> */ /* Local variables */ logical done; integer imax, jmax; - extern /* Subroutine */ int ssyr_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssyr_(char *, integer *, real *, real *, integer *, real *, integer *); integer i__, j, k, p; real t, alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); real sfmin; integer itemp, kstep; real stemp; logical upper; - extern /* Subroutine */ int 
sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *); real d11, d12, d21, d22; integer ii, kk, kp; @@ -817,7 +817,7 @@ rk.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYTF2_RK", &i__1, (ftnlen)9); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. */ @@ -1532,7 +1532,7 @@ rk.f"> */ ; } - return 0; + return; /* End of SSYTF2_RK */ diff --git a/lapack-netlib/SRC/ssytf2_rook.c b/lapack-netlib/SRC/ssytf2_rook.c index eb6ab57d67..ce244ceb55 100644 --- a/lapack-netlib/SRC/ssytf2_rook.c +++ b/lapack-netlib/SRC/ssytf2_rook.c @@ -708,7 +708,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssytf2_rook_(char *uplo, integer *n, real *a, integer * +/* Subroutine */ void ssytf2_rook_(char *uplo, integer *n, real *a, integer * lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -718,17 +718,17 @@ rook.f"> */ /* Local variables */ logical done; integer imax, jmax; - extern /* Subroutine */ int ssyr_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssyr_(char *, integer *, real *, real *, integer *, real *, integer *); integer i__, j, k, p; real t, alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); real sfmin; integer itemp, kstep; real stemp; logical upper; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *); real d11, d12, d21, d22; integer ii, kk, kp; @@ -769,7 +769,7 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYTF2_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. 
*/ @@ -1388,7 +1388,7 @@ rook.f"> */ L70: - return 0; + return; /* End of SSYTF2_ROOK */ diff --git a/lapack-netlib/SRC/ssytrd.c b/lapack-netlib/SRC/ssytrd.c index d491e6cc14..f2b2130035 100644 --- a/lapack-netlib/SRC/ssytrd.c +++ b/lapack-netlib/SRC/ssytrd.c @@ -710,7 +710,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ssytrd_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void ssytrd_(char *uplo, integer *n, real *a, integer *lda, real *d__, real *e, real *tau, real *work, integer *lwork, integer * info) { @@ -723,7 +723,7 @@ f"> */ integer nbmin, iinfo; logical upper; integer nb, kk; - extern /* Subroutine */ int ssytd2_(char *, integer *, real *, integer *, + extern /* Subroutine */ void ssytd2_(char *, integer *, real *, integer *, real *, real *, real *, integer *), ssyr2k_(char *, char * , integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); @@ -731,7 +731,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slatrd_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slatrd_(char *, integer *, integer *, real *, integer *, real *, real *, real *, integer *); integer ldwork, lwkopt; logical lquery; @@ -785,16 +785,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRD", &i__1,(ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { work[1] = 1.f; - return 0; + return; } nx = *n; @@ -921,7 +921,7 @@ f"> */ } work[1] = (real) lwkopt; - return 0; + return; /* End of SSYTRD */ diff --git a/lapack-netlib/SRC/ssytrd_2stage.c b/lapack-netlib/SRC/ssytrd_2stage.c index 35bbb6c7c6..2242619b59 100644 --- a/lapack-netlib/SRC/ssytrd_2stage.c +++ 
b/lapack-netlib/SRC/ssytrd_2stage.c @@ -740,7 +740,7 @@ static integer c__4 = 4; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ssytrd_2stage_(char *vect, char *uplo, integer *n, real +/* Subroutine */ void ssytrd_2stage_(char *vect, char *uplo, integer *n, real *a, integer *lda, real *d__, real *e, real *tau, real *hous2, integer *lhous2, real *work, integer *lwork, integer *info) { @@ -752,7 +752,7 @@ static integer c__4 = 4; extern integer ilaenv2stage_(integer *, char *, char *, integer *, integer *, integer *, integer *); integer lwrk, wpos; - extern /* Subroutine */ int ssytrd_sb2st_(char *, char *, char *, + extern /* Subroutine */ void ssytrd_sb2st_(char *, char *, char *, integer *, integer *, real *, integer *, real *, real *, real *, integer *, real *, integer *, integer *), ssytrd_sy2sb_(char *, integer *, integer *, real *, integer *, @@ -823,16 +823,16 @@ static integer c__4 = 4; if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRD_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { work[1] = 1.f; - return 0; + return; } /* Determine pointer position */ @@ -846,20 +846,20 @@ static integer c__4 = 4; if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRD_SY2SB", &i__1, (ftnlen)12); - return 0; + return; } ssytrd_sb2st_("Y", vect, uplo, n, &kd, &work[abpos], &ldab, &d__[1], &e[ 1], &hous2[1], lhous2, &work[wpos], &lwrk, info); if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRD_SB2ST", &i__1, (ftnlen)12); - return 0; + return; } hous2[1] = (real) lhmin; work[1] = (real) lwmin; - return 0; + return; /* End of SSYTRD_2STAGE */ diff --git a/lapack-netlib/SRC/ssytrd_sb2st.c b/lapack-netlib/SRC/ssytrd_sb2st.c index 83dfe6954b..c621673c62 100644 --- a/lapack-netlib/SRC/ssytrd_sb2st.c +++ b/lapack-netlib/SRC/ssytrd_sb2st.c @@ -746,7 +746,7 @@ sb2t.f"> */ /* > \endverbatim */ /* > */ /* 
===================================================================== */ -/* Subroutine */ int ssytrd_sb2st_(char *stage1, char *vect, char *uplo, +/* Subroutine */ void ssytrd_sb2st_(char *stage1, char *vect, char *uplo, integer *n, integer *kd, real *ab, integer *ldab, real *d__, real *e, real *hous, integer *lhous, real *work, integer *lwork, integer *info) { @@ -766,14 +766,14 @@ sb2t.f"> */ integer sisev, grsiz, ttype, stepercol, ed, ib, st, abdpos; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer thgrid, thgrnb, indtau, ofdpos; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *), ssb2st_kernels_(char *, logical *, integer *, integer *, integer *, integer *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *); integer blklastind; - extern /* Subroutine */ int mecago_(); + extern /* Subroutine */ void mecago_(); logical lquery, afters1; integer lda, tid, ldv, stt, sweepid, nbtiles, sizetau, thgrsiz; @@ -841,9 +841,9 @@ sb2t.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRD_SB2ST", &i__1, (ftnlen)12); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -851,7 +851,7 @@ sb2t.f"> */ if (*n == 0) { hous[1] = 1.f; work[1] = 1.f; - return 0; + return; } /* Determine pointer position */ @@ -904,7 +904,7 @@ sb2t.f"> */ hous[1] = 1.f; work[1] = 1.f; - return 0; + return; } /* Case KD=1: */ @@ -940,7 +940,7 @@ sb2t.f"> */ hous[1] = 1.f; work[1] = 1.f; - return 0; + return; } /* Main code start here. 
*/ @@ -1059,7 +1059,7 @@ sb2t.f"> */ hous[1] = (real) lhmin; work[1] = (real) lwmin; - return 0; + return; /* End of SSYTRD_SB2ST */ diff --git a/lapack-netlib/SRC/ssytrd_sy2sb.c b/lapack-netlib/SRC/ssytrd_sy2sb.c index 2fcd3c688e..096c0aea67 100644 --- a/lapack-netlib/SRC/ssytrd_sy2sb.c +++ b/lapack-netlib/SRC/ssytrd_sy2sb.c @@ -761,7 +761,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ssytrd_sy2sb_(char *uplo, integer *n, integer *kd, real +/* Subroutine */ void ssytrd_sy2sb_(char *uplo, integer *n, integer *kd, real *a, integer *lda, real *ab, integer *ldab, real *tau, real *work, integer *lwork, integer *info) { @@ -775,20 +775,21 @@ f"> */ integer tpos, wpos, s1pos, s2pos, i__, j; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer lwmin; logical upper; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), ssymm_(char *, char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer lk, pk; - extern /* Subroutine */ int ssyr2k_(char *, char *, integer *, integer *, + extern /* Subroutine */ void ssyr2k_(char *, char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer pn, lt, lw; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), sgelqf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void sgelqf_( integer *, integer *, real *, integer *, real *, real *, integer * , integer *), sgeqrf_(integer *, integer *, real *, integer *, real *, real *, integer *, integer *), slarft_(char 
*, char *, @@ -849,10 +850,10 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRD_SY2SB", &i__1, (ftnlen)12); - return 0; + return; } else if (lquery) { work[1] = (real) lwmin; - return 0; + return; } /* Quick return if possible */ @@ -881,7 +882,7 @@ f"> */ } } work[1] = 1.f; - return 0; + return; } /* Determine the pointer position for the workspace */ @@ -1073,7 +1074,7 @@ f"> */ } work[1] = (real) lwmin; - return 0; + return; /* End of SSYTRD_SY2SB */ diff --git a/lapack-netlib/SRC/ssytrf.c b/lapack-netlib/SRC/ssytrf.c index 0b59841455..b69d20d79a 100644 --- a/lapack-netlib/SRC/ssytrf.c +++ b/lapack-netlib/SRC/ssytrf.c @@ -697,7 +697,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ssytrf_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void ssytrf_(char *uplo, integer *n, real *a, integer *lda, integer *ipiv, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -709,11 +709,12 @@ f"> */ integer nbmin, iinfo; logical upper; integer kb, nb; - extern /* Subroutine */ int ssytf2_(char *, integer *, real *, integer *, - integer *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void ssytf2_(char *, integer *, real *, integer *, + integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slasyf_(char *, integer *, integer *, integer + extern /* Subroutine */ void slasyf_(char *, integer *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); integer ldwork, lwkopt; logical lquery; @@ -765,9 +766,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } nbmin = 2; @@ -894,7 +895,7 @@ f"> */ L40: work[1] = (real) lwkopt; - return 
0; + return; /* End of SSYTRF */ diff --git a/lapack-netlib/SRC/ssytrf_aa.c b/lapack-netlib/SRC/ssytrf_aa.c index f331fb373e..f66e03453f 100644 --- a/lapack-netlib/SRC/ssytrf_aa.c +++ b/lapack-netlib/SRC/ssytrf_aa.c @@ -648,7 +648,7 @@ aa.f"> */ /* > \ingroup realSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int ssytrf_aa_(char *uplo, integer *n, real *a, integer * +/* Subroutine */ void ssytrf_aa_(char *uplo, integer *n, real *a, integer * lda, integer *ipiv, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -658,7 +658,7 @@ aa.f"> */ integer j; real alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), slasyf_aa_(char *, integer *, integer *, integer *, real *, integer *, integer *, real *, integer *, real * @@ -666,7 +666,7 @@ aa.f"> */ integer *, real *, integer *, real *, real *, integer *); logical upper; integer k1, k2, j1, j2, j3; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ); integer jb, nb, mj, nj; @@ -727,19 +727,19 @@ aa.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRF_AA", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return */ if (*n == 0) { - return 0; + return; } ipiv[1] = 1; if (*n == 1) { - return 0; + return; } /* Adjust block size based on the workspace size */ @@ -1027,7 +1027,7 @@ aa.f"> */ } L20: - return 0; + return; /* End of SSYTRF_AA */ diff --git a/lapack-netlib/SRC/ssytrf_aa_2stage.c b/lapack-netlib/SRC/ssytrf_aa_2stage.c index 
4f25c02bb1..2aa0a9d231 100644 --- a/lapack-netlib/SRC/ssytrf_aa_2stage.c +++ b/lapack-netlib/SRC/ssytrf_aa_2stage.c @@ -676,7 +676,7 @@ aa_2stage.f"> */ /* > \ingroup realSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int ssytrf_aa_2stage_(char *uplo, integer *n, real *a, +/* Subroutine */ void ssytrf_aa_2stage_(char *uplo, integer *n, real *a, integer *lda, real *tb, integer *ltb, integer *ipiv, integer *ipiv2, real *work, integer *lwork, integer *info) { @@ -687,13 +687,13 @@ aa_2stage.f"> */ integer ldtb, i__, j, k; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer i1; logical upper; integer i2; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ), strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer *); @@ -701,14 +701,15 @@ aa_2stage.f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int sgbtrf_(integer *, integer *, integer *, - integer *, real *, integer *, integer *, integer *), sgetrf_( - integer *, integer *, real *, integer *, integer *, integer *), - slacpy_(char *, integer *, integer *, real *, integer *, real *, + extern /* Subroutine */ void sgbtrf_(integer *, integer *, integer *, + integer *, real *, integer *, integer *, integer *); + extern int sgetrf_( + integer *, integer *, real *, integer *, integer *, integer *); + extern void slacpy_(char *, integer *, integer *, 
real *, integer *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *); logical tquery, wquery; - extern /* Subroutine */ int ssygst_(integer *, char *, integer *, real *, + extern /* Subroutine */ void ssygst_(integer *, char *, integer *, real *, integer *, real *, integer *, integer *); real piv; @@ -754,7 +755,7 @@ aa_2stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRF_AA_2STAGE", &i__1, (ftnlen)16); - return 0; + return; } /* Answer the query */ @@ -770,13 +771,13 @@ aa_2stage.f"> */ } } if (tquery || wquery) { - return 0; + return; } /* Quick return */ if (*n == 0) { - return 0; + return; } /* Determine the number of the block size */ @@ -1231,7 +1232,7 @@ aa_2stage.f"> */ /* Factor the band matrix */ sgbtrf_(n, n, &nb, &nb, &tb[1], &ldtb, &ipiv2[1], info); - return 0; + return; /* End of SSYTRF_AA_2STAGE */ diff --git a/lapack-netlib/SRC/ssytrf_rk.c b/lapack-netlib/SRC/ssytrf_rk.c index 93db39123e..1eb83fb314 100644 --- a/lapack-netlib/SRC/ssytrf_rk.c +++ b/lapack-netlib/SRC/ssytrf_rk.c @@ -774,7 +774,7 @@ rk.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssytrf_rk_(char *uplo, integer *n, real *a, integer * +/* Subroutine */ void ssytrf_rk_(char *uplo, integer *n, real *a, integer * lda, real *e, integer *ipiv, real *work, integer *lwork, integer * info) { @@ -785,10 +785,10 @@ rk.f"> */ integer i__, k; extern logical lsame_(char *, char *); integer nbmin, iinfo; - extern /* Subroutine */ int ssytf2_rk_(char *, integer *, real *, + extern /* Subroutine */ void ssytf2_rk_(char *, integer *, real *, integer *, real *, integer *, integer *); logical upper; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *), slasyf_rk_(char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, 
integer * ); @@ -847,9 +847,9 @@ rk.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRF_RK", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } nbmin = 2; @@ -1035,7 +1035,7 @@ rk.f"> */ } work[1] = (real) lwkopt; - return 0; + return; /* End of SSYTRF_RK */ diff --git a/lapack-netlib/SRC/ssytrf_rook.c b/lapack-netlib/SRC/ssytrf_rook.c index e0818bb808..1091438e4e 100644 --- a/lapack-netlib/SRC/ssytrf_rook.c +++ b/lapack-netlib/SRC/ssytrf_rook.c @@ -723,7 +723,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssytrf_rook_(char *uplo, integer *n, real *a, integer * +/* Subroutine */ void ssytrf_rook_(char *uplo, integer *n, real *a, integer * lda, integer *ipiv, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -741,7 +741,7 @@ rook.f"> */ integer ldwork, lwkopt; logical lquery; integer iws; - extern /* Subroutine */ int ssytf2_rook_(char *, integer *, real *, + extern /* Subroutine */ void ssytf2_rook_(char *, integer *, real *, integer *, integer *, integer *), slasyf_rook_(char *, integer *, integer *, integer *, real *, integer *, integer *, real *, integer *, integer *); @@ -794,9 +794,9 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRF_ROOK", &i__1, (ftnlen)11); - return 0; + return; } else if (lquery) { - return 0; + return; } nbmin = 2; @@ -926,7 +926,7 @@ rook.f"> */ L40: work[1] = (real) lwkopt; - return 0; + return; /* End of SSYTRF_ROOK */ diff --git a/lapack-netlib/SRC/ssytri.c b/lapack-netlib/SRC/ssytri.c index fc59afecf7..33ef9d5815 100644 --- a/lapack-netlib/SRC/ssytri.c +++ b/lapack-netlib/SRC/ssytri.c @@ -629,7 +629,7 @@ f"> */ /* > \ingroup realSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int ssytri_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void ssytri_(char *uplo, integer *n, real *a, 
integer *lda, integer *ipiv, real *work, integer *info) { /* System generated locals */ @@ -645,7 +645,7 @@ f"> */ extern logical lsame_(char *, char *); integer kstep; logical upper; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ), ssymv_(char *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); @@ -686,13 +686,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -703,7 +703,7 @@ f"> */ for (*info = *n; *info >= 1; --(*info)) { if (ipiv[*info] > 0 && a[*info + *info * a_dim1] == 0.f) { - return 0; + return; } /* L10: */ } @@ -714,7 +714,7 @@ f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (ipiv[*info] > 0 && a[*info + *info * a_dim1] == 0.f) { - return 0; + return; } /* L20: */ } @@ -939,7 +939,7 @@ f"> */ ; } - return 0; + return; /* End of SSYTRI */ diff --git a/lapack-netlib/SRC/ssytri2.c b/lapack-netlib/SRC/ssytri2.c index 1c0ce40327..1ab96b18e7 100644 --- a/lapack-netlib/SRC/ssytri2.c +++ b/lapack-netlib/SRC/ssytri2.c @@ -641,14 +641,14 @@ static integer c_n1 = -1; /* > \ingroup realSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int ssytri2_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void ssytri2_(char *uplo, integer *n, real *a, integer *lda, integer *ipiv, real *work, integer *lwork, integer *info) { /* System generated locals */ integer a_dim1, a_offset, i__1; /* Local variables */ - extern /* Subroutine */ int ssytri2x_(char *, integer *, real *, integer * + extern /* Subroutine */ void ssytri2x_(char *, integer *, real *, integer * , integer *, real *, integer *, 
integer *); extern logical lsame_(char *, char *); integer nbmax; @@ -657,7 +657,7 @@ static integer c_n1 = -1; extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); logical lquery; - extern /* Subroutine */ int ssytri_(char *, integer *, real *, integer *, + extern /* Subroutine */ void ssytri_(char *, integer *, real *, integer *, integer *, real *, integer *); integer minsize; @@ -709,13 +709,13 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRI2", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { work[1] = (real) minsize; - return 0; + return; } if (*n == 0) { - return 0; + return; } if (nbmax >= *n) { ssytri_(uplo, n, &a[a_offset], lda, &ipiv[1], &work[1], info); @@ -723,7 +723,7 @@ static integer c_n1 = -1; ssytri2x_(uplo, n, &a[a_offset], lda, &ipiv[1], &work[1], &nbmax, info); } - return 0; + return; /* End of SSYTRI2 */ diff --git a/lapack-netlib/SRC/ssytri2x.c b/lapack-netlib/SRC/ssytri2x.c index 31d25c85ff..0caf344c58 100644 --- a/lapack-netlib/SRC/ssytri2x.c +++ b/lapack-netlib/SRC/ssytri2x.c @@ -634,7 +634,7 @@ x.f"> */ /* > \ingroup realSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int ssytri2x_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void ssytri2x_(char *uplo, integer *n, real *a, integer *lda, integer *ipiv, real *work, integer *nb, integer *info) { /* System generated locals */ @@ -644,28 +644,29 @@ x.f"> */ integer invd; real akkp1, d__; integer i__, j, k; - extern /* Subroutine */ int ssyswapr_(char *, integer *, real *, integer * + extern /* Subroutine */ void ssyswapr_(char *, integer *, real *, integer * , integer *, integer *); real t; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, 
real *, integer *, real *, integer *, real *, real *, integer *); integer count; logical upper; - extern /* Subroutine */ int strmm_(char *, char *, char *, char *, + extern /* Subroutine */ void strmm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ); real ak, u01_i_j__; integer u11; real u11_i_j__; integer ip; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), strtri_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int strtri_( char *, char *, integer *, real *, integer *, integer *); integer nnb, cut; real akp1, u01_ip1_j__, u11_ip1_j__; - extern /* Subroutine */ int ssyconv_(char *, char *, integer *, real *, + extern /* Subroutine */ void ssyconv_(char *, char *, integer *, real *, integer *, integer *, real *, integer *); @@ -706,10 +707,10 @@ x.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRI2X", &i__1, (ftnlen)8); - return 0; + return; } if (*n == 0) { - return 0; + return; } /* Convert A */ @@ -726,7 +727,7 @@ x.f"> */ for (*info = *n; *info >= 1; --(*info)) { if (ipiv[*info] > 0 && a[*info + *info * a_dim1] == 0.f) { - return 0; + return; } } } else { @@ -736,7 +737,7 @@ x.f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (ipiv[*info] > 0 && a[*info + *info * a_dim1] == 0.f) { - return 0; + return; } } } @@ -1208,7 +1209,7 @@ x.f"> */ } } - return 0; + return; /* End of SSYTRI2X */ diff --git a/lapack-netlib/SRC/ssytri_3.c b/lapack-netlib/SRC/ssytri_3.c index e1a0a39e85..0b393594c4 100644 --- a/lapack-netlib/SRC/ssytri_3.c +++ b/lapack-netlib/SRC/ssytri_3.c @@ -683,7 +683,7 @@ static integer c_n1 = -1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssytri_3_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void ssytri_3_(char *uplo, integer *n, real *a, integer *lda, real *e, integer *ipiv, real *work, integer *lwork, integer *info) { /* System 
generated locals */ @@ -691,7 +691,7 @@ static integer c_n1 = -1; /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int ssytri_3x_(char *, integer *, real *, + extern /* Subroutine */ void ssytri_3x_(char *, integer *, real *, integer *, real *, integer *, real *, integer *, integer *); logical upper; integer nb; @@ -747,16 +747,16 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRI_3", &i__1, (ftnlen)8); - return 0; + return; } else if (lquery) { work[1] = (real) lwkopt; - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } ssytri_3x_(uplo, n, &a[a_offset], lda, &e[1], &ipiv[1], &work[1], &nb, @@ -764,7 +764,7 @@ static integer c_n1 = -1; work[1] = (real) lwkopt; - return 0; + return; /* End of SSYTRI_3 */ diff --git a/lapack-netlib/SRC/ssytri_3x.c b/lapack-netlib/SRC/ssytri_3x.c index 99378323cb..e86c6e03ec 100644 --- a/lapack-netlib/SRC/ssytri_3x.c +++ b/lapack-netlib/SRC/ssytri_3x.c @@ -673,7 +673,7 @@ static real c_b14 = 0.f; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssytri_3x_(char *uplo, integer *n, real *a, integer * +/* Subroutine */ void ssytri_3x_(char *uplo, integer *n, real *a, integer * lda, real *e, integer *ipiv, real *work, integer *nb, integer *info) { /* System generated locals */ @@ -683,15 +683,15 @@ static real c_b14 = 0.f; integer invd; real akkp1, d__; integer i__, j, k; - extern /* Subroutine */ int ssyswapr_(char *, integer *, real *, integer * + extern /* Subroutine */ void ssyswapr_(char *, integer *, real *, integer * , integer *, integer *); real t; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); logical upper; - extern /* Subroutine 
*/ int strmm_(char *, char *, char *, char *, + extern /* Subroutine */ void strmm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ); real ak, u01_i_j__; @@ -743,10 +743,10 @@ static real c_b14 = 0.f; if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRI_3X", &i__1, (ftnlen)9); - return 0; + return; } if (*n == 0) { - return 0; + return; } /* Workspace got Non-diag elements of D */ @@ -764,7 +764,7 @@ static real c_b14 = 0.f; for (*info = *n; *info >= 1; --(*info)) { if (ipiv[*info] > 0 && a[*info + *info * a_dim1] == 0.f) { - return 0; + return; } } } else { @@ -774,7 +774,7 @@ static real c_b14 = 0.f; i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (ipiv[*info] > 0 && a[*info + *info * a_dim1] == 0.f) { - return 0; + return; } } } @@ -1257,7 +1257,7 @@ static real c_b14 = 0.f; } - return 0; + return; /* End of SSYTRI_3X */ diff --git a/lapack-netlib/SRC/ssytri_rook.c b/lapack-netlib/SRC/ssytri_rook.c index 36b8a5501f..80a68dc1d1 100644 --- a/lapack-netlib/SRC/ssytri_rook.c +++ b/lapack-netlib/SRC/ssytri_rook.c @@ -644,7 +644,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssytri_rook_(char *uplo, integer *n, real *a, integer * +/* Subroutine */ void ssytri_rook_(char *uplo, integer *n, real *a, integer * lda, integer *ipiv, real *work, integer *info) { /* System generated locals */ @@ -660,7 +660,7 @@ rook.f"> */ extern logical lsame_(char *, char *); integer kstep; logical upper; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), sswap_(integer *, real *, integer *, real *, integer * ), ssymv_(char *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); @@ -701,13 +701,13 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRI_ROOK", &i__1, (ftnlen)11); - return 0; + 
return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -718,7 +718,7 @@ rook.f"> */ for (*info = *n; *info >= 1; --(*info)) { if (ipiv[*info] > 0 && a[*info + *info * a_dim1] == 0.f) { - return 0; + return; } /* L10: */ } @@ -729,7 +729,7 @@ rook.f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (ipiv[*info] > 0 && a[*info + *info * a_dim1] == 0.f) { - return 0; + return; } /* L20: */ } @@ -1028,7 +1028,7 @@ rook.f"> */ ; } - return 0; + return; /* End of SSYTRI_ROOK */ diff --git a/lapack-netlib/SRC/ssytrs.c b/lapack-netlib/SRC/ssytrs.c index 13c79835b3..132afb659d 100644 --- a/lapack-netlib/SRC/ssytrs.c +++ b/lapack-netlib/SRC/ssytrs.c @@ -635,7 +635,7 @@ f"> */ /* > \ingroup realSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int ssytrs_(char *uplo, integer *n, integer *nrhs, real *a, +/* Subroutine */ void ssytrs_(char *uplo, integer *n, integer *nrhs, real *a, integer *lda, integer *ipiv, real *b, integer *ldb, integer *info) { /* System generated locals */ @@ -643,17 +643,17 @@ f"> */ real r__1; /* Local variables */ - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); real akm1k; integer j, k; extern logical lsame_(char *, char *); real denom; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); logical upper; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *); real ak, bk; integer kp; @@ -696,13 +696,13 @@ f"> */ if (*info != 0) { i__1 = 
-(*info); xerbla_("SSYTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -1000,7 +1000,7 @@ f"> */ ; } - return 0; + return; /* End of SSYTRS */ diff --git a/lapack-netlib/SRC/ssytrs2.c b/lapack-netlib/SRC/ssytrs2.c index dddf9841e2..0ed7266936 100644 --- a/lapack-netlib/SRC/ssytrs2.c +++ b/lapack-netlib/SRC/ssytrs2.c @@ -644,7 +644,7 @@ static real c_b10 = 1.f; /* > \ingroup realSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int ssytrs2_(char *uplo, integer *n, integer *nrhs, real *a, +/* Subroutine */ void ssytrs2_(char *uplo, integer *n, integer *nrhs, real *a, integer *lda, integer *ipiv, real *b, integer *ldb, real *work, integer *info) { @@ -658,16 +658,16 @@ static real c_b10 = 1.f; extern logical lsame_(char *, char *); real denom; integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); logical upper; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *), strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer *); real ak, bk; integer kp; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real akm1, bkm1; - extern /* Subroutine */ int ssyconv_(char *, char *, integer *, real *, + extern /* Subroutine */ void ssyconv_(char *, char *, integer *, real *, integer *, integer *, real *, integer *); @@ -707,13 +707,13 @@ static real c_b10 = 1.f; if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRS2", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* Convert A */ @@ -898,7 +898,7 @@ static real c_b10 = 1.f; ssyconv_(uplo, "R", n, &a[a_offset], lda, 
&ipiv[1], &work[1], &iinfo); - return 0; + return; /* End of SSYTRS2 */ diff --git a/lapack-netlib/SRC/ssytrs_3.c b/lapack-netlib/SRC/ssytrs_3.c index f93908c363..a4e5b1d093 100644 --- a/lapack-netlib/SRC/ssytrs_3.c +++ b/lapack-netlib/SRC/ssytrs_3.c @@ -677,7 +677,7 @@ static real c_b9 = 1.f; /* > \endverbatim */ /* ==================================================================== */ -/* Subroutine */ int ssytrs_3_(char *uplo, integer *n, integer *nrhs, real * +/* Subroutine */ void ssytrs_3_(char *uplo, integer *n, integer *nrhs, real * a, integer *lda, real *e, integer *ipiv, real *b, integer *ldb, integer *info) { @@ -690,9 +690,9 @@ static real c_b9 = 1.f; integer i__, j, k; extern logical lsame_(char *, char *); real denom; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); logical upper; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *), strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer *); real ak, bk; @@ -737,13 +737,13 @@ static real c_b9 = 1.f; if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRS_3", &i__1, (ftnlen)8); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -895,7 +895,7 @@ static real c_b9 = 1.f; } - return 0; + return; /* End of SSYTRS_3 */ diff --git a/lapack-netlib/SRC/ssytrs_aa.c b/lapack-netlib/SRC/ssytrs_aa.c index 52494a0771..8dfac7ad56 100644 --- a/lapack-netlib/SRC/ssytrs_aa.c +++ b/lapack-netlib/SRC/ssytrs_aa.c @@ -644,7 +644,7 @@ aa.f"> */ /* > \ingroup realSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int ssytrs_aa_(char *uplo, integer *n, integer *nrhs, real * +/* Subroutine */ void ssytrs_aa_(char *uplo, integer *n, integer 
*nrhs, real * a, integer *lda, integer *ipiv, real *b, integer *ldb, real *work, integer *lwork, integer *info) { @@ -655,13 +655,14 @@ aa.f"> */ integer k; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *), sgtsv_(integer *, integer *, real *, real *, real *, real *, integer *, integer *), strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer *); integer kp; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slacpy_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slacpy_( char *, integer *, integer *, real *, integer *, real *, integer * ); integer lwkopt; @@ -712,17 +713,17 @@ aa.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRS_AA", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { lwkopt = *n * 3 - 2; work[1] = (real) lwkopt; - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -861,7 +862,7 @@ aa.f"> */ } - return 0; + return; /* End of SSYTRS_AA */ diff --git a/lapack-netlib/SRC/ssytrs_aa_2stage.c b/lapack-netlib/SRC/ssytrs_aa_2stage.c index 3e08626e7d..b9ef411706 100644 --- a/lapack-netlib/SRC/ssytrs_aa_2stage.c +++ b/lapack-netlib/SRC/ssytrs_aa_2stage.c @@ -653,7 +653,7 @@ aa_2stage.f"> */ /* > \ingroup realSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int ssytrs_aa_2stage_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void ssytrs_aa_2stage_(char *uplo, integer *n, integer *nrhs, real *a, integer *lda, real *tb, integer *ltb, integer *ipiv, integer *ipiv2, real *b, integer *ldb, integer *info) { @@ -664,14 +664,15 @@ aa_2stage.f"> */ integer ldtb; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int 
strsm_(char *, char *, char *, char *, + extern /* Subroutine */ void strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ); integer nb; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), sgbtrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void sgbtrs_( char *, integer *, integer *, integer *, integer *, real *, - integer *, integer *, real *, integer *, integer *), - slaswp_(integer *, real *, integer *, integer *, integer *, + integer *, integer *, real *, integer *, integer *); + extern int slaswp_(integer *, real *, integer *, integer *, integer *, integer *, integer *); @@ -715,13 +716,13 @@ aa_2stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRS_AA_2STAGE", &i__1, (ftnlen)16); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* Read NB and compute LDTB */ @@ -806,7 +807,7 @@ aa_2stage.f"> */ } } - return 0; + return; /* End of SSYTRS_AA_2STAGE */ diff --git a/lapack-netlib/SRC/ssytrs_rook.c b/lapack-netlib/SRC/ssytrs_rook.c index 5c1e201f37..bd6926b0b3 100644 --- a/lapack-netlib/SRC/ssytrs_rook.c +++ b/lapack-netlib/SRC/ssytrs_rook.c @@ -650,7 +650,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int ssytrs_rook_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void ssytrs_rook_(char *uplo, integer *n, integer *nrhs, real *a, integer *lda, integer *ipiv, real *b, integer *ldb, integer * info) { @@ -659,17 +659,17 @@ rook.f"> */ real r__1; /* Local variables */ - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); real akm1k; integer j, k; extern logical lsame_(char *, char *); real denom; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer 
*), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); logical upper; - extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, + extern /* Subroutine */ void sswap_(integer *, real *, integer *, real *, integer *); real ak, bk; integer kp; @@ -712,13 +712,13 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("SSYTRS_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -1044,7 +1044,7 @@ rook.f"> */ ; } - return 0; + return; /* End of SSYTRS_ROOK */ diff --git a/lapack-netlib/SRC/stbcon.c b/lapack-netlib/SRC/stbcon.c index ab2f283c9f..6edc905b1d 100644 --- a/lapack-netlib/SRC/stbcon.c +++ b/lapack-netlib/SRC/stbcon.c @@ -655,7 +655,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int stbcon_(char *norm, char *uplo, char *diag, integer *n, +/* Subroutine */ void stbcon_(char *norm, char *uplo, char *diag, integer *n, integer *kd, real *ab, integer *ldab, real *rcond, real *work, integer *iwork, integer *info) { @@ -669,10 +669,10 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; real anorm; - extern /* Subroutine */ int srscl_(integer *, real *, real *, integer *); + extern /* Subroutine */ void srscl_(integer *, real *, real *, integer *); logical upper; real xnorm; - extern /* Subroutine */ int slacn2_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slacn2_(integer *, real *, real *, integer *, real *, integer *, integer *); integer ix; extern real slamch_(char *); @@ -681,7 +681,7 @@ f"> */ extern real slantb_(char *, char *, char *, integer *, integer *, real *, integer *, real *); real ainvnm; - extern /* Subroutine */ int slatbs_(char *, char *, char *, char *, + extern /* 
Subroutine */ void slatbs_(char *, char *, char *, char *, integer *, integer *, real *, integer *, real *, real *, real *, integer *); logical onenrm; @@ -730,14 +730,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STBCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *rcond = 1.f; - return 0; + return; } *rcond = 0.f; @@ -802,7 +802,7 @@ f"> */ } L20: - return 0; + return; /* End of STBCON */ diff --git a/lapack-netlib/SRC/stbrfs.c b/lapack-netlib/SRC/stbrfs.c index 5834f65c1b..bc8a71c15d 100644 --- a/lapack-netlib/SRC/stbrfs.c +++ b/lapack-netlib/SRC/stbrfs.c @@ -701,7 +701,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int stbrfs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void stbrfs_(char *uplo, char *trans, char *diag, integer *n, integer *kd, integer *nrhs, real *ab, integer *ldab, real *b, integer *ldb, real *x, integer *ldx, real *ferr, real *berr, real *work, integer *iwork, integer *info) @@ -719,7 +719,7 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), stbmv_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *), stbsv_(char *, char *, char *, integer *, integer *, real *, @@ -793,7 +793,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STBRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -805,7 +805,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } if (notran) { @@ -1091,7 +1091,7 @@ f"> */ /* L250: */ } - return 0; + return; /* End of STBRFS */ diff --git a/lapack-netlib/SRC/stbtrs.c b/lapack-netlib/SRC/stbtrs.c index 2f9ad2ba47..916c59897a 100644 --- a/lapack-netlib/SRC/stbtrs.c +++ 
b/lapack-netlib/SRC/stbtrs.c @@ -658,7 +658,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int stbtrs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void stbtrs_(char *uplo, char *trans, char *diag, integer *n, integer *kd, integer *nrhs, real *ab, integer *ldab, real *b, integer *ldb, integer *info) { @@ -669,8 +669,9 @@ f"> */ integer j; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int stbsv_(char *, char *, char *, integer *, - integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void stbsv_(char *, char *, char *, integer *, + integer *, real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical nounit; @@ -718,13 +719,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STBTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check for singularity. 
*/ @@ -734,7 +735,7 @@ f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (ab[*kd + 1 + *info * ab_dim1] == 0.f) { - return 0; + return; } /* L10: */ } @@ -742,7 +743,7 @@ f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (ab[*info * ab_dim1 + 1] == 0.f) { - return 0; + return; } /* L20: */ } @@ -759,7 +760,7 @@ f"> */ /* L30: */ } - return 0; + return; /* End of STBTRS */ diff --git a/lapack-netlib/SRC/stfsm.c b/lapack-netlib/SRC/stfsm.c index 97d548d650..8500403487 100644 --- a/lapack-netlib/SRC/stfsm.c +++ b/lapack-netlib/SRC/stfsm.c @@ -790,7 +790,7 @@ static real c_b27 = 1.f; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int stfsm_(char *transr, char *side, char *uplo, char *trans, +/* Subroutine */ void stfsm_(char *transr, char *side, char *uplo, char *trans, char *diag, integer *m, integer *n, real *alpha, real *a, real *b, integer *ldb) { @@ -801,14 +801,15 @@ static real c_b27 = 1.f; integer info, i__, j, k; logical normaltransr, lside; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); logical lower; integer m1, m2, n1, n2; - extern /* Subroutine */ int strsm_(char *, char *, char *, char *, + extern /* Subroutine */ void strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * - ), xerbla_(char *, integer *, ftnlen); + ); + extern int xerbla_(char *, integer *, ftnlen); logical misodd, nisodd, notrans; @@ -855,13 +856,13 @@ static real c_b27 = 1.f; if (info != 0) { i__1 = -info; xerbla_("STFSM ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return when ( (N.EQ.0).OR.(M.EQ.0) ) */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Quick return when ALPHA.EQ.(0D+0) */ @@ -876,7 
+877,7 @@ static real c_b27 = 1.f; } /* L20: */ } - return 0; + return; } if (lside) { @@ -1525,7 +1526,7 @@ static real c_b27 = 1.f; } } - return 0; + return; /* End of STFSM */ diff --git a/lapack-netlib/SRC/stftri.c b/lapack-netlib/SRC/stftri.c index 511ef565c9..50e1ae6fbd 100644 --- a/lapack-netlib/SRC/stftri.c +++ b/lapack-netlib/SRC/stftri.c @@ -715,7 +715,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int stftri_(char *transr, char *uplo, char *diag, integer *n, +/* Subroutine */ void stftri_(char *transr, char *uplo, char *diag, integer *n, real *a, integer *info) { /* System generated locals */ @@ -727,9 +727,10 @@ f"> */ extern logical lsame_(char *, char *); logical lower; integer n1, n2; - extern /* Subroutine */ int strmm_(char *, char *, char *, char *, + extern /* Subroutine */ void strmm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * - ), xerbla_(char *, integer *, ftnlen); + ); + extern int xerbla_(char *, integer *, ftnlen); logical nisodd; extern /* Subroutine */ int strtri_(char *, char *, integer *, real *, integer *, integer *); @@ -762,13 +763,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STFTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* If N is odd, set NISODD = .TRUE. 
*/ @@ -810,7 +811,7 @@ f"> */ strtri_("L", diag, &n1, a, n, info); if (*info > 0) { - return 0; + return; } strmm_("R", "L", "N", diag, &n2, &n1, &c_b13, a, n, &a[n1], n); strtri_("U", diag, &n2, &a[*n], n, info) @@ -819,7 +820,7 @@ f"> */ *info += n1; } if (*info > 0) { - return 0; + return; } strmm_("L", "U", "T", diag, &n2, &n1, &c_b18, &a[*n], n, &a[ n1], n); @@ -833,7 +834,7 @@ f"> */ strtri_("L", diag, &n1, &a[n2], n, info) ; if (*info > 0) { - return 0; + return; } strmm_("L", "L", "T", diag, &n1, &n2, &c_b13, &a[n2], n, a, n); strtri_("U", diag, &n2, &a[n1], n, info) @@ -842,7 +843,7 @@ f"> */ *info += n1; } if (*info > 0) { - return 0; + return; } strmm_("R", "U", "N", diag, &n1, &n2, &c_b18, &a[n1], n, a, n); @@ -859,7 +860,7 @@ f"> */ strtri_("U", diag, &n1, a, &n1, info); if (*info > 0) { - return 0; + return; } strmm_("L", "U", "N", diag, &n1, &n2, &c_b13, a, &n1, &a[n1 * n1], &n1); @@ -868,7 +869,7 @@ f"> */ *info += n1; } if (*info > 0) { - return 0; + return; } strmm_("R", "L", "T", diag, &n1, &n2, &c_b18, &a[1], &n1, &a[ n1 * n1], &n1); @@ -880,7 +881,7 @@ f"> */ strtri_("U", diag, &n1, &a[n2 * n2], &n2, info); if (*info > 0) { - return 0; + return; } strmm_("R", "U", "T", diag, &n2, &n1, &c_b13, &a[n2 * n2], & n2, a, &n2); @@ -889,7 +890,7 @@ f"> */ *info += n1; } if (*info > 0) { - return 0; + return; } strmm_("L", "L", "N", diag, &n2, &n1, &c_b18, &a[n1 * n2], & n2, a, &n2); @@ -914,7 +915,7 @@ f"> */ i__1 = *n + 1; strtri_("L", diag, &k, &a[1], &i__1, info); if (*info > 0) { - return 0; + return; } i__1 = *n + 1; i__2 = *n + 1; @@ -926,7 +927,7 @@ f"> */ *info += k; } if (*info > 0) { - return 0; + return; } i__1 = *n + 1; i__2 = *n + 1; @@ -943,7 +944,7 @@ f"> */ i__1 = *n + 1; strtri_("L", diag, &k, &a[k + 1], &i__1, info); if (*info > 0) { - return 0; + return; } i__1 = *n + 1; i__2 = *n + 1; @@ -955,7 +956,7 @@ f"> */ *info += k; } if (*info > 0) { - return 0; + return; } i__1 = *n + 1; i__2 = *n + 1; @@ -974,7 +975,7 @@ f"> */ strtri_("U", 
diag, &k, &a[k], &k, info); if (*info > 0) { - return 0; + return; } strmm_("L", "U", "N", diag, &k, &k, &c_b13, &a[k], &k, &a[k * (k + 1)], &k); @@ -983,7 +984,7 @@ f"> */ *info += k; } if (*info > 0) { - return 0; + return; } strmm_("R", "L", "T", diag, &k, &k, &c_b18, a, &k, &a[k * (k + 1)], &k) @@ -996,7 +997,7 @@ f"> */ strtri_("U", diag, &k, &a[k * (k + 1)], &k, info); if (*info > 0) { - return 0; + return; } strmm_("R", "U", "T", diag, &k, &k, &c_b13, &a[k * (k + 1)], & k, a, &k); @@ -1005,7 +1006,7 @@ f"> */ *info += k; } if (*info > 0) { - return 0; + return; } strmm_("L", "L", "N", diag, &k, &k, &c_b18, &a[k * k], &k, a, &k); @@ -1013,7 +1014,7 @@ f"> */ } } - return 0; + return; /* End of STFTRI */ diff --git a/lapack-netlib/SRC/stfttp.c b/lapack-netlib/SRC/stfttp.c index 4a06bc8bc9..6b875cf1a0 100644 --- a/lapack-netlib/SRC/stfttp.c +++ b/lapack-netlib/SRC/stfttp.c @@ -697,7 +697,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int stfttp_(char *transr, char *uplo, integer *n, real *arf, +/* Subroutine */ void stfttp_(char *transr, char *uplo, integer *n, real *arf, real *ap, integer *info) { /* System generated locals */ @@ -738,13 +738,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STFTTP", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -753,7 +753,7 @@ f"> */ } else { ap[0] = arf[0]; } - return 0; + return; } /* Size of array ARF(0:NT-1) */ @@ -1057,7 +1057,7 @@ f"> */ } - return 0; + return; /* End of STFTTP */ diff --git a/lapack-netlib/SRC/stfttr.c b/lapack-netlib/SRC/stfttr.c index c8dc6896f4..d199de832b 100644 --- a/lapack-netlib/SRC/stfttr.c +++ b/lapack-netlib/SRC/stfttr.c @@ -706,7 +706,7 @@ f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int stfttr_(char *transr, char *uplo, integer *n, real 
*arf, +/* Subroutine */ void stfttr_(char *transr, char *uplo, integer *n, real *arf, real *a, integer *lda, integer *info) { /* System generated locals */ @@ -755,7 +755,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STFTTR", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -764,7 +764,7 @@ f"> */ if (*n == 1) { a[0] = arf[0]; } - return 0; + return; } /* Size of array ARF(0:nt-1) */ @@ -1034,7 +1034,7 @@ f"> */ } - return 0; + return; /* End of STFTTR */ diff --git a/lapack-netlib/SRC/stgevc.c b/lapack-netlib/SRC/stgevc.c index 05d12b69a2..dfdd9b068b 100644 --- a/lapack-netlib/SRC/stgevc.c +++ b/lapack-netlib/SRC/stgevc.c @@ -812,7 +812,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int stgevc_(char *side, char *howmny, logical *select, +/* Subroutine */ void stgevc_(char *side, char *howmny, logical *select, integer *n, real *s, integer *lds, real *p, integer *ldp, real *vl, integer *ldvl, real *vr, integer *ldvr, integer *mm, integer *m, real *work, integer *info) @@ -826,7 +826,7 @@ f"> */ integer ibeg, ieig, iend; real dmin__, temp, xmax, sump[4] /* was [2][2] */, sums[4] /* was [2][2] */, cim2a, cim2b, cre2a, cre2b; - extern /* Subroutine */ int slag2_(real *, integer *, real *, integer *, + extern /* Subroutine */ void slag2_(real *, integer *, real *, integer *, real *, real *, real *, real *, real *, real *); real temp2, bdiag[2]; integer i__, j; @@ -841,7 +841,7 @@ f"> */ logical compl; real anorm, bnorm; logical compr; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), slaln2_(logical *, integer *, integer *, real *, real *, real *, integer *, real *, real *, real *, integer *, real *, real *, real *, integer *, real *, real *, integer *); @@ -855,7 +855,7 @@ f"> */ 
real bcoefi, ascale, bscale, creala; integer jr; real crealb; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); real bcoefr; integer jw, nw; extern real slamch_(char *); @@ -863,7 +863,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real xscale, bignum; logical ilcomp, ilcplx; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); integer ihwmny; real big; @@ -948,7 +948,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STGEVC", &i__1, (ftnlen)6); - return 0; + return; } /* Count the number of eigenvectors to be computed */ @@ -1017,14 +1017,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STGEVC", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *m = im; if (*n == 0) { - return 0; + return; } /* Machine Constants */ @@ -1216,7 +1216,7 @@ f"> */ bcoefi = -bcoefi; if (bcoefi == 0.f) { *info = je; - return 0; + return; } /* Scale to avoid over/underflow */ @@ -1627,7 +1627,7 @@ f"> */ temp2, &bcoefi); if (bcoefi == 0.f) { *info = je - 1; - return 0; + return; } /* Scale to avoid over/underflow */ @@ -1957,7 +1957,7 @@ f"> */ } } - return 0; + return; /* End of STGEVC */ diff --git a/lapack-netlib/SRC/stgex2.c b/lapack-netlib/SRC/stgex2.c index 9e974032b7..1d07cfe582 100644 --- a/lapack-netlib/SRC/stgex2.c +++ b/lapack-netlib/SRC/stgex2.c @@ -740,7 +740,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int stgex2_(logical *wantq, logical *wantz, integer *n, real +/* Subroutine */ void stgex2_(logical *wantq, logical *wantz, integer *n, real *a, integer *lda, real *b, integer *ldb, real *q, integer *ldq, real * z__, integer *ldz, integer *j1, integer *n1, integer *n2, real *work, integer *lwork, integer *info) @@ -756,49 +756,49 @@ 
f"> */ integer idum; real taul[4], dsum, taur[4], scpy[16] /* was [4][4] */, tcpy[16] /* was [4][4] */; - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); real f, g; integer i__, m; real s[16] /* was [4][4] */, t[16] /* was [4][4] */, scale, bqra21, brqa21; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); real licop[16] /* was [4][4] */; integer linfo; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); real ircop[16] /* was [4][4] */, dnorm; integer iwork[4]; - extern /* Subroutine */ int slagv2_(real *, integer *, real *, integer *, + extern /* Subroutine */ void slagv2_(real *, integer *, real *, integer *, real *, real *, real *, real *, real *, real *, real *), sgeqr2_( integer *, integer *, real *, integer *, real *, real *, integer * ), sgerq2_(integer *, integer *, real *, integer *, real *, real * , integer *); real be[2], ai[2]; - extern /* Subroutine */ int sorg2r_(integer *, integer *, integer *, real + extern /* Subroutine */ void sorg2r_(integer *, integer *, integer *, real *, integer *, real *, real *, integer *), sorgr2_(integer *, integer *, integer *, real *, integer *, real *, real *, integer * ); real ar[2], sa, sb, li[16] /* was [4][4] */; - extern /* Subroutine */ int sorm2r_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sorm2r_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *), sormr2_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer * , real *, integer *); real dscale, ir[16] /* was [4][4] */; - extern /* 
Subroutine */ int stgsy2_(char *, integer *, integer *, integer + extern /* Subroutine */ void stgsy2_(char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, real * , integer *, real *, integer *, real *, integer *, real *, real *, real *, integer *, integer *, integer *); real ss; extern real slamch_(char *); real ws; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slartg_(real *, real *, real *, real *, real *); real thresh; - extern /* Subroutine */ int slaset_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slaset_(char *, integer *, integer *, real *, real *, real *, integer *), slassq_(integer *, real *, integer *, real *, real *); real smlnum; @@ -838,10 +838,10 @@ f"> */ /* Quick return if possible */ if (*n <= 1 || *n1 <= 0 || *n2 <= 0) { - return 0; + return; } if (*n1 > *n || *j1 + *n1 > *n) { - return 0; + return; } m = *n1 + *n2; /* Computing MAX */ @@ -851,7 +851,7 @@ f"> */ /* Computing MAX */ i__1 = *n * m, i__2 = m * m << 1; work[1] = (real) f2cmax(i__1,i__2); - return 0; + return; } weak = FALSE_; @@ -990,7 +990,7 @@ f"> */ /* Exit with INFO = 0 if swap was successfully performed. */ - return 0; + return; } else { @@ -1288,7 +1288,7 @@ f"> */ /* Exit with INFO = 0 if swap was successfully performed. 
*/ - return 0; + return; } @@ -1297,7 +1297,7 @@ f"> */ L70: *info = 1; - return 0; + return; /* End of STGEX2 */ diff --git a/lapack-netlib/SRC/stgexc.c b/lapack-netlib/SRC/stgexc.c index 710bf15bf9..23016212b5 100644 --- a/lapack-netlib/SRC/stgexc.c +++ b/lapack-netlib/SRC/stgexc.c @@ -733,7 +733,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int stgexc_(logical *wantq, logical *wantz, integer *n, real +/* Subroutine */ void stgexc_(logical *wantq, logical *wantz, integer *n, real *a, integer *lda, real *b, integer *ldb, real *q, integer *ldq, real * z__, integer *ldz, integer *ifst, integer *ilst, real *work, integer * lwork, integer *info) @@ -744,10 +744,11 @@ f"> */ /* Local variables */ integer here, lwmin; - extern /* Subroutine */ int stgex2_(logical *, logical *, integer *, real + extern /* Subroutine */ void stgex2_(logical *, logical *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *, integer *, integer *, integer *, real *, integer *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); integer nbnext; logical lquery; integer nbf, nbl; @@ -814,15 +815,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STGEXC", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n <= 1) { - return 0; + return; } /* Determine the first row of the specified block and find out */ @@ -855,7 +856,7 @@ f"> */ } } if (*ifst == *ilst) { - return 0; + return; } if (*ifst < *ilst) { @@ -890,7 +891,7 @@ f"> */ &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } here += nbnext; @@ -919,7 +920,7 @@ f"> */ nbnext, &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } if (nbnext == 1) { @@ -930,7 +931,7 @@ f"> */ &c__1, &work[1], lwork, info); if (*info != 0) { *ilst = here; 
- return 0; + return; } ++here; @@ -950,7 +951,7 @@ f"> */ here, &c__1, &nbnext, &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } here += 2; } else { @@ -962,7 +963,7 @@ f"> */ here, &c__1, &c__1, &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } ++here; stgex2_(wantq, wantz, n, &a[a_offset], lda, &b[b_offset], @@ -970,7 +971,7 @@ f"> */ here, &c__1, &c__1, &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } ++here; } @@ -1003,7 +1004,7 @@ f"> */ &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } here -= nbnext; @@ -1032,7 +1033,7 @@ f"> */ c__1, &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } if (nbnext == 1) { @@ -1043,7 +1044,7 @@ f"> */ nbnext, &c__1, &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } --here; } else { @@ -1063,7 +1064,7 @@ f"> */ i__1, &c__2, &c__1, &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } here += -2; } else { @@ -1075,7 +1076,7 @@ f"> */ here, &c__1, &c__1, &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } --here; stgex2_(wantq, wantz, n, &a[a_offset], lda, &b[b_offset], @@ -1083,7 +1084,7 @@ f"> */ here, &c__1, &c__1, &work[1], lwork, info); if (*info != 0) { *ilst = here; - return 0; + return; } --here; } @@ -1095,7 +1096,7 @@ f"> */ } *ilst = here; work[1] = (real) lwmin; - return 0; + return; /* End of STGEXC */ diff --git a/lapack-netlib/SRC/stgsen.c b/lapack-netlib/SRC/stgsen.c index ae5c3ac286..f25029246a 100644 --- a/lapack-netlib/SRC/stgsen.c +++ b/lapack-netlib/SRC/stgsen.c @@ -964,7 +964,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int stgsen_(integer *ijob, logical *wantq, logical *wantz, +/* Subroutine */ void stgsen_(integer *ijob, logical *wantq, logical *wantz, logical *select, integer *n, real 
*a, integer *lda, real *b, integer * ldb, real *alphar, real *alphai, real *beta, real *q, integer *ldq, real *z__, integer *ldz, integer *m, real *pl, real *pr, real *dif, @@ -982,14 +982,14 @@ f"> */ integer ierr; real dsum; logical swap; - extern /* Subroutine */ int slag2_(real *, integer *, real *, integer *, + extern /* Subroutine */ void slag2_(real *, integer *, real *, integer *, real *, real *, real *, real *, real *, real *); integer i__, k, isave[3]; logical wantd; integer lwmin; logical wantp; integer n1, n2; - extern /* Subroutine */ int slacn2_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slacn2_(integer *, real *, real *, integer *, real *, integer *, integer *); logical wantd1, wantd2; integer kk; @@ -997,18 +997,19 @@ f"> */ integer ks; real rdscal; extern real slamch_(char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slacpy_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slacpy_( char *, integer *, integer *, real *, integer *, real *, integer * ), stgexc_(logical *, logical *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer * , integer *, integer *, real *, integer *, integer *); integer liwmin; - extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, + extern /* Subroutine */ void slassq_(integer *, real *, integer *, real *, real *); real smlnum; integer mn2; logical lquery; - extern /* Subroutine */ int stgsyl_(char *, integer *, integer *, integer + extern /* Subroutine */ void stgsyl_(char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, real * , integer *, real *, integer *, real *, integer *, real *, real *, real *, integer *, integer *, integer *); @@ -1069,7 +1070,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STGSEN", &i__1, (ftnlen)6); - return 0; + return; } /* Get machine constants */ @@ -1151,9 +1152,9 @@ f"> */ if (*info != 0) { i__1 = 
-(*info); xerbla_("STGSEN", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible. */ @@ -1448,7 +1449,7 @@ f"> */ work[1] = (real) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of STGSEN */ diff --git a/lapack-netlib/SRC/stgsja.c b/lapack-netlib/SRC/stgsja.c index aa7fcfe84f..ab455367c6 100644 --- a/lapack-netlib/SRC/stgsja.c +++ b/lapack-netlib/SRC/stgsja.c @@ -893,7 +893,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int stgsja_(char *jobu, char *jobv, char *jobq, integer *m, +/* Subroutine */ void stgsja_(char *jobu, char *jobv, char *jobq, integer *m, integer *p, integer *n, integer *k, integer *l, real *a, integer *lda, real *b, integer *ldb, real *tola, real *tolb, real *alpha, real * beta, real *u, integer *ldu, real *v, integer *ldv, real *q, integer * @@ -905,12 +905,12 @@ f"> */ real r__1; /* Local variables */ - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); integer kcallmycycle, i__, j; real gamma; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); real a1; logical initq; real a2, a3, b1; @@ -918,10 +918,11 @@ f"> */ real b2, b3; logical wantu, wantv; real error, ssmin; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), slags2_(logical *, real *, real *, real *, real *, - real *, real *, real *, real *, real *, real *, real *, real *), - xerbla_(char *, integer *, ftnlen), slapll_(integer *, real *, + real *, real *, real *, real *, real *, real *, real *, real *); + extern int xerbla_(char *, integer *, ftnlen); + extern 
void slapll_(integer *, real *, integer *, real *, integer *, real *), slartg_(real *, real *, real *, real *, real *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *); @@ -1000,7 +1001,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STGSJA", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize U, V and Q, if necessary */ @@ -1239,7 +1240,7 @@ f"> */ L100: *ncallmycycle = kcallmycycle; - return 0; + return; /* End of STGSJA */ diff --git a/lapack-netlib/SRC/stgsna.c b/lapack-netlib/SRC/stgsna.c index c23ac4fada..4d7c7505d5 100644 --- a/lapack-netlib/SRC/stgsna.c +++ b/lapack-netlib/SRC/stgsna.c @@ -897,7 +897,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int stgsna_(char *job, char *howmny, logical *select, +/* Subroutine */ void stgsna_(char *job, char *howmny, logical *select, integer *n, real *a, integer *lda, real *b, integer *ldb, real *vl, integer *ldvl, real *vr, integer *ldvr, real *s, real *dif, integer * mm, integer *m, real *work, integer *lwork, integer *iwork, integer * @@ -918,7 +918,7 @@ f"> */ extern real sdot_(integer *, real *, integer *, real *, integer *); integer ilst; real rnrm; - extern /* Subroutine */ int slag2_(real *, integer *, real *, integer *, + extern /* Subroutine */ void slag2_(real *, integer *, real *, integer *, real *, real *, real *, real *, real *, real *); extern real snrm2_(integer *, real *, integer *); real root1, root2; @@ -926,7 +926,7 @@ f"> */ real scale; extern logical lsame_(char *, char *); real uhavi, uhbvi; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); real tmpii, c1, c2; integer lwmin; @@ -943,7 +943,7 @@ f"> */ extern real slamch_(char *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical wantbh, 
wantdf; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), stgexc_(logical *, logical *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *, integer *, integer *, real *, @@ -951,7 +951,7 @@ f"> */ logical somcon; real alprqt, smlnum; logical lquery; - extern /* Subroutine */ int stgsyl_(char *, integer *, integer *, integer + extern /* Subroutine */ void stgsyl_(char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, real * , integer *, real *, integer *, real *, integer *, real *, real *, real *, integer *, integer *, integer *); @@ -1068,15 +1068,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STGSNA", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1286,7 +1286,7 @@ f"> */ ; } work[1] = (real) lwmin; - return 0; + return; /* End of STGSNA */ diff --git a/lapack-netlib/SRC/stgsy2.c b/lapack-netlib/SRC/stgsy2.c index f37fdb95b3..92213c44bd 100644 --- a/lapack-netlib/SRC/stgsy2.c +++ b/lapack-netlib/SRC/stgsy2.c @@ -789,7 +789,7 @@ f"> */ /* > Umea University, S-901 87 Umea, Sweden. 
*/ /* ===================================================================== */ -/* Subroutine */ int stgsy2_(char *trans, integer *ijob, integer *m, integer * +/* Subroutine */ void stgsy2_(char *trans, integer *ijob, integer *m, integer * n, real *a, integer *lda, real *b, integer *ldb, real *c__, integer * ldc, real *d__, integer *ldd, real *e, integer *lde, real *f, integer *ldf, real *scale, real *rdsum, real *rdscal, integer *iwork, integer @@ -800,12 +800,12 @@ f"> */ d_offset, e_dim1, e_offset, f_dim1, f_offset, i__1, i__2, i__3; /* Local variables */ - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); integer ierr, zdim, ipiv[8], jpiv[8], i__, j, k, p, q; real alpha, z__[64] /* was [8][8] */; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *), @@ -815,9 +815,10 @@ f"> */ integer *, integer *); integer ie, je, mb, nb, ii, jj, is, js; real scaloc; - extern /* Subroutine */ int slatdf_(integer *, integer *, real *, integer - *, real *, real *, real *, integer *, integer *), xerbla_(char *, - integer *, ftnlen), slaset_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slatdf_(integer *, integer *, real *, integer + *, real *, real *, real *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void slaset_(char *, integer *, integer *, real *, real *, real *, integer *); logical notran; real rhs[8]; @@ -891,7 +892,7 @@ f"> */ if (*info != 
0) { i__1 = -(*info); xerbla_("STGSY2", &i__1, (ftnlen)6); - return 0; + return; } /* Determine block structure of A */ @@ -1695,7 +1696,7 @@ f"> */ } } - return 0; + return; /* End of STGSY2 */ diff --git a/lapack-netlib/SRC/stgsyl.c b/lapack-netlib/SRC/stgsyl.c index 703a20065f..ae065a349c 100644 --- a/lapack-netlib/SRC/stgsyl.c +++ b/lapack-netlib/SRC/stgsyl.c @@ -816,7 +816,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int stgsyl_(char *trans, integer *ijob, integer *m, integer * +/* Subroutine */ void stgsyl_(char *trans, integer *ijob, integer *m, integer * n, real *a, integer *lda, real *b, integer *ldb, real *c__, integer * ldc, real *d__, integer *ldd, real *e, integer *lde, real *f, integer *ldf, real *scale, real *dif, real *work, integer *lwork, integer * @@ -832,9 +832,9 @@ f"> */ integer ppqq, i__, j, k, p, q; extern logical lsame_(char *, char *); integer ifunc; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer linfo; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer lwmin; @@ -842,7 +842,7 @@ f"> */ integer ie, je, mb, nb; real dscale; integer is, js; - extern /* Subroutine */ int stgsy2_(char *, integer *, integer *, integer + extern /* Subroutine */ void stgsy2_(char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, real * , integer *, real *, integer *, real *, integer *, real *, real *, real *, integer *, integer *, integer *); @@ -851,7 +851,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, 
ftnlen); - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *); integer iround; @@ -949,9 +949,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STGSYL", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -963,7 +963,7 @@ f"> */ *dif = 0.f; } } - return 0; + return; } /* Determine optimal block sizes MB and NB */ @@ -1026,7 +1026,7 @@ f"> */ /* L30: */ } - return 0; + return; } /* Determine block structure of A */ @@ -1293,7 +1293,7 @@ f"> */ work[1] = (real) lwmin; - return 0; + return; /* End of STGSYL */ diff --git a/lapack-netlib/SRC/stpcon.c b/lapack-netlib/SRC/stpcon.c index 99b3a63531..8812ca8189 100644 --- a/lapack-netlib/SRC/stpcon.c +++ b/lapack-netlib/SRC/stpcon.c @@ -642,7 +642,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int stpcon_(char *norm, char *uplo, char *diag, integer *n, +/* Subroutine */ void stpcon_(char *norm, char *uplo, char *diag, integer *n, real *ap, real *rcond, real *work, integer *iwork, integer *info) { /* System generated locals */ @@ -655,10 +655,10 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; real anorm; - extern /* Subroutine */ int srscl_(integer *, real *, real *, integer *); + extern /* Subroutine */ void srscl_(integer *, real *, real *, integer *); logical upper; real xnorm; - extern /* Subroutine */ int slacn2_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slacn2_(integer *, real *, real *, integer *, real *, integer *, integer *); integer ix; extern real slamch_(char *); @@ -668,7 +668,7 @@ f"> */ logical onenrm; extern real slantp_(char *, char *, char *, integer *, real *, real *); char normin[1]; - 
extern /* Subroutine */ int slatps_(char *, char *, char *, char *, + extern /* Subroutine */ void slatps_(char *, char *, char *, char *, integer *, real *, real *, real *, real *, integer *); real smlnum; logical nounit; @@ -708,14 +708,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STPCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *rcond = 1.f; - return 0; + return; } *rcond = 0.f; @@ -778,7 +778,7 @@ f"> */ } L20: - return 0; + return; /* End of STPCON */ diff --git a/lapack-netlib/SRC/stplqt.c b/lapack-netlib/SRC/stplqt.c index 325cb625c1..a52e405352 100644 --- a/lapack-netlib/SRC/stplqt.c +++ b/lapack-netlib/SRC/stplqt.c @@ -697,7 +697,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int stplqt_(integer *m, integer *n, integer *l, integer *mb, +/* Subroutine */ void stplqt_(integer *m, integer *n, integer *l, integer *mb, real *a, integer *lda, real *b, integer *ldb, real *t, integer *ldt, real *work, integer *info) { @@ -707,7 +707,8 @@ f"> */ /* Local variables */ integer i__, iinfo, ib, lb, nb; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), stprfb_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void stprfb_( char *, char *, char *, char *, integer *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer * , real *, integer *, real *, integer *), stplqt2_(integer *, integer *, integer *, real *, @@ -757,13 +758,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STPLQT", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } i__1 = *m; @@ -797,7 +798,7 @@ f"> */ a_dim1], lda, &b[i__ + ib + b_dim1], ldb, &work[1], &i__4); } } - return 0; + return; /* End of STPLQT */ diff --git a/lapack-netlib/SRC/stplqt2.c b/lapack-netlib/SRC/stplqt2.c index 9dc09583dc..99f2cdc978 
100644 --- a/lapack-netlib/SRC/stplqt2.c +++ b/lapack-netlib/SRC/stplqt2.c @@ -693,7 +693,7 @@ is composed of a triangular block and a pentagonal block, using the compact WY r /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int stplqt2_(integer *m, integer *n, integer *l, real *a, +/* Subroutine */ void stplqt2_(integer *m, integer *n, integer *l, real *a, integer *lda, real *b, integer *ldb, real *t, integer *ldt, integer * info) { @@ -702,15 +702,16 @@ is composed of a triangular block and a pentagonal block, using the compact WY r i__3; /* Local variables */ - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); integer i__, j, p; real alpha; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), strmv_(char *, char *, char *, integer *, real *, integer *, real *, integer *); integer mp, np; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slarfg_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slarfg_( integer *, real *, real *, integer *, real *); @@ -754,13 +755,13 @@ is composed of a triangular block and a pentagonal block, using the compact WY r if (*info != 0) { i__1 = -(*info); xerbla_("STPLQT2", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *m == 0) { - return 0; + return; } i__1 = *m; @@ -860,6 +861,6 @@ is composed of a triangular block and a pentagonal block, using the compact WY r /* End of STPLQT2 */ - return 0; + return; } /* stplqt2_ */ diff --git a/lapack-netlib/SRC/stpmlqt.c b/lapack-netlib/SRC/stpmlqt.c index 4018fd540f..73868f9b75 100644 --- a/lapack-netlib/SRC/stpmlqt.c +++ 
b/lapack-netlib/SRC/stpmlqt.c @@ -724,7 +724,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int stpmlqt_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void stpmlqt_(char *side, char *trans, integer *m, integer *n, integer *k, integer *l, integer *mb, real *v, integer *ldv, real *t, integer *ldt, real *a, integer *lda, real *b, integer *ldb, real * work, integer *info) @@ -740,7 +740,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern logical lsame_(char *, char *); logical right; integer ib, lb, nb, kf; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), stprfb_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void stprfb_( char *, char *, char *, char *, integer *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer * , real *, integer *, real *, integer *); @@ -811,12 +812,12 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("STPMLQT", &i__1, (ftnlen)7); - return 0; + return; } if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && notran) { @@ -905,7 +906,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of STPMLQT */ diff --git a/lapack-netlib/SRC/stpmqrt.c b/lapack-netlib/SRC/stpmqrt.c index 5dbe80df19..a30bfd4208 100644 --- a/lapack-netlib/SRC/stpmqrt.c +++ b/lapack-netlib/SRC/stpmqrt.c @@ -724,7 +724,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int stpmqrt_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void stpmqrt_(char 
*side, char *trans, integer *m, integer *n, integer *k, integer *l, integer *nb, real *v, integer *ldv, real *t, integer *ldt, real *a, integer *lda, real *b, integer *ldb, real * work, integer *info) @@ -740,7 +740,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern logical lsame_(char *, char *); logical right; integer ib, lb, mb, kf; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), stprfb_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void stprfb_( char *, char *, char *, char *, integer *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer * , real *, integer *, real *, integer *); @@ -813,12 +814,12 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("STPMQRT", &i__1, (ftnlen)7); - return 0; + return; } if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && tran) { @@ -907,7 +908,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of STPMQRT */ diff --git a/lapack-netlib/SRC/stpqrt.c b/lapack-netlib/SRC/stpqrt.c index eb84444063..d5b3837b25 100644 --- a/lapack-netlib/SRC/stpqrt.c +++ b/lapack-netlib/SRC/stpqrt.c @@ -697,7 +697,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int stpqrt_(integer *m, integer *n, integer *l, integer *nb, +/* Subroutine */ void stpqrt_(integer *m, integer *n, integer *l, integer *nb, real *a, integer *lda, real *b, integer *ldb, real *t, integer *ldt, real *work, integer *info) { @@ -707,7 +707,8 @@ f"> */ /* Local variables */ integer i__, iinfo, ib, lb, mb; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), stprfb_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void stprfb_( char *, char *, char *, char *, integer *, integer 
*, integer *, integer *, real *, integer *, real *, integer *, real *, integer * , real *, integer *, real *, integer *), stpqrt2_(integer *, integer *, integer *, real *, @@ -757,13 +758,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STPQRT", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } i__1 = *n; @@ -797,7 +798,7 @@ f"> */ , &ib); } } - return 0; + return; /* End of STPQRT */ diff --git a/lapack-netlib/SRC/stpqrt2.c b/lapack-netlib/SRC/stpqrt2.c index 7b67256047..df9fd59bf8 100644 --- a/lapack-netlib/SRC/stpqrt2.c +++ b/lapack-netlib/SRC/stpqrt2.c @@ -690,7 +690,7 @@ is composed of a triangular block and a pentagonal block, using the compact WY r /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int stpqrt2_(integer *m, integer *n, integer *l, real *a, +/* Subroutine */ void stpqrt2_(integer *m, integer *n, integer *l, real *a, integer *lda, real *b, integer *ldb, real *t, integer *ldt, integer * info) { @@ -699,15 +699,16 @@ is composed of a triangular block and a pentagonal block, using the compact WY r i__3; /* Local variables */ - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); integer i__, j, p; real alpha; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), strmv_(char *, char *, char *, integer *, real *, integer *, real *, integer *); integer mp, np; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slarfg_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slarfg_( integer *, real *, real *, integer *, real *); @@ -751,13 +752,13 @@ is 
composed of a triangular block and a pentagonal block, using the compact WY r if (*info != 0) { i__1 = -(*info); xerbla_("STPQRT2", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *m == 0) { - return 0; + return; } i__1 = *n; @@ -851,6 +852,6 @@ is composed of a triangular block and a pentagonal block, using the compact WY r /* End of STPQRT2 */ - return 0; + return; } /* stpqrt2_ */ diff --git a/lapack-netlib/SRC/stprfb.c b/lapack-netlib/SRC/stprfb.c index 231d592237..1de2583b44 100644 --- a/lapack-netlib/SRC/stprfb.c +++ b/lapack-netlib/SRC/stprfb.c @@ -766,7 +766,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int stprfb_(char *side, char *trans, char *direct, char * +/* Subroutine */ void stprfb_(char *side, char *trans, char *direct, char * storev, integer *m, integer *n, integer *k, integer *l, real *v, integer *ldv, real *t, integer *ldt, real *a, integer *lda, real *b, integer *ldb, real *work, integer *ldwork) @@ -779,11 +779,11 @@ f"> */ logical left, backward; integer i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); logical right; - extern /* Subroutine */ int strmm_(char *, char *, char *, char *, + extern /* Subroutine */ void strmm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ); integer kp, mp, np; @@ -820,7 +820,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0 || *k <= 0 || *l < 0) { - return 0; + return; } if (lsame_(storev, "C")) { @@ -1471,7 +1471,7 @@ f"> */ } - return 0; + return; /* End of STPRFB */ diff --git a/lapack-netlib/SRC/stprfb.f b/lapack-netlib/SRC/stprfb.f index 64e8b34f5e..d91a80dfbb 100644 --- 
a/lapack-netlib/SRC/stprfb.f +++ b/lapack-netlib/SRC/stprfb.f @@ -1,4 +1,4 @@ -*> \brief \b STPRFB applies a real or complex "triangular-pentagonal" blocked reflector to a real or complex matrix, which is composed of two blocks. +*> \brief \b STPRFB applies a real "triangular-pentagonal" block reflector to a real matrix, which is composed of two blocks. * * =========== DOCUMENTATION =========== * @@ -37,7 +37,7 @@ *> \verbatim *> *> STPRFB applies a real "triangular-pentagonal" block reflector H or its -*> conjugate transpose H^H to a real matrix C, which is composed of two +*> transpose H**T to a real matrix C, which is composed of two *> blocks A and B, either from the left or right. *> *> \endverbatim @@ -48,15 +48,15 @@ *> \param[in] SIDE *> \verbatim *> SIDE is CHARACTER*1 -*> = 'L': apply H or H^H from the Left -*> = 'R': apply H or H^H from the Right +*> = 'L': apply H or H**T from the Left +*> = 'R': apply H or H**T from the Right *> \endverbatim *> *> \param[in] TRANS *> \verbatim *> TRANS is CHARACTER*1 *> = 'N': apply H (No transpose) -*> = 'C': apply H^H (Conjugate transpose) +*> = 'T': apply H**T (Transpose) *> \endverbatim *> *> \param[in] DIRECT @@ -145,7 +145,7 @@ *> (LDA,N) if SIDE = 'L' or (LDA,K) if SIDE = 'R' *> On entry, the K-by-N or M-by-K matrix A. *> On exit, A is overwritten by the corresponding block of -*> H*C or H^H*C or C*H or C*H^H. See Further Details. +*> H*C or H**T*C or C*H or C*H**T. See Further Details. *> \endverbatim *> *> \param[in] LDA @@ -161,7 +161,7 @@ *> B is REAL array, dimension (LDB,N) *> On entry, the M-by-N matrix B. *> On exit, B is overwritten by the corresponding block of -*> H*C or H^H*C or C*H or C*H^H. See Further Details. +*> H*C or H**T*C or C*H or C*H**T. See Further Details. 
*> \endverbatim *> *> \param[in] LDB @@ -327,13 +327,13 @@ SUBROUTINE STPRFB( SIDE, TRANS, DIRECT, STOREV, M, N, K, L, * Let W = [ I ] (K-by-K) * [ V ] (M-by-K) * -* Form H C or H^H C where C = [ A ] (K-by-N) -* [ B ] (M-by-N) +* Form H C or H**T C where C = [ A ] (K-by-N) +* [ B ] (M-by-N) * -* H = I - W T W^H or H^H = I - W T^H W^H +* H = I - W T W**T or H**T = I - W T**T W**T * -* A = A - T (A + V^H B) or A = A - T^H (A + V^H B) -* B = B - V T (A + V^H B) or B = B - V T^H (A + V^H B) +* A = A - T (A + V**T B) or A = A - T**T (A + V**T B) +* B = B - V T (A + V**T B) or B = B - V T**T (A + V**T B) * * --------------------------------------------------------------------------- * @@ -388,12 +388,12 @@ SUBROUTINE STPRFB( SIDE, TRANS, DIRECT, STOREV, M, N, K, L, * Let W = [ I ] (K-by-K) * [ V ] (N-by-K) * -* Form C H or C H^H where C = [ A B ] (A is M-by-K, B is M-by-N) +* Form C H or C H**T where C = [ A B ] (A is M-by-K, B is M-by-N) * -* H = I - W T W^H or H^H = I - W T^H W^H +* H = I - W T W**T or H**T = I - W T**T W**T * -* A = A - (A + B V) T or A = A - (A + B V) T^H -* B = B - (A + B V) T V^H or B = B - (A + B V) T^H V^H +* A = A - (A + B V) T or A = A - (A + B V) T**T +* B = B - (A + B V) T V**T or B = B - (A + B V) T**T V**T * * --------------------------------------------------------------------------- * @@ -448,13 +448,13 @@ SUBROUTINE STPRFB( SIDE, TRANS, DIRECT, STOREV, M, N, K, L, * Let W = [ V ] (M-by-K) * [ I ] (K-by-K) * -* Form H C or H^H C where C = [ B ] (M-by-N) -* [ A ] (K-by-N) +* Form H C or H**T C where C = [ B ] (M-by-N) +* [ A ] (K-by-N) * -* H = I - W T W^H or H^H = I - W T^H W^H +* H = I - W T W**T or H**T = I - W T**T W**T * -* A = A - T (A + V^H B) or A = A - T^H (A + V^H B) -* B = B - V T (A + V^H B) or B = B - V T^H (A + V^H B) +* A = A - T (A + V**T B) or A = A - T**T (A + V**T B) +* B = B - V T (A + V**T B) or B = B - V T**T (A + V**T B) * * --------------------------------------------------------------------------- * @@ -510,12 
+510,12 @@ SUBROUTINE STPRFB( SIDE, TRANS, DIRECT, STOREV, M, N, K, L, * Let W = [ V ] (N-by-K) * [ I ] (K-by-K) * -* Form C H or C H^H where C = [ B A ] (B is M-by-N, A is M-by-K) +* Form C H or C H**T where C = [ B A ] (B is M-by-N, A is M-by-K) * -* H = I - W T W^H or H^H = I - W T^H W^H +* H = I - W T W**T or H**T = I - W T**T W**T * -* A = A - (A + B V) T or A = A - (A + B V) T^H -* B = B - (A + B V) T V^H or B = B - (A + B V) T^H V^H +* A = A - (A + B V) T or A = A - (A + B V) T**T +* B = B - (A + B V) T V**T or B = B - (A + B V) T**T V**T * * --------------------------------------------------------------------------- * @@ -569,13 +569,13 @@ SUBROUTINE STPRFB( SIDE, TRANS, DIRECT, STOREV, M, N, K, L, * * Let W = [ I V ] ( I is K-by-K, V is K-by-M ) * -* Form H C or H^H C where C = [ A ] (K-by-N) -* [ B ] (M-by-N) +* Form H C or H**T C where C = [ A ] (K-by-N) +* [ B ] (M-by-N) * -* H = I - W^H T W or H^H = I - W^H T^H W +* H = I - W**T T W or H**T = I - W**T T**T W * -* A = A - T (A + V B) or A = A - T^H (A + V B) -* B = B - V^H T (A + V B) or B = B - V^H T^H (A + V B) +* A = A - T (A + V B) or A = A - T**T (A + V B) +* B = B - V**T T (A + V B) or B = B - V**T T**T (A + V B) * * --------------------------------------------------------------------------- * @@ -629,12 +629,12 @@ SUBROUTINE STPRFB( SIDE, TRANS, DIRECT, STOREV, M, N, K, L, * * Let W = [ I V ] ( I is K-by-K, V is K-by-N ) * -* Form C H or C H^H where C = [ A B ] (A is M-by-K, B is M-by-N) +* Form C H or C H**T where C = [ A B ] (A is M-by-K, B is M-by-N) * -* H = I - W^H T W or H^H = I - W^H T^H W +* H = I - W**T T W or H**T = I - W**T T**T W * -* A = A - (A + B V^H) T or A = A - (A + B V^H) T^H -* B = B - (A + B V^H) T V or B = B - (A + B V^H) T^H V +* A = A - (A + B V**T) T or A = A - (A + B V**T) T**T +* B = B - (A + B V**T) T V or B = B - (A + B V**T) T**T V * * --------------------------------------------------------------------------- * @@ -688,13 +688,13 @@ SUBROUTINE STPRFB( SIDE, TRANS, 
DIRECT, STOREV, M, N, K, L, * * Let W = [ V I ] ( I is K-by-K, V is K-by-M ) * -* Form H C or H^H C where C = [ B ] (M-by-N) -* [ A ] (K-by-N) +* Form H C or H**T C where C = [ B ] (M-by-N) +* [ A ] (K-by-N) * -* H = I - W^H T W or H^H = I - W^H T^H W +* H = I - W**T T W or H**T = I - W**T T**T W * -* A = A - T (A + V B) or A = A - T^H (A + V B) -* B = B - V^H T (A + V B) or B = B - V^H T^H (A + V B) +* A = A - T (A + V B) or A = A - T**T (A + V B) +* B = B - V**T T (A + V B) or B = B - V**T T**T (A + V B) * * --------------------------------------------------------------------------- * @@ -748,12 +748,12 @@ SUBROUTINE STPRFB( SIDE, TRANS, DIRECT, STOREV, M, N, K, L, * * Let W = [ V I ] ( I is K-by-K, V is K-by-N ) * -* Form C H or C H^H where C = [ B A ] (A is M-by-K, B is M-by-N) +* Form C H or C H**T where C = [ B A ] (A is M-by-K, B is M-by-N) * -* H = I - W^H T W or H^H = I - W^H T^H W +* H = I - W**T T W or H**T = I - W**T T**T W * -* A = A - (A + B V^H) T or A = A - (A + B V^H) T^H -* B = B - (A + B V^H) T V or B = B - (A + B V^H) T^H V +* A = A - (A + B V**T) T or A = A - (A + B V**T) T**T +* B = B - (A + B V**T) T V or B = B - (A + B V**T) T**T V * * --------------------------------------------------------------------------- * diff --git a/lapack-netlib/SRC/stprfs.c b/lapack-netlib/SRC/stprfs.c index 774f006e28..5a978aa985 100644 --- a/lapack-netlib/SRC/stprfs.c +++ b/lapack-netlib/SRC/stprfs.c @@ -688,7 +688,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int stprfs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void stprfs_(char *uplo, char *trans, char *diag, integer *n, integer *nrhs, real *ap, real *b, integer *ldb, real *x, integer *ldx, real *ferr, real *berr, real *work, integer *iwork, integer *info) { @@ -704,7 +704,7 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int 
scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *), stpmv_(char *, char *, char *, integer *, real *, real *, integer *), stpsv_(char *, char *, @@ -771,7 +771,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STPRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -783,7 +783,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } if (notran) { @@ -1058,7 +1058,7 @@ f"> */ /* L250: */ } - return 0; + return; /* End of STPRFS */ diff --git a/lapack-netlib/SRC/stptri.c b/lapack-netlib/SRC/stptri.c index 1d53469cb9..776eecd93a 100644 --- a/lapack-netlib/SRC/stptri.c +++ b/lapack-netlib/SRC/stptri.c @@ -630,7 +630,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int stptri_(char *uplo, char *diag, integer *n, real *ap, +/* Subroutine */ void stptri_(char *uplo, char *diag, integer *n, real *ap, integer *info) { /* System generated locals */ @@ -639,9 +639,9 @@ f"> */ /* Local variables */ integer j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); logical upper; - extern /* Subroutine */ int stpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void stpmv_(char *, char *, char *, integer *, real *, real *, integer *); integer jc, jj; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -678,7 +678,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STPTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Check for singularity if non-unit. 
*/ @@ -690,7 +690,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { jj += *info; if (ap[jj] == 0.f) { - return 0; + return; } /* L10: */ } @@ -699,7 +699,7 @@ f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (ap[jj] == 0.f) { - return 0; + return; } jj = jj + *n - *info + 1; /* L20: */ @@ -761,7 +761,7 @@ f"> */ } } - return 0; + return; /* End of STPTRI */ diff --git a/lapack-netlib/SRC/stptrs.c b/lapack-netlib/SRC/stptrs.c index 8143010805..2f9514dfa0 100644 --- a/lapack-netlib/SRC/stptrs.c +++ b/lapack-netlib/SRC/stptrs.c @@ -643,7 +643,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int stptrs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void stptrs_(char *uplo, char *trans, char *diag, integer *n, integer *nrhs, real *ap, real *b, integer *ldb, integer *info) { /* System generated locals */ @@ -653,7 +653,7 @@ f"> */ integer j; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int stpsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void stpsv_(char *, char *, char *, integer *, real *, real *, integer *); integer jc; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -698,13 +698,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STPTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check for singularity. 
*/ @@ -715,7 +715,7 @@ f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (ap[jc + *info - 1] == 0.f) { - return 0; + return; } jc += *info; /* L10: */ @@ -725,7 +725,7 @@ f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (ap[jc] == 0.f) { - return 0; + return; } jc = jc + *n - *info + 1; /* L20: */ @@ -742,7 +742,7 @@ f"> */ /* L30: */ } - return 0; + return; /* End of STPTRS */ diff --git a/lapack-netlib/SRC/stpttf.c b/lapack-netlib/SRC/stpttf.c index c7c32fd688..9ea179e03a 100644 --- a/lapack-netlib/SRC/stpttf.c +++ b/lapack-netlib/SRC/stpttf.c @@ -697,7 +697,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int stpttf_(char *transr, char *uplo, integer *n, real *ap, +/* Subroutine */ void stpttf_(char *transr, char *uplo, integer *n, real *ap, real *arf, integer *info) { /* System generated locals */ @@ -738,13 +738,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STPTTF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -753,7 +753,7 @@ f"> */ } else { arf[0] = ap[0]; } - return 0; + return; } /* Size of array ARF(0:NT-1) */ @@ -1041,7 +1041,7 @@ f"> */ } - return 0; + return; /* End of STPTTF */ diff --git a/lapack-netlib/SRC/stpttr.c b/lapack-netlib/SRC/stpttr.c index b07698b10e..2ea5beb8e2 100644 --- a/lapack-netlib/SRC/stpttr.c +++ b/lapack-netlib/SRC/stpttr.c @@ -614,7 +614,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int stpttr_(char *uplo, integer *n, real *ap, real *a, +/* Subroutine */ void stpttr_(char *uplo, integer *n, real *ap, real *a, integer *lda, integer *info) { /* System generated locals */ @@ -657,7 +657,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STPTTR", &i__1, (ftnlen)6); - return 0; + return; } if (lower) { @@ -683,7 +683,7 
@@ f"> */ } - return 0; + return; /* End of STPTTR */ diff --git a/lapack-netlib/SRC/strcon.c b/lapack-netlib/SRC/strcon.c index 880a51be5c..a81e5bca67 100644 --- a/lapack-netlib/SRC/strcon.c +++ b/lapack-netlib/SRC/strcon.c @@ -649,7 +649,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int strcon_(char *norm, char *uplo, char *diag, integer *n, +/* Subroutine */ void strcon_(char *norm, char *uplo, char *diag, integer *n, real *a, integer *lda, real *rcond, real *work, integer *iwork, integer *info) { @@ -663,10 +663,10 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; real anorm; - extern /* Subroutine */ int srscl_(integer *, real *, real *, integer *); + extern /* Subroutine */ void srscl_(integer *, real *, real *, integer *); logical upper; real xnorm; - extern /* Subroutine */ int slacn2_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slacn2_(integer *, real *, real *, integer *, real *, integer *, integer *); integer ix; extern real slamch_(char *); @@ -677,7 +677,7 @@ f"> */ char normin[1]; extern real slantr_(char *, char *, char *, integer *, integer *, real *, integer *, real *); - extern /* Subroutine */ int slatrs_(char *, char *, char *, char *, + extern /* Subroutine */ void slatrs_(char *, char *, char *, char *, integer *, real *, integer *, real *, real *, real *, integer *); real smlnum; logical nounit; @@ -721,14 +721,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STRCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *rcond = 1.f; - return 0; + return; } *rcond = 0.f; @@ -791,7 +791,7 @@ f"> */ } L20: - return 0; + return; /* End of STRCON */ diff --git a/lapack-netlib/SRC/strevc.c b/lapack-netlib/SRC/strevc.c index cf1afd1142..2dae7d8f59 100644 --- a/lapack-netlib/SRC/strevc.c +++ b/lapack-netlib/SRC/strevc.c @@ -739,7 +739,7 @@ f"> */ /* > \endverbatim */ 
/* > */ /* ===================================================================== */ -/* Subroutine */ int strevc_(char *side, char *howmny, logical *select, +/* Subroutine */ void strevc_(char *side, char *howmny, logical *select, integer *n, real *t, integer *ldt, real *vl, integer *ldvl, real *vr, integer *ldvr, integer *mm, integer *m, real *work, integer *info) { @@ -759,25 +759,25 @@ f"> */ integer jnxt, i__, j, k; real scale, x[4] /* was [2][2] */; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); real remax; logical leftv; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); logical bothv; real vcrit; logical somev; integer j1, j2; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); integer n2; real xnorm; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + extern /* Subroutine */ void saxpy_(integer *, real *, real *, integer *, real *, integer *), slaln2_(logical *, integer *, integer *, real *, real *, real *, integer *, real *, real *, real *, integer *, real *, real *, real *, integer *, real *, real *, integer *); integer ii, ki; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); integer ip, is; real wi; extern real slamch_(char *); @@ -881,13 +881,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STREVC", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ if (*n == 0) { - return 0; + return; } /* Set the constants to control overflow. 
*/ @@ -1790,7 +1790,7 @@ f"> */ } - return 0; + return; /* End of STREVC */ diff --git a/lapack-netlib/SRC/strevc3.c b/lapack-netlib/SRC/strevc3.c index f623b84f51..5c6b3770a8 100644 --- a/lapack-netlib/SRC/strevc3.c +++ b/lapack-netlib/SRC/strevc3.c @@ -757,7 +757,7 @@ static logical c_true = TRUE_; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int strevc3_(char *side, char *howmny, logical *select, +/* Subroutine */ void strevc3_(char *side, char *howmny, logical *select, integer *n, real *t, integer *ldt, real *vl, integer *ldvl, real *vr, integer *ldvr, integer *mm, integer *m, real *work, integer *lwork, integer *info) @@ -780,28 +780,28 @@ static logical c_true = TRUE_; integer jnxt, i__, j, k; real scale, x[4] /* was [2][2] */; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); real remax; logical leftv; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); logical bothv; real vcrit; logical somev; integer j1, j2; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *); real xnorm; - extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, + extern /* Subroutine */ void saxpy_(integer *, real *, real *, integer *, real *, integer *); integer iscomplex[128]; - extern /* Subroutine */ int slaln2_(logical *, integer *, integer *, real + extern /* Subroutine */ void slaln2_(logical *, integer *, integer *, real *, real *, real *, integer *, 
real *, real *, real *, integer *, real *, real *, real *, integer *, real *, real *, integer *); integer nb, ii, ki; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); integer ip, is, iv; real wi; extern real slamch_(char *); @@ -811,7 +811,7 @@ static logical c_true = TRUE_; integer *, integer *, ftnlen, ftnlen); real bignum; extern integer isamax_(integer *, real *, integer *); - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *), slaset_(char *, integer *, integer *, real *, real *, real *, integer *); logical rightv; @@ -928,15 +928,15 @@ static logical c_true = TRUE_; if (*info != 0) { i__2 = -(*info); xerbla_("STREVC3", &i__2, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible. */ if (*n == 0) { - return 0; + return; } /* Use blocked version of back-transformation if sufficient workspace. 
*/ @@ -2049,7 +2049,7 @@ static logical c_true = TRUE_; } } - return 0; + return; /* End of STREVC3 */ diff --git a/lapack-netlib/SRC/strexc.c b/lapack-netlib/SRC/strexc.c index 45f376b6b1..978ff19696 100644 --- a/lapack-netlib/SRC/strexc.c +++ b/lapack-netlib/SRC/strexc.c @@ -661,7 +661,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int strexc_(char *compq, integer *n, real *t, integer *ldt, +/* Subroutine */ void strexc_(char *compq, integer *n, real *t, integer *ldt, real *q, integer *ldq, integer *ifst, integer *ilst, real *work, integer *info) { @@ -672,7 +672,8 @@ f"> */ integer here; extern logical lsame_(char *, char *); logical wantq; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), slaexc_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slaexc_( logical *, integer *, real *, integer *, real *, integer *, integer *, integer *, integer *, real *, integer *); integer nbnext, nbf, nbl; @@ -717,13 +718,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STREXC", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 1) { - return 0; + return; } /* Determine the first row of specified block */ @@ -757,7 +758,7 @@ f"> */ } if (*ifst == *ilst) { - return 0; + return; } if (*ifst < *ilst) { @@ -791,7 +792,7 @@ f"> */ nbf, &nbnext, &work[1], info); if (*info != 0) { *ilst = here; - return 0; + return; } here += nbnext; @@ -819,7 +820,7 @@ f"> */ c__1, &nbnext, &work[1], info); if (*info != 0) { *ilst = here; - return 0; + return; } if (nbnext == 1) { @@ -843,7 +844,7 @@ f"> */ here, &c__1, &nbnext, &work[1], info); if (*info != 0) { *ilst = here; - return 0; + return; } here += 2; } else { @@ -885,7 +886,7 @@ f"> */ nbnext, &nbf, &work[1], info); if (*info != 0) { *ilst = here; - return 0; + return; } here -= nbnext; @@ -913,7 +914,7 @@ f"> */ nbnext, &c__1, &work[1], info); if (*info 
!= 0) { *ilst = here; - return 0; + return; } if (nbnext == 1) { @@ -938,7 +939,7 @@ f"> */ i__1, &c__2, &c__1, &work[1], info); if (*info != 0) { *ilst = here; - return 0; + return; } here += -2; } else { @@ -960,7 +961,7 @@ f"> */ } *ilst = here; - return 0; + return; /* End of STREXC */ diff --git a/lapack-netlib/SRC/strrfs.c b/lapack-netlib/SRC/strrfs.c index 32a61be0cd..5a7d0dd5d0 100644 --- a/lapack-netlib/SRC/strrfs.c +++ b/lapack-netlib/SRC/strrfs.c @@ -695,7 +695,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int strrfs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void strrfs_(char *uplo, char *trans, char *diag, integer *n, integer *nrhs, real *a, integer *lda, real *b, integer *ldb, real *x, integer *ldx, real *ferr, real *berr, real *work, integer *iwork, integer *info) @@ -713,7 +713,7 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), saxpy_(integer *, real *, real *, integer *, real *, integer *), strmv_(char *, char *, char *, integer *, real *, integer *, real *, integer *), strsv_( @@ -784,7 +784,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STRRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -796,7 +796,7 @@ f"> */ berr[j] = 0.f; /* L10: */ } - return 0; + return; } if (notran) { @@ -1061,7 +1061,7 @@ f"> */ /* L250: */ } - return 0; + return; /* End of STRRFS */ diff --git a/lapack-netlib/SRC/strsen.c b/lapack-netlib/SRC/strsen.c index 2b3a43d2ba..2becc7318d 100644 --- a/lapack-netlib/SRC/strsen.c +++ b/lapack-netlib/SRC/strsen.c @@ -826,7 +826,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int 
strsen_(char *job, char *compq, logical *select, integer +/* Subroutine */ void strsen_(char *job, char *compq, logical *select, integer *n, real *t, integer *ldt, real *q, integer *ldq, real *wr, real *wi, integer *m, real *s, real *sep, real *work, integer *lwork, integer * iwork, integer *liwork, integer *info) @@ -847,20 +847,20 @@ f"> */ logical wantq, wants; real rnorm; integer n1, n2; - extern /* Subroutine */ int slacn2_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slacn2_(integer *, real *, real *, integer *, real *, integer *, integer *); integer kk, nn, ks; extern real slange_(char *, integer *, integer *, real *, integer *, real *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical wantbh; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); integer liwmin; - extern /* Subroutine */ int strexc_(char *, integer *, real *, integer *, + extern /* Subroutine */ void strexc_(char *, integer *, real *, integer *, real *, integer *, integer *, integer *, real *, integer *); logical wantsp, lquery; - extern /* Subroutine */ int strsyl_(char *, char *, integer *, integer *, + extern /* Subroutine */ void strsyl_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer * , real *, integer *); real est; @@ -972,9 +972,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STRSEN", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible. 
*/ @@ -1111,7 +1111,7 @@ f"> */ work[1] = (real) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of STRSEN */ diff --git a/lapack-netlib/SRC/strsna.c b/lapack-netlib/SRC/strsna.c index 61edf044c2..a629a69c36 100644 --- a/lapack-netlib/SRC/strsna.c +++ b/lapack-netlib/SRC/strsna.c @@ -778,7 +778,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int strsna_(char *job, char *howmny, logical *select, +/* Subroutine */ void strsna_(char *job, char *howmny, logical *select, integer *n, real *t, integer *ldt, real *vl, integer *ldvl, real *vr, integer *ldvr, real *s, real *sep, integer *mm, integer *m, real * work, integer *ldwork, integer *iwork, integer *info) @@ -807,21 +807,21 @@ f"> */ logical wants; real dummy[1]; integer n2; - extern /* Subroutine */ int slacn2_(integer *, real *, real *, integer *, + extern /* Subroutine */ void slacn2_(integer *, real *, real *, integer *, real *, integer *, integer *); extern real slapy2_(real *, real *); real cs; - extern /* Subroutine */ int slabad_(real *, real *); + extern /* Subroutine */ void slabad_(real *, real *); integer nn, ks; real sn, mu; extern real slamch_(char *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real bignum; logical wantbh; - extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slacpy_(char *, integer *, integer *, real *, integer *, real *, integer *); logical somcon; - extern /* Subroutine */ int slaqtr_(logical *, logical *, integer *, real + extern /* Subroutine */ void slaqtr_(logical *, logical *, integer *, real *, integer *, real *, real *, real *, real *, real *, integer *), strexc_(char *, integer *, real *, integer *, real *, integer *, integer *, integer *, real *, integer *); @@ -924,19 +924,19 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STRSNA", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible 
*/ if (*n == 0) { - return 0; + return; } if (*n == 1) { if (somcon) { if (! select[1]) { - return 0; + return; } } if (wants) { @@ -945,7 +945,7 @@ f"> */ if (wantsp) { sep[1] = (r__1 = t[t_dim1 + 1], abs(r__1)); } - return 0; + return; } /* Get machine constants */ @@ -1182,7 +1182,7 @@ f"> */ L60: ; } - return 0; + return; /* End of STRSNA */ diff --git a/lapack-netlib/SRC/strsyl.c b/lapack-netlib/SRC/strsyl.c index 1ca4bda1ed..0cc08139e0 100644 --- a/lapack-netlib/SRC/strsyl.c +++ b/lapack-netlib/SRC/strsyl.c @@ -680,7 +680,7 @@ f"> */ /* > \ingroup realSYcomputational */ /* ===================================================================== */ -/* Subroutine */ int strsyl_(char *trana, char *tranb, integer *isgn, integer +/* Subroutine */ void strsyl_(char *trana, char *tranb, integer *isgn, integer *m, integer *n, real *a, integer *lda, real *b, integer *ldb, real * c__, integer *ldc, real *scale, integer *info) { @@ -697,14 +697,14 @@ f"> */ integer j, k, l; real x[4] /* was [2][2] */; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer knext, lnext, k1, k2, l1, l2; real xnorm; - extern /* Subroutine */ int slaln2_(logical *, integer *, integer *, real + extern /* Subroutine */ void slaln2_(logical *, integer *, integer *, real *, real *, real *, integer *, real *, real *, real *, integer *, real *, real *, real *, integer *, real *, real *, integer *); real a11, db; - extern /* Subroutine */ int slasy2_(logical *, logical *, integer *, + extern /* Subroutine */ void slasy2_(logical *, logical *, integer *, integer *, integer *, real *, integer *, real *, integer *, real * , integer *, real *, real *, integer *, real *, integer *), slabad_(real *, real *); @@ -766,14 +766,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STRSYL", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ 
*scale = 1.f; if (*m == 0 || *n == 0) { - return 0; + return; } /* Set constants to control overflow */ @@ -1875,7 +1875,7 @@ f"> */ } - return 0; + return; /* End of STRSYL */ diff --git a/lapack-netlib/SRC/strsyl3.c b/lapack-netlib/SRC/strsyl3.c new file mode 100644 index 0000000000..a2a1d0a629 --- /dev/null +++ b/lapack-netlib/SRC/strsyl3.c @@ -0,0 +1,2066 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use 
with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define 
F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 
|| x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b STRSYL3 */ + +/* Definition: */ +/* =========== */ + + +/* > \par Purpose */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > STRSYL3 solves the real Sylvester matrix equation: */ +/* > */ +/* > op(A)*X + X*op(B) = scale*C or */ +/* > op(A)*X - X*op(B) = scale*C, */ +/* > */ +/* > where op(A) = A or A**T, and A and B are both upper quasi- */ +/* > triangular. A is M-by-M and B is N-by-N; the right hand side C and */ +/* > the solution X are M-by-N; and scale is an output scale factor, set */ +/* > <= 1 to avoid overflow in X. */ +/* > */ +/* > A and B must be in Schur canonical form (as returned by SHSEQR), that */ +/* > is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; */ +/* > each 2-by-2 diagonal block has its diagonal elements equal and its */ +/* > off-diagonal elements of opposite sign. */ +/* > */ +/* > This is the block version of the algorithm. 
*/ +/* > \endverbatim */ + +/* Arguments */ +/* ========= */ + +/* > \param[in] TRANA */ +/* > \verbatim */ +/* > TRANA is CHARACTER*1 */ +/* > Specifies the option op(A): */ +/* > = 'N': op(A) = A (No transpose) */ +/* > = 'T': op(A) = A**T (Transpose) */ +/* > = 'C': op(A) = A**H (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANB */ +/* > \verbatim */ +/* > TRANB is CHARACTER*1 */ +/* > Specifies the option op(B): */ +/* > = 'N': op(B) = B (No transpose) */ +/* > = 'T': op(B) = B**T (Transpose) */ +/* > = 'C': op(B) = B**H (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] ISGN */ +/* > \verbatim */ +/* > ISGN is INTEGER */ +/* > Specifies the sign in the equation: */ +/* > = +1: solve op(A)*X + X*op(B) = scale*C */ +/* > = -1: solve op(A)*X - X*op(B) = scale*C */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The order of the matrix A, and the number of rows in the */ +/* > matrices X and C. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix B, and the number of columns in the */ +/* > matrices X and C. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is REAL array, dimension (LDA,M) */ +/* > The upper quasi-triangular matrix A, in Schur canonical form. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in] B */ +/* > \verbatim */ +/* > B is REAL array, dimension (LDB,N) */ +/* > The upper quasi-triangular matrix B, in Schur canonical form. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= f2cmax(1,N). 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] C */ +/* > \verbatim */ +/* > C is REAL array, dimension (LDC,N) */ +/* > On entry, the M-by-N right hand side matrix C. */ +/* > On exit, C is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDC */ +/* > \verbatim */ +/* > LDC is INTEGER */ +/* > The leading dimension of the array C. LDC >= f2cmax(1,M) */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is REAL */ +/* > The scale factor, scale, set <= 1 to avoid overflow in X. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] IWORK */ +/* > \verbatim */ +/* > IWORK is INTEGER array, dimension (MAX(1,LIWORK)) */ +/* > On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LIWORK */ +/* > \verbatim */ +/* > IWORK is INTEGER */ +/* > The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) */ +/* > + ((N + NB - 1) / NB + 1), where NB is the optimal block size. */ +/* > */ +/* > If LIWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimension of the IWORK array, */ +/* > returns this value as the first entry of the IWORK array, and */ +/* > no error message related to LIWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SWORK */ +/* > \verbatim */ +/* > SWORK is REAL array, dimension (MAX(2, ROWS), */ +/* > MAX(1,COLS)). */ +/* > On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS */ +/* > and SWORK(2) returns the optimal COLS. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDSWORK */ +/* > \verbatim */ +/* > LDSWORK is INTEGER */ +/* > LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) */ +/* > and NB is the optimal block size. 
*/ +/* > */ +/* > If LDSWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the SWORK matrix, */ +/* > returns these values as the first and second entry of the SWORK */ +/* > matrix, and no error message related to LDSWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > = 1: A and B have common or very close eigenvalues; perturbed */ +/* > values were used to solve the equation (but the matrices */ +/* > A and B are unchanged). */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* References: */ +/* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of */ +/* algorithms: The triangular Sylvester equation, ACM Transactions */ +/* on Mathematical Software (TOMS), volume 29, pages 218--243. */ + +/* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel */ +/* Solution of the Triangular Sylvester Equation. Lecture Notes in */ +/* Computer Science, vol 12043, pages 82--92, Springer. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
*/ + +/* ===================================================================== */ +/* Subroutine */ void strsyl3_(char *trana, char *tranb, integer *isgn, + integer *m, integer *n, real *a, integer *lda, real *b, integer *ldb, + real *c__, integer *ldc, real *scale, integer *iwork, integer *liwork, + real *swork, integer *ldswork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, swork_dim1, + swork_offset, i__1, i__2, i__3, i__4, i__5, i__6; + real r__1, r__2, r__3; + + /* Local variables */ + real scal, anrm, bnrm, cnrm; + integer awrk, bwrk; + logical skip; + real *wnrm, xnrm; + integer i__, j, k, l; + extern logical lsame_(char *, char *); + integer iinfo; + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), + sgemm_(char *, char *, integer *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *); + integer i1, i2, j1, j2, k1, k2, l1; +// extern integer myexp_(real *); + integer l2, nb, pc, jj, ll; + real scaloc; + extern real slamch_(char *), slange_(char *, integer *, integer *, + real *, integer *, real *); + real scamin; + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + real bignum; + extern /* Subroutine */ void slascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, real *, integer *, integer *); + extern real slarmm_(real *, real *, real *); + logical notrna, notrnb; + real smlnum; + logical lquery; + extern /* Subroutine */ void strsyl_(char *, char *, integer *, integer *, + integer *, real *, integer *, real *, integer *, real *, integer * + , real *, integer *); + integer nba, nbb; + real buf, sgn; + +/* Decode and Test input parameters */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + 
b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --iwork; + swork_dim1 = *ldswork; + swork_offset = 1 + swork_dim1 * 1; + swork -= swork_offset; + + /* Function Body */ + notrna = lsame_(trana, "N"); + notrnb = lsame_(tranb, "N"); + +/* Use the same block size for all matrices. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "STRSYL", "", m, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + +/* Compute number of blocks in A and B */ + +/* Computing MAX */ + i__1 = 1, i__2 = (*m + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nbb = f2cmax(i__1,i__2); + +/* Compute workspace */ + + *info = 0; + lquery = *liwork == -1 || *ldswork == -1; + iwork[1] = nba + nbb + 2; + if (lquery) { + *ldswork = 2; + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + } + +/* Test the input arguments */ + + if (! notrna && ! lsame_(trana, "T") && ! lsame_( + trana, "C")) { + *info = -1; + } else if (! notrnb && ! lsame_(tranb, "T") && ! + lsame_(tranb, "C")) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < f2cmax(1,*m)) { + *info = -7; + } else if (*ldb < f2cmax(1,*n)) { + *info = -9; + } else if (*ldc < f2cmax(1,*m)) { + *info = -11; + } else if (! lquery && *liwork < iwork[1]) { + *info = -14; + } else if (! 
lquery && *ldswork < f2cmax(nba,nbb)) { + *info = -16; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("STRSYL3", &i__1, 7); + return; + } else if (lquery) { + return; + } + +/* Quick return if possible */ + + *scale = 1.f; + if (*m == 0 || *n == 0) { + return; + } + +/* Use unblocked code for small problems or if insufficient */ +/* workspaces are provided */ + + if (f2cmin(nba,nbb) == 1 || *ldswork < f2cmax(nba,nbb) || *liwork < iwork[1]) { + strsyl_(trana, tranb, isgn, m, n, &a[a_offset], lda, &b[b_offset], + ldb, &c__[c_offset], ldc, scale, info); + return; + } + + +/* REAL WNRM( MAX( M, N ) ) */ + wnrm=(real*)malloc (f2cmax(*m,*n)*sizeof(real)); + +/* Set constants to control overflow */ + + smlnum = slamch_("S"); + bignum = 1.f / smlnum; + +/* Partition A such that 2-by-2 blocks on the diagonal are not split */ + + skip = FALSE_; + i__1 = nba; + for (i__ = 1; i__ <= i__1; ++i__) { + iwork[i__] = (i__ - 1) * nb + 1; + } + iwork[nba + 1] = *m + 1; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + l1 = iwork[k]; + l2 = iwork[k + 1] - 1; + i__2 = l2; + for (l = l1; l <= i__2; ++l) { + if (skip) { + skip = FALSE_; + mycycle_(); + } + if (l >= *m) { +/* A( M, M ) is a 1-by-1 block */ + mycycle_(); + } + if (a[l + (l + 1) * a_dim1] != 0.f && a[l + 1 + l * a_dim1] != + 0.f) { +/* Check if 2-by-2 block is split */ + if (l + 1 == iwork[k + 1]) { + ++iwork[k + 1]; + mycycle_(); + } + skip = TRUE_; + } + } + } + iwork[nba + 1] = *m + 1; + if (iwork[nba] >= iwork[nba + 1]) { + iwork[nba] = iwork[nba + 1]; + --nba; + } + +/* Partition B such that 2-by-2 blocks on the diagonal are not split */ + + pc = nba + 1; + skip = FALSE_; + i__1 = nbb; + for (i__ = 1; i__ <= i__1; ++i__) { + iwork[pc + i__] = (i__ - 1) * nb + 1; + } + iwork[pc + nbb + 1] = *n + 1; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + l1 = iwork[pc + k]; + l2 = iwork[pc + k + 1] - 1; + i__2 = l2; + for (l = l1; l <= i__2; ++l) { + if (skip) { + skip = FALSE_; + mycycle_(); + } + if (l >= *n) { +/* B( N, N 
) is a 1-by-1 block */ + mycycle_(); + } + if (b[l + (l + 1) * b_dim1] != 0.f && b[l + 1 + l * b_dim1] != + 0.f) { +/* Check if 2-by-2 block is split */ + if (l + 1 == iwork[pc + k + 1]) { + ++iwork[pc + k + 1]; + mycycle_(); + } + skip = TRUE_; + } + } + } + iwork[pc + nbb + 1] = *n + 1; + if (iwork[pc + nbb] >= iwork[pc + nbb + 1]) { + iwork[pc + nbb] = iwork[pc + nbb + 1]; + --nbb; + } + +/* Set local scaling factors - must never attain zero. */ + + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + i__2 = nba; + for (k = 1; k <= i__2; ++k) { + swork[k + l * swork_dim1] = 1.f; + } + } + +/* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. */ +/* This scaling is to ensure compatibility with TRSYL and may get flushed. */ + + buf = 1.f; + +/* Compute upper bounds of blocks of A and B */ + + awrk = nbb; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nba; + for (l = k; l <= i__2; ++l) { + l1 = iwork[l]; + l2 = iwork[l + 1]; + if (notrna) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (awrk + l) * swork_dim1] = slange_("I", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (awrk + k) * swork_dim1] = slange_("1", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } + } + } + bwrk = nbb + nba; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[pc + k]; + k2 = iwork[pc + k + 1]; + i__2 = nbb; + for (l = k; l <= i__2; ++l) { + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + if (notrnb) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (bwrk + l) * swork_dim1] = slange_("I", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (bwrk + k) * swork_dim1] = slange_("1", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } + } + } + + sgn = (real) (*isgn); + + if (notrna && notrnb) { + +/* Solve A*X + ISGN*X*B = scale*C. 
*/ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-left corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* M L-1 */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. */ +/* I=K+1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__2 = k2 - k1; + i__3 = l2 - l1; + strsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = slange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + for (i__ = k - 1; i__ >= 1; --i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = slange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + sscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + sscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + sgemm_("N", "N", &i__2, &i__3, &i__4, &c_b31, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + + } + + i__2 = nbb; + for (j = l + 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = slange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + r__1 = -sgn; + sgemm_("N", "N", &i__3, &i__4, &i__5, &r__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (! notrna && notrnb) { + +/* Solve A**T*X + ISGN*X*B = scale*C. 
*/ + +/* The (K,L)th block of X is determined starting from */ +/* upper-left corner column by column by */ + +/* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 L-1 */ +/* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] */ +/* I=1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__3 = k2 - k1; + i__4 = l2 - l1; + strsyl_(trana, tranb, isgn, &i__3, &i__4, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__3); + } + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__3 = k2 - k1; + i__4 = l2 - l1; + xnrm = slange_("I", &i__3, &i__4, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__3 = nba; + for (i__ = k + 1; i__ <= i__3; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = i2 - i1; + i__5 = l2 - l1; + cnrm = slange_("I", &i__4, &i__5, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( I, L ) and C( K, L ). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + sscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = i2 - i1; + sscal_(&i__5, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__4 = i2 - i1; + i__5 = l2 - l1; + i__6 = k2 - k1; + sgemm_("T", "N", &i__4, &i__5, &i__6, &c_b31, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + } + + i__3 = nbb; + for (j = l + 1; j <= i__3; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = k2 - k1; + i__5 = j2 - j1; + cnrm = slange_("I", &i__4, &i__5, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__4 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + sscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = j2 - 1; + for (jj = j1; jj <= i__4; ++jj) { + i__5 = k2 - k1; + sscal_(&i__5, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__4 = k2 - k1; + i__5 = j2 - j1; + i__6 = l2 - l1; + r__1 = -sgn; + sgemm_("N", "N", &i__4, &i__5, &i__6, &r__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (! notrna && ! notrnb) { + +/* Solve A**T*X + ISGN*X*B**T = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* top-right corner column by column by */ + +/* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 N */ +/* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. 
*/ +/* I=1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__2 = k2 - k1; + i__3 = l2 - l1; + strsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = slange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__2 = nba; + for (i__ = k + 1; i__ <= i__2; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = i2 - i1; + i__4 = l2 - l1; + cnrm = slange_("I", &i__3, &i__4, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = i2 - i1; + sscal_(&i__4, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__3 = i2 - i1; + i__4 = l2 - l1; + i__5 = k2 - k1; + sgemm_("T", "N", &i__3, &i__4, &i__5, &c_b31, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + } + + i__2 = l - 1; + for (j = 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = slange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + r__1 = -sgn; + sgemm_("N", "T", &i__3, &i__4, &i__5, &r__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (notrna && ! notrnb) { + +/* Solve A*X + ISGN*X*B**T = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-right corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) */ + +/* Where */ +/* M N */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. 
*/ +/* I=K+1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__1 = k2 - k1; + i__2 = l2 - l1; + strsyl_(trana, tranb, isgn, &i__1, &i__2, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__1 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__1); + } + i__1 = nbb; + for (jj = 1; jj <= i__1; ++jj) { + i__2 = nba; + for (ll = 1; ll <= i__2; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__3 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b19, &i__3); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__1 = k2 - k1; + i__2 = l2 - l1; + xnrm = slange_("I", &i__1, &i__2, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__1 = k - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = slange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = k2 - k1; + sscal_(&i__3, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + sscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + sgemm_("N", "N", &i__2, &i__3, &i__4, &c_b31, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + + } + + i__1 = l - 1; + for (j = 1; j <= i__1; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = k2 - k1; + i__3 = j2 - j1; + cnrm = slange_("I", &i__2, &i__3, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + sscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = j2 - 1; + for (jj = j1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + sscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__2 = k2 - k1; + i__3 = j2 - j1; + i__4 = l2 - l1; + r__1 = -sgn; + sgemm_("N", "T", &i__2, &i__3, &i__4, &r__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + + } + + free(wnrm); +/* Reduce local scaling factors */ + + *scale = swork[swork_dim1 + 1]; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { +/* Computing MIN */ + r__1 = *scale, r__2 = swork[k + l * swork_dim1]; + *scale = f2cmin(r__1,r__2); + } + } + + if (*scale == 0.f) { + +/* The magnitude of the largest entry of the solution is larger */ +/* than the product of BIGNUM**2 and cannot be represented in the */ +/* form (1/SCALE)*X if SCALE is REAL. Set SCALE to zero and give up. 
*/ + + iwork[1] = nba + nbb + 2; + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + return; + } + +/* Realize consistent scaling */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + scal = *scale / swork[k + l * swork_dim1]; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], &c__1); + } + } + } + } + + if (buf != 1.f && buf > 0.f) { + +/* Decrease SCALE as much as possible. */ + +/* Computing MIN */ + r__1 = *scale / smlnum, r__2 = 1.f / buf; + scaloc = f2cmin(r__1,r__2); + buf *= scaloc; + *scale /= scaloc; + } + if (buf != 1.f && buf > 0.f) { + +/* In case of overly aggressive scaling during the computation, */ +/* flushing of the global scale factor may be prevented by */ +/* undoing some of the scaling. This step is to ensure that */ +/* this routine flushes only scale factors that TRSYL also */ +/* flushes and be usable as a drop-in replacement. */ + +/* How much can the normwise largest entry be upscaled? */ + + scal = c__[c_dim1 + 1]; + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = *n; + for (l = 1; l <= i__2; ++l) { +/* Computing MAX */ + r__2 = scal, r__3 = (r__1 = c__[k + l * c_dim1], abs(r__1)); + scal = f2cmax(r__2,r__3); + } + } + +/* Increase BUF as close to 1 as possible and apply scaling. */ + +/* Computing MIN */ + r__1 = bignum / scal, r__2 = 1.f / buf; + scaloc = f2cmin(r__1,r__2); + buf *= scaloc; + slascl_("G", &c_n1, &c_n1, &c_b32, &scaloc, m, n, &c__[c_offset], ldc, + &iwork[1]); + } + +/* Combine with buffer scaling factor. SCALE will be flushed if */ +/* BUF is less than one here. 
*/ + + *scale *= buf; + +/* Restore workspace dimensions */ + + iwork[1] = nba + nbb + 2; + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + + return; + +/* End of STRSYL3 */ + +} /* strsyl3_ */ + diff --git a/lapack-netlib/SRC/strsyl3.f b/lapack-netlib/SRC/strsyl3.f new file mode 100644 index 0000000000..28762c2ed1 --- /dev/null +++ b/lapack-netlib/SRC/strsyl3.f @@ -0,0 +1,1244 @@ +*> \brief \b STRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> STRSYL3 solves the real Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**T, and A and B are both upper quasi- +*> triangular. A is M-by-M and B is N-by-N; the right hand side C and +*> the solution X are M-by-N; and scale is an output scale factor, set +*> <= 1 to avoid overflow in X. +*> +*> A and B must be in Schur canonical form (as returned by SHSEQR), that +*> is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; +*> each 2-by-2 diagonal block has its diagonal elements equal and its +*> off-diagonal elements of opposite sign. +*> +*> This is the block version of the algorithm. 
+*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'T': op(A) = A**T (Transpose) +*> = 'C': op(A) = A**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'T': op(B) = B**T (Transpose) +*> = 'C': op(B) = B**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is REAL array, dimension (LDA,M) +*> The upper quasi-triangular matrix A, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is REAL array, dimension (LDB,N) +*> The upper quasi-triangular matrix B, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. 
LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> IWORK is INTEGER +*> The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) +*> + ((N + NB - 1) / NB + 1), where NB is the optimal block size. +*> +*> If LIWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimension of the IWORK array, +*> returns this value as the first entry of the IWORK array, and +*> no error message related to LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is REAL array, dimension (MAX(2, ROWS), +*> MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. +*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). 
Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE STRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, IWORK, LIWORK, SWORK, LDSWORK, + $ INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, M, N, + $ LIWORK, LDSWORK + REAL SCALE +* .. +* .. Array Arguments .. + INTEGER IWORK( * ) + REAL A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY, SKIP + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB, PC + REAL ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM +* .. +* .. Local Arrays .. + REAL WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLANGE, SLAMCH, SLARMM + EXTERNAL SLANGE, SLAMCH, SLARMM, ILAENV, LSAME +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SLASCL, SSCAL, STRSYL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, EXPONENT, MAX, MIN, REAL +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. 
+* + NB = MAX(8, ILAENV( 1, 'STRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LIWORK.EQ.-1 .OR. LDSWORK.EQ.-1 ) + IWORK( 1 ) = NBA + NBB + 2 + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK( 1, 1 ) = MAX( NBA, NBB ) + SWORK( 2, 1 ) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT.LSAME( TRANA, 'T' ) .AND. .NOT. + $ LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT.LSAME( TRANB, 'T' ) .AND. .NOT. + $ LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + ELSE IF( .NOT.LQUERY .AND. LIWORK.LT.IWORK(1) ) THEN + INFO = -14 + ELSE IF( .NOT.LQUERY .AND. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + INFO = -16 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'STRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + SCALE = ONE + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems or if insufficient +* workspaces are provided +* + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) .OR. + $ LIWORK.LT.IWORK(1) ) THEN + CALL STRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = SLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Partition A such that 2-by-2 blocks on the diagonal are not split +* + SKIP = .FALSE. + DO I = 1, NBA + IWORK( I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( NBA + 1 ) = M + 1 + DO K = 1, NBA + L1 = IWORK( K ) + L2 = IWORK( K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. 
+ CYCLE + END IF + IF( L.GE.M ) THEN +* A( M, M ) is a 1-by-1 block + CYCLE + END IF + IF( A( L, L+1 ).NE.ZERO .AND. A( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( K + 1 ) ) THEN + IWORK( K + 1 ) = IWORK( K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( NBA + 1 ) = M + 1 + IF( IWORK( NBA ).GE.IWORK( NBA + 1 ) ) THEN + IWORK( NBA ) = IWORK( NBA + 1 ) + NBA = NBA - 1 + END IF +* +* Partition B such that 2-by-2 blocks on the diagonal are not split +* + PC = NBA + 1 + SKIP = .FALSE. + DO I = 1, NBB + IWORK( PC + I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( PC + NBB + 1 ) = N + 1 + DO K = 1, NBB + L1 = IWORK( PC + K ) + L2 = IWORK( PC + K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.N ) THEN +* B( N, N ) is a 1-by-1 block + CYCLE + END IF + IF( B( L, L+1 ).NE.ZERO .AND. B( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( PC + K + 1 ) ) THEN + IWORK( PC + K + 1 ) = IWORK( PC + K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( PC + NBB + 1 ) = N + 1 + IF( IWORK( PC + NBB ).GE.IWORK( PC + NBB + 1 ) ) THEN + IWORK( PC + NBB ) = IWORK( PC + NBB + 1 ) + NBB = NBB - 1 + END IF +* +* Set local scaling factors - must never attain zero. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. 
+* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = K, NBA + L1 = IWORK( L ) + L2 = IWORK( L + 1 ) + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = SLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = SLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = IWORK( PC + K ) + K2 = IWORK( PC + K + 1 ) + DO L = K, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = SLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = SLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = REAL( ISGN ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF ( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. 
ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO JJ = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. 
NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. 
+ SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. 
.NOT.NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. 
+ SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. 
.NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. 
+ SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO +* + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is REAL. Set SCALE to zero and give up. +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF + + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? +* + SCAL = C( 1, 1 ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( C( K, L ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. 
+* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL SLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of STRSYL3 +* + END diff --git a/lapack-netlib/SRC/strti2.c b/lapack-netlib/SRC/strti2.c index f83993539c..5ec1dedbf7 100644 --- a/lapack-netlib/SRC/strti2.c +++ b/lapack-netlib/SRC/strti2.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int strti2_(char *uplo, char *diag, integer *n, real *a, +/* Subroutine */ void strti2_(char *uplo, char *diag, integer *n, real *a, integer *lda, integer *info) { /* System generated locals */ @@ -632,11 +632,11 @@ f"> */ /* Local variables */ integer j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); logical upper; - extern /* Subroutine */ int strmv_(char *, char *, char *, integer *, - real *, integer *, real *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void strmv_(char *, char *, char *, integer *, + real *, integer *, real *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical nounit; real ajj; @@ -673,7 +673,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STRTI2", &i__1, (ftnlen)6); - return 0; + return; } if (upper) { @@ -723,7 +723,7 @@ f"> */ } } - return 0; + return; /* End of STRTI2 */ diff --git a/lapack-netlib/SRC/strtri.c b/lapack-netlib/SRC/strtri.c index 2e8b34dd37..96d618ebe3 100644 --- a/lapack-netlib/SRC/strtri.c +++ b/lapack-netlib/SRC/strtri.c @@ -626,7 +626,7 @@ f"> */ /* > \ingroup 
realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int strtri_(char *uplo, char *diag, integer *n, real *a, +/* Subroutine */ void strtri_(char *uplo, char *diag, integer *n, real *a, integer *lda, integer *info) { /* System generated locals */ @@ -638,13 +638,13 @@ f"> */ integer j; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int strmm_(char *, char *, char *, char *, + extern /* Subroutine */ void strmm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * ), strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer *); integer jb, nb; - extern /* Subroutine */ int strti2_(char *, char *, integer *, real *, + extern /* Subroutine */ void strti2_(char *, char *, integer *, real *, integer *, integer *); integer nn; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -685,13 +685,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STRTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check for singularity if non-unit. 
*/ @@ -700,7 +700,7 @@ f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (a[*info + *info * a_dim1] == 0.f) { - return 0; + return; } /* L10: */ } @@ -782,7 +782,7 @@ f"> */ } } - return 0; + return; /* End of STRTRI */ diff --git a/lapack-netlib/SRC/strtrs.c b/lapack-netlib/SRC/strtrs.c index 5e82ac8254..3f6f90aa2e 100644 --- a/lapack-netlib/SRC/strtrs.c +++ b/lapack-netlib/SRC/strtrs.c @@ -652,7 +652,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int strtrs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void strtrs_(char *uplo, char *trans, char *diag, integer *n, integer *nrhs, real *a, integer *lda, real *b, integer *ldb, integer * info) { @@ -661,9 +661,10 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int strsm_(char *, char *, char *, char *, + extern /* Subroutine */ void strsm_(char *, char *, char *, char *, integer *, integer *, real *, real *, integer *, real *, integer * - ), xerbla_(char *, integer *, ftnlen); + ); + extern int xerbla_(char *, integer *, ftnlen); logical nounit; @@ -708,13 +709,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STRTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check for singularity. 
*/ @@ -723,7 +724,7 @@ f"> */ i__1 = *n; for (*info = 1; *info <= i__1; ++(*info)) { if (a[*info + *info * a_dim1] == 0.f) { - return 0; + return; } /* L10: */ } @@ -735,7 +736,7 @@ f"> */ strsm_("Left", uplo, trans, diag, n, nrhs, &c_b12, &a[a_offset], lda, &b[ b_offset], ldb); - return 0; + return; /* End of STRTRS */ diff --git a/lapack-netlib/SRC/strttf.c b/lapack-netlib/SRC/strttf.c index e76a825848..41b9c60178 100644 --- a/lapack-netlib/SRC/strttf.c +++ b/lapack-netlib/SRC/strttf.c @@ -704,7 +704,7 @@ f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int strttf_(char *transr, char *uplo, integer *n, real *a, +/* Subroutine */ void strttf_(char *transr, char *uplo, integer *n, real *a, integer *lda, real *arf, integer *info) { /* System generated locals */ @@ -753,7 +753,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STRTTF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -762,7 +762,7 @@ f"> */ if (*n == 1) { arf[0] = a[0]; } - return 0; + return; } /* Size of array ARF(0:nt-1) */ @@ -1032,7 +1032,7 @@ f"> */ } - return 0; + return; /* End of STRTTF */ diff --git a/lapack-netlib/SRC/strttp.c b/lapack-netlib/SRC/strttp.c index c82c13c32f..f90fa3dbab 100644 --- a/lapack-netlib/SRC/strttp.c +++ b/lapack-netlib/SRC/strttp.c @@ -614,7 +614,7 @@ f"> */ /* > \ingroup realOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int strttp_(char *uplo, integer *n, real *a, integer *lda, +/* Subroutine */ void strttp_(char *uplo, integer *n, real *a, integer *lda, real *ap, integer *info) { /* System generated locals */ @@ -657,7 +657,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STRTTP", &i__1, (ftnlen)6); - return 0; + return; } if (lower) { @@ -682,7 +682,7 @@ f"> */ } } - return 0; + return; /* End of STRTTP */ diff --git a/lapack-netlib/SRC/stzrzf.c b/lapack-netlib/SRC/stzrzf.c index 
ed87874b31..12f7324f76 100644 --- a/lapack-netlib/SRC/stzrzf.c +++ b/lapack-netlib/SRC/stzrzf.c @@ -667,7 +667,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int stzrzf_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void stzrzf_(integer *m, integer *n, real *a, integer *lda, real *tau, real *work, integer *lwork, integer *info) { /* System generated locals */ @@ -678,12 +678,12 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slarzb_(char *, char *, char *, char *, + extern /* Subroutine */ void slarzb_(char *, char *, char *, char *, integer *, integer *, integer *, integer *, real *, integer *, real *, integer *, real *, integer *, real *, integer *); integer lwkmin, ldwork, lwkopt; logical lquery; - extern /* Subroutine */ int slarzt_(char *, char *, integer *, integer *, + extern /* Subroutine */ void slarzt_(char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *), slatrz_(integer *, integer *, integer *, real *, integer *, real * , real *); @@ -742,22 +742,22 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("STZRZF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0) { - return 0; + return; } else if (*m == *n) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { tau[i__] = 0.f; /* L10: */ } - return 0; + return; } nbmin = 2; @@ -855,7 +855,7 @@ f"> */ work[1] = (real) lwkopt; - return 0; + return; /* End of STZRZF */ diff --git a/lapack-netlib/SRC/xerbla_array.c b/lapack-netlib/SRC/xerbla_array.c index 4ced919302..fe7d6d8984 100644 --- a/lapack-netlib/SRC/xerbla_array.c +++ b/lapack-netlib/SRC/xerbla_array.c @@ -599,7 +599,7 @@ array.f"> */ /* > \ingroup OTHERauxiliary */ /* 
===================================================================== */ -/* Subroutine */ int xerbla_array_(char *srname_array__, integer * +/* Subroutine */ void xerbla_array_(char *srname_array__, integer * srname_len__, integer *info, integer srname_array_len) { /* System generated locals */ @@ -634,6 +634,6 @@ array.f"> */ } fprintf(stderr,"xerbla_array calling xerbla with srname #%s#\n",srname); xerbla_(srname, info, (ftnlen)strlen(srname)); - return 0; + return; } /* xerbla_array__ */ diff --git a/lapack-netlib/SRC/zbbcsd.c b/lapack-netlib/SRC/zbbcsd.c index 6941e3c0b4..4cef710609 100644 --- a/lapack-netlib/SRC/zbbcsd.c +++ b/lapack-netlib/SRC/zbbcsd.c @@ -844,7 +844,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zbbcsd_(char *jobu1, char *jobu2, char *jobv1t, char * +/* Subroutine */ void zbbcsd_(char *jobu1, char *jobu2, char *jobv1t, char * jobv2t, char *trans, integer *m, integer *p, integer *q, doublereal * theta, doublereal *phi, doublecomplex *u1, integer *ldu1, doublecomplex *u2, integer *ldu2, doublecomplex *v1t, integer *ldv1t, @@ -864,16 +864,16 @@ f"> */ logical colmajor; doublereal thetamin, thetamax; logical restart11, restart12, restart21, restart22; - extern /* Subroutine */ int dlas2_(doublereal *, doublereal *, doublereal + extern /* Subroutine */ void dlas2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); integer iu1cs, iu2cs, iu1sn, iu2sn, i__, j; doublereal r__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); integer maxit; doublereal dummy; - extern /* Subroutine */ int zlasr_(char *, char *, char *, integer *, + extern /* Subroutine */ void zlasr_(char *, char *, char *, integer *, integer *, doublereal *, doublereal *, doublecomplex *, integer *), 
zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal x1, x2, y1, y2; @@ -884,12 +884,12 @@ f"> */ doublereal mu, nu, sigma11, sigma21; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal thresh, tolmul; - extern /* Subroutine */ int mecago_(); + extern /* Subroutine */ void mecago_(); logical lquery; doublereal b11bulge; logical wantv1t, wantv2t; doublereal b12bulge, b21bulge, b22bulge, eps, tol; - extern /* Subroutine */ int dlartgp_(doublereal *, doublereal *, + extern /* Subroutine */ void dlartgp_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *), dlartgs_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); @@ -963,7 +963,7 @@ f"> */ if (*info == 0 && *q == 0) { lrworkmin = 1; rwork[1] = (doublereal) lrworkmin; - return 0; + return; } /* Compute workspace */ @@ -988,9 +988,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZBBCSD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Get machine constants */ @@ -1082,7 +1082,7 @@ f"> */ ++(*info); } } - return 0; + return; } iter = iter + imax - imin; @@ -1796,7 +1796,7 @@ f"> */ } - return 0; + return; /* End of ZBBCSD */ diff --git a/lapack-netlib/SRC/zbdsqr.c b/lapack-netlib/SRC/zbdsqr.c index 59cd6788bd..35dac9e73b 100644 --- a/lapack-netlib/SRC/zbdsqr.c +++ b/lapack-netlib/SRC/zbdsqr.c @@ -737,7 +737,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zbdsqr_(char *uplo, integer *n, integer *ncvt, integer * +/* Subroutine */ void zbdsqr_(char *uplo, integer *n, integer *ncvt, integer * nru, integer *ncc, doublereal *d__, doublereal *e, doublecomplex *vt, integer *ldvt, doublecomplex *u, integer *ldu, doublecomplex *c__, integer *ldc, doublereal *rwork, integer *info) @@ -755,7 +755,7 @@ f"> */ doublereal cosl; integer isub, iter; doublereal unfl, sinl, cosr, smin, smax, 
sinr; - extern /* Subroutine */ int dlas2_(doublereal *, doublereal *, doublereal + extern /* Subroutine */ void dlas2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal f, g, h__; integer i__, j, m; @@ -767,7 +767,7 @@ f"> */ integer maxit; doublereal sminl, sigmx; logical lower; - extern /* Subroutine */ int zlasr_(char *, char *, char *, integer *, + extern /* Subroutine */ void zlasr_(char *, char *, char *, integer *, integer *, doublereal *, doublereal *, doublecomplex *, integer *), zdrot_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *) , zswap_(integer *, doublecomplex *, integer *, doublecomplex *, @@ -779,9 +779,10 @@ f"> */ integer ll; extern doublereal dlamch_(char *); doublereal sn, mu; - extern /* Subroutine */ int dlartg_(doublereal *, doublereal *, - doublereal *, doublereal *, doublereal *), xerbla_(char *, - integer *, ftnlen), zdscal_(integer *, doublereal *, + extern /* Subroutine */ void dlartg_(doublereal *, doublereal *, + doublereal *, doublereal *, doublereal *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zdscal_(integer *, doublereal *, doublecomplex *, integer *); doublereal sminoa, thresh; logical rotate; @@ -839,10 +840,10 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZBDSQR", &i__1, (ftnlen)6); - return 0; + return; } if (*n == 0) { - return 0; + return; } if (*n == 1) { goto L160; @@ -860,7 +861,7 @@ f"> */ /* If INFO equals 2, dqds didn't finish, try to finish */ if (*info != 2) { - return 0; + return; } *info = 0; } @@ -1484,7 +1485,7 @@ f"> */ /* L210: */ } L220: - return 0; + return; /* End of ZBDSQR */ diff --git a/lapack-netlib/SRC/zcgesv.c b/lapack-netlib/SRC/zcgesv.c index 0822c10696..095fa756d9 100644 --- a/lapack-netlib/SRC/zcgesv.c +++ b/lapack-netlib/SRC/zcgesv.c @@ -716,7 +716,7 @@ f"> */ /* > \ingroup complex16GEsolve */ /* ===================================================================== */ -/* Subroutine */ 
int zcgesv_(integer *n, integer *nrhs, doublecomplex *a, +/* Subroutine */ void zcgesv_(integer *n, integer *nrhs, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublecomplex *work, complex *swork, doublereal *rwork, integer *iter, integer *info) @@ -731,7 +731,7 @@ f"> */ integer ptsa; doublereal rnrm, xnrm; integer ptsx, i__, iiter; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, @@ -741,16 +741,18 @@ f"> */ doublecomplex *, integer *, complex *, integer *, integer *); extern doublereal dlamch_(char *); extern /* Subroutine */ int cgetrf_(integer *, integer *, complex *, - integer *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); extern /* Subroutine */ int cgetrs_(char *, integer *, integer *, complex *, integer *, integer *, complex *, integer *, integer *); extern integer izamax_(integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, - doublecomplex *, integer *, doublecomplex *, integer *), - zgetrf_(integer *, integer *, doublecomplex *, integer *, integer - *, integer *), zgetrs_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, + doublecomplex *, integer *, doublecomplex *, integer *); + extern int zgetrf_(integer *, integer *, doublecomplex *, integer *, integer + *, integer *); + extern int zgetrs_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); doublereal cte, eps; @@ 
-807,13 +809,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZCGESV", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if (N.EQ.0). */ if (*n == 0) { - return 0; + return; } /* Skip single precision iterative refinement if a priori slower */ @@ -902,7 +904,7 @@ f"> */ /* stopping criterion. We are good to exit. */ *iter = 0; - return 0; + return; L10: @@ -965,7 +967,7 @@ f"> */ *iter = iiter; - return 0; + return; L20: @@ -988,14 +990,14 @@ f"> */ zgetrf_(n, n, &a[a_offset], lda, &ipiv[1], info); if (*info != 0) { - return 0; + return; } zlacpy_("All", n, nrhs, &b[b_offset], ldb, &x[x_offset], ldx); zgetrs_("No transpose", n, nrhs, &a[a_offset], lda, &ipiv[1], &x[x_offset] , ldx, info); - return 0; + return; /* End of ZCGESV. */ diff --git a/lapack-netlib/SRC/zcposv.c b/lapack-netlib/SRC/zcposv.c index f08d08f5a9..65ae2e4dd4 100644 --- a/lapack-netlib/SRC/zcposv.c +++ b/lapack-netlib/SRC/zcposv.c @@ -723,7 +723,7 @@ f"> */ /* > \ingroup complex16POsolve */ /* ===================================================================== */ -/* Subroutine */ int zcposv_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zcposv_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublecomplex *work, complex *swork, doublereal *rwork, integer *iter, integer *info) @@ -740,7 +740,7 @@ f"> */ integer ptsx, i__; extern logical lsame_(char *, char *); integer iiter; - extern /* Subroutine */ int zhemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zhemm_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), zlag2c_(integer *, @@ -754,11 +754,14 @@ f"> */ integer *, doublereal *); extern integer izamax_(integer *, doublecomplex *, integer *); extern /* 
Subroutine */ int cpotrf_(char *, integer *, complex *, integer - *, integer *), zlacpy_(char *, integer *, integer *, + *, integer *); + extern void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), cpotrs_(char *, integer *, integer *, complex *, integer *, - complex *, integer *, integer *), zpotrf_(char *, integer - *, doublecomplex *, integer *, integer *), zpotrs_(char *, + complex *, integer *, integer *); + extern int zpotrf_(char *, integer + *, doublecomplex *, integer *, integer *); + extern void zpotrs_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex * , integer *, integer *); doublereal cte, eps; @@ -816,13 +819,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZCPOSV", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if (N.EQ.0). */ if (*n == 0) { - return 0; + return; } /* Skip single precision iterative refinement if a priori slower */ @@ -910,7 +913,7 @@ f"> */ /* stopping criterion. We are good to exit. */ *iter = 0; - return 0; + return; L10: @@ -972,7 +975,7 @@ f"> */ *iter = iiter; - return 0; + return; L20: @@ -995,13 +998,13 @@ f"> */ zpotrf_(uplo, n, &a[a_offset], lda, info); if (*info != 0) { - return 0; + return; } zlacpy_("All", n, nrhs, &b[b_offset], ldb, &x[x_offset], ldx); zpotrs_(uplo, n, nrhs, &a[a_offset], lda, &x[x_offset], ldx, info); - return 0; + return; /* End of ZCPOSV. 
*/ diff --git a/lapack-netlib/SRC/zdrscl.c b/lapack-netlib/SRC/zdrscl.c index 8fdcd33c47..3173319e70 100644 --- a/lapack-netlib/SRC/zdrscl.c +++ b/lapack-netlib/SRC/zdrscl.c @@ -593,15 +593,15 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zdrscl_(integer *n, doublereal *sa, doublecomplex *sx, +/* Subroutine */ void zdrscl_(integer *n, doublereal *sa, doublecomplex *sx, integer *incx) { doublereal cden; logical done; doublereal cnum, cden1, cnum1; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); extern doublereal dlamch_(char *); - extern /* Subroutine */ int zdscal_(integer *, doublereal *, + extern /* Subroutine */ void zdscal_(integer *, doublereal *, doublecomplex *, integer *); doublereal bignum, smlnum, mul; @@ -622,7 +622,7 @@ f"> */ /* Function Body */ if (*n <= 0) { - return 0; + return; } /* Get machine parameters */ @@ -669,7 +669,7 @@ f"> */ goto L10; } - return 0; + return; /* End of ZDRSCL */ diff --git a/lapack-netlib/SRC/zgbbrd.c b/lapack-netlib/SRC/zgbbrd.c index c86383c206..e9008214a1 100644 --- a/lapack-netlib/SRC/zgbbrd.c +++ b/lapack-netlib/SRC/zgbbrd.c @@ -707,7 +707,7 @@ f"> */ /* > \ingroup complex16GBcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgbbrd_(char *vect, integer *m, integer *n, integer *ncc, +/* Subroutine */ void zgbbrd_(char *vect, integer *m, integer *n, integer *ncc, integer *kl, integer *ku, doublecomplex *ab, integer *ldab, doublereal *d__, doublereal *e, doublecomplex *q, integer *ldq, doublecomplex *pt, integer *ldpt, doublecomplex *c__, integer *ldc, @@ -721,13 +721,13 @@ f"> */ /* Local variables */ integer inca; doublereal abst; - extern /* Subroutine */ int zrot_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zrot_(integer *, doublecomplex *, 
integer *, doublecomplex *, integer *, doublereal *, doublecomplex *); integer i__, j, l; doublecomplex t; extern logical lsame_(char *, char *); logical wantb, wantc; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); integer minmn; logical wantq; @@ -738,7 +738,7 @@ f"> */ doublecomplex rs; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer kb1; - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *), zlartg_(doublecomplex *, doublecomplex *, doublereal *, doublecomplex *, doublecomplex *), zlargv_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, @@ -746,7 +746,7 @@ f"> */ integer ml0; logical wantpt; integer mu0; - extern /* Subroutine */ int zlartv_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlartv_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *); integer klm, kun, nrt, klu1; @@ -812,7 +812,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGBBRD", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize Q and P**H to the unit matrix, if needed */ @@ -827,7 +827,7 @@ f"> */ /* Quick return if possible. 
*/ if (*m == 0 || *n == 0) { - return 0; + return; } minmn = f2cmin(*m,*n); @@ -1236,7 +1236,7 @@ f"> */ } /* L120: */ } - return 0; + return; /* End of ZGBBRD */ diff --git a/lapack-netlib/SRC/zgbcon.c b/lapack-netlib/SRC/zgbcon.c index 4f1b1b3e87..5cac8cdef6 100644 --- a/lapack-netlib/SRC/zgbcon.c +++ b/lapack-netlib/SRC/zgbcon.c @@ -659,7 +659,7 @@ f"> */ /* > \ingroup complex16GBcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgbcon_(char *norm, integer *n, integer *kl, integer *ku, +/* Subroutine */ void zgbcon_(char *norm, integer *n, integer *kl, integer *ku, doublecomplex *ab, integer *ldab, integer *ipiv, doublereal *anorm, doublereal *rcond, doublecomplex *work, doublereal *rwork, integer * info) @@ -678,7 +678,7 @@ f"> */ extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); logical lnoti; - extern /* Subroutine */ int zaxpy_(integer *, doublecomplex *, + extern /* Subroutine */ void zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), zlacn2_( integer *, doublecomplex *, doublecomplex *, doublereal *, integer *, integer *); @@ -689,7 +689,7 @@ f"> */ doublereal ainvnm; extern integer izamax_(integer *, doublecomplex *, integer *); logical onenrm; - extern /* Subroutine */ int zlatbs_(char *, char *, char *, char *, + extern /* Subroutine */ void zlatbs_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublereal *, doublereal *, integer *), zdrscl_(integer *, doublereal *, doublecomplex *, integer *); @@ -735,7 +735,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGBCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -743,9 +743,9 @@ f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm == 0.) 
{ - return 0; + return; } smlnum = dlamch_("Safe minimum"); @@ -859,7 +859,7 @@ f"> */ } L40: - return 0; + return; /* End of ZGBCON */ diff --git a/lapack-netlib/SRC/zgbequ.c b/lapack-netlib/SRC/zgbequ.c index d3dfbfa8ac..7c5f38723b 100644 --- a/lapack-netlib/SRC/zgbequ.c +++ b/lapack-netlib/SRC/zgbequ.c @@ -662,7 +662,7 @@ f"> */ /* > \ingroup complex16GBcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgbequ_(integer *m, integer *n, integer *kl, integer *ku, +/* Subroutine */ void zgbequ_(integer *m, integer *n, integer *kl, integer *ku, doublecomplex *ab, integer *ldab, doublereal *r__, doublereal *c__, doublereal *rowcnd, doublereal *colcnd, doublereal *amax, integer * info) @@ -714,7 +714,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGBEQU", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -723,7 +723,7 @@ f"> */ *rowcnd = 1.; *colcnd = 1.; *amax = 0.; - return 0; + return; } /* Get machine constants. */ @@ -784,7 +784,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { if (r__[i__] == 0.) { *info = i__; - return 0; + return; } /* L50: */ } @@ -861,7 +861,7 @@ f"> */ for (j = 1; j <= i__1; ++j) { if (c__[j] == 0.) 
{ *info = *m + j; - return 0; + return; } /* L110: */ } @@ -884,7 +884,7 @@ f"> */ *colcnd = f2cmax(rcmin,smlnum) / f2cmin(rcmax,bignum); } - return 0; + return; /* End of ZGBEQU */ diff --git a/lapack-netlib/SRC/zgbequb.c b/lapack-netlib/SRC/zgbequb.c index a442073ae0..11a14d641b 100644 --- a/lapack-netlib/SRC/zgbequb.c +++ b/lapack-netlib/SRC/zgbequb.c @@ -669,7 +669,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16GBcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgbequb_(integer *m, integer *n, integer *kl, integer * +/* Subroutine */ void zgbequb_(integer *m, integer *n, integer *kl, integer * ku, doublecomplex *ab, integer *ldab, doublereal *r__, doublereal * c__, doublereal *rowcnd, doublereal *colcnd, doublereal *amax, integer *info) @@ -721,7 +721,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZGBEQUB", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible. */ @@ -730,7 +730,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rowcnd = 1.; *colcnd = 1.; *amax = 0.; - return 0; + return; } /* Get machine constants. Assume SMLNUM is a power of the radix. */ @@ -800,7 +800,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (i__ = 1; i__ <= i__1; ++i__) { if (r__[i__] == 0.) { *info = i__; - return 0; + return; } /* L50: */ } @@ -880,7 +880,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (j = 1; j <= i__1; ++j) { if (c__[j] == 0.) 
{ *info = *m + j; - return 0; + return; } /* L110: */ } @@ -903,7 +903,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *colcnd = f2cmax(rcmin,smlnum) / f2cmin(rcmax,bignum); } - return 0; + return; /* End of ZGBEQUB */ diff --git a/lapack-netlib/SRC/zgbrfs.c b/lapack-netlib/SRC/zgbrfs.c index 165f7f549c..a09132c430 100644 --- a/lapack-netlib/SRC/zgbrfs.c +++ b/lapack-netlib/SRC/zgbrfs.c @@ -718,7 +718,7 @@ f"> */ /* > \ingroup complex16GBcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgbrfs_(char *trans, integer *n, integer *kl, integer * +/* Subroutine */ void zgbrfs_(char *trans, integer *n, integer *kl, integer * ku, integer *nrhs, doublecomplex *ab, integer *ldab, doublecomplex * afb, integer *ldafb, integer *ipiv, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *ferr, doublereal *berr, @@ -737,12 +737,12 @@ f"> */ doublereal s; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int zgbmv_(char *, integer *, integer *, integer * + extern /* Subroutine */ void zgbmv_(char *, integer *, integer *, integer * , integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer count; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), zlacn2_( integer *, doublecomplex *, doublecomplex *, doublereal *, @@ -756,7 +756,7 @@ f"> */ logical notran; char transn[1], transt[1]; doublereal lstres; - extern /* Subroutine */ int zgbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void zgbtrs_(char *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer 
*, integer *); doublereal eps; @@ -818,7 +818,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGBRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -830,7 +830,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } if (notran) { @@ -1069,7 +1069,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of ZGBRFS */ diff --git a/lapack-netlib/SRC/zgbsv.c b/lapack-netlib/SRC/zgbsv.c index f7356516db..38a84cdcdc 100644 --- a/lapack-netlib/SRC/zgbsv.c +++ b/lapack-netlib/SRC/zgbsv.c @@ -673,7 +673,7 @@ e driver) */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgbsv_(integer *n, integer *kl, integer *ku, integer * +/* Subroutine */ void zgbsv_(integer *n, integer *kl, integer *ku, integer * nrhs, doublecomplex *ab, integer *ldab, integer *ipiv, doublecomplex * b, integer *ldb, integer *info) { @@ -681,7 +681,8 @@ e driver) */ integer ab_dim1, ab_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zgbtrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zgbtrf_( integer *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, integer *), zgbtrs_(char *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, @@ -726,7 +727,7 @@ e driver) */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGBSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the LU factorization of the band matrix A. 
*/ @@ -739,7 +740,7 @@ e driver) */ zgbtrs_("No transpose", n, kl, ku, nrhs, &ab[ab_offset], ldab, &ipiv[ 1], &b[b_offset], ldb, info); } - return 0; + return; /* End of ZGBSV */ diff --git a/lapack-netlib/SRC/zgbsvx.c b/lapack-netlib/SRC/zgbsvx.c index 40cbfd0dd3..5e18ae34a9 100644 --- a/lapack-netlib/SRC/zgbsvx.c +++ b/lapack-netlib/SRC/zgbsvx.c @@ -881,7 +881,7 @@ f"> */ /* > \ingroup complex16GBsolve */ /* ===================================================================== */ -/* Subroutine */ int zgbsvx_(char *fact, char *trans, integer *n, integer *kl, +/* Subroutine */ void zgbsvx_(char *fact, char *trans, integer *n, integer *kl, integer *ku, integer *nrhs, doublecomplex *ab, integer *ldab, doublecomplex *afb, integer *ldafb, integer *ipiv, char *equed, doublereal *r__, doublereal *c__, doublecomplex *b, integer *ldb, @@ -903,19 +903,20 @@ f"> */ doublereal rcmin, rcmax, anorm; logical equil; integer j1, j2; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern doublereal dlamch_(char *); doublereal colcnd; logical nofact; extern doublereal zlangb_(char *, integer *, integer *, integer *, doublecomplex *, integer *, doublereal *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlaqgb_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlaqgb_( integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, char *); doublereal bignum; - extern /* Subroutine */ int zgbcon_(char *, integer *, integer *, integer + extern /* Subroutine */ void zgbcon_(char *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, doublereal *, doublereal *, doublecomplex *, doublereal *, integer *); integer infequ; @@ -923,7 +924,7 @@ f"> */ extern doublereal zlantb_(char *, char *, char *, integer *, 
integer *, doublecomplex *, integer *, doublereal *); doublereal rowcnd; - extern /* Subroutine */ int zgbequ_(integer *, integer *, integer *, + extern /* Subroutine */ void zgbequ_(integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *), zgbrfs_( char *, integer *, integer *, integer *, integer *, doublecomplex @@ -933,10 +934,10 @@ f"> */ integer *), zgbtrf_(integer *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, integer *); logical notran; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal smlnum; - extern /* Subroutine */ int zgbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void zgbtrs_(char *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); logical rowequ; @@ -1071,7 +1072,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGBSVX", &i__1, (ftnlen)6); - return 0; + return; } if (equil) { @@ -1186,7 +1187,7 @@ f"> */ } rwork[1] = rpvgrw; *rcond = 0.; - return 0; + return; } } @@ -1280,7 +1281,7 @@ f"> */ } rwork[1] = rpvgrw; - return 0; + return; /* End of ZGBSVX */ diff --git a/lapack-netlib/SRC/zgbsvxx.c b/lapack-netlib/SRC/zgbsvxx.c index 30f3fc2310..ad6692aa06 100644 --- a/lapack-netlib/SRC/zgbsvxx.c +++ b/lapack-netlib/SRC/zgbsvxx.c @@ -1065,7 +1065,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16GBsolve */ /* ===================================================================== */ -/* Subroutine */ int zgbsvxx_(char *fact, char *trans, integer *n, integer * +/* Subroutine */ void zgbsvxx_(char *fact, char *trans, integer *n, integer * kl, integer *ku, integer *nrhs, doublecomplex *ab, integer *ldab, doublecomplex *afb, 
integer *ldafb, integer *ipiv, char *equed, doublereal *r__, doublereal *c__, doublecomplex *b, integer *ldb, @@ -1091,7 +1091,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern doublereal dlamch_(char *); doublereal colcnd; logical nofact; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlaqgb_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlaqgb_( integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, char *); @@ -1099,17 +1100,17 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ integer infequ; logical colequ; doublereal rowcnd; - extern /* Subroutine */ int zgbtrf_(integer *, integer *, integer *, + extern /* Subroutine */ void zgbtrf_(integer *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, integer *); logical notran; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal smlnum; - extern /* Subroutine */ int zgbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void zgbtrs_(char *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); logical rowequ; - extern /* Subroutine */ int zlascl2_(integer *, integer *, doublereal *, + extern /* Subroutine */ void zlascl2_(integer *, integer *, doublereal *, doublecomplex *, integer *), zgbequb_(integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *) @@ -1258,7 +1259,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZGBSVXX", &i__1, (ftnlen)7); - return 0; + return; } if 
(equil) { @@ -1334,7 +1335,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rpvgrw = zla_gbrpvgrw_(n, kl, ku, info, &ab[ab_offset], ldab, & afb[afb_offset], ldafb); - return 0; + return; } } @@ -1367,7 +1368,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ zlascl2_(n, nrhs, &r__[1], &x[x_offset], ldx); } - return 0; + return; /* End of ZGBSVXX */ diff --git a/lapack-netlib/SRC/zgbtf2.c b/lapack-netlib/SRC/zgbtf2.c index 4711b23b2f..b15c12d869 100644 --- a/lapack-netlib/SRC/zgbtf2.c +++ b/lapack-netlib/SRC/zgbtf2.c @@ -660,7 +660,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgbtf2_(integer *m, integer *n, integer *kl, integer *ku, +/* Subroutine */ void zgbtf2_(integer *m, integer *n, integer *kl, integer *ku, doublecomplex *ab, integer *ldab, integer *ipiv, integer *info) { /* System generated locals */ @@ -669,7 +669,7 @@ f"> */ /* Local variables */ integer i__, j; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zgeru_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, @@ -717,13 +717,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGBTF2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Gaussian elimination with partial pivoting */ @@ -816,7 +816,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of ZGBTF2 */ diff --git a/lapack-netlib/SRC/zgbtrf.c b/lapack-netlib/SRC/zgbtrf.c index a9df0d4ae4..64b5d13048 100644 --- a/lapack-netlib/SRC/zgbtrf.c +++ b/lapack-netlib/SRC/zgbtrf.c @@ -659,7 +659,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ 
-/* Subroutine */ int zgbtrf_(integer *m, integer *n, integer *kl, integer *ku, +/* Subroutine */ void zgbtrf_(integer *m, integer *n, integer *kl, integer *ku, doublecomplex *ab, integer *ldab, integer *ipiv, integer *info) { /* System generated locals */ @@ -669,7 +669,7 @@ f"> */ /* Local variables */ doublecomplex temp; integer i__, j; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, @@ -677,7 +677,7 @@ f"> */ doublecomplex work13[4160] /* was [65][64] */, work31[4160] /* was [65][64] */; integer i2, i3, j2, j3, k2; - extern /* Subroutine */ int zgeru_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeru_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, @@ -734,13 +734,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGBTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Determine the block size for this environment */ @@ -1151,7 +1151,7 @@ f"> */ } } - return 0; + return; /* End of ZGBTRF */ diff --git a/lapack-netlib/SRC/zgbtrs.c b/lapack-netlib/SRC/zgbtrs.c index f425a9514f..b744ab5b6b 100644 --- a/lapack-netlib/SRC/zgbtrs.c +++ b/lapack-netlib/SRC/zgbtrs.c @@ -651,7 +651,7 @@ f"> */ /* > \ingroup complex16GBcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgbtrs_(char *trans, integer *n, integer *kl, integer * +/* Subroutine */ void zgbtrs_(char *trans, integer *n, integer *kl, integer * ku, integer *nrhs, doublecomplex *ab, integer *ldab, integer 
*ipiv, doublecomplex *b, integer *ldb, integer *info) { @@ -663,7 +663,7 @@ f"> */ integer i__, j, l; extern logical lsame_(char *, char *); logical lnoti; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zgeru_(integer *, integer *, doublecomplex *, doublecomplex *, @@ -672,7 +672,8 @@ f"> */ integer *), ztbsv_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer kd, lm; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlacgv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlacgv_( integer *, doublecomplex *, integer *); logical notran; @@ -719,13 +720,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGBTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } kd = *ku + *kl + 1; @@ -840,7 +841,7 @@ f"> */ } } } - return 0; + return; /* End of ZGBTRS */ diff --git a/lapack-netlib/SRC/zgebak.c b/lapack-netlib/SRC/zgebak.c index 1aacc55bd6..131185aec5 100644 --- a/lapack-netlib/SRC/zgebak.c +++ b/lapack-netlib/SRC/zgebak.c @@ -639,7 +639,7 @@ f"> */ /* > \ingroup complex16GEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgebak_(char *job, char *side, integer *n, integer *ilo, +/* Subroutine */ void zgebak_(char *job, char *side, integer *n, integer *ilo, integer *ihi, doublereal *scale, integer *m, doublecomplex *v, integer *ldv, integer *info) { @@ -651,10 +651,11 @@ f"> */ doublereal s; extern logical lsame_(char *, char *); logical leftv; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, 
integer *); integer ii; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); logical rightv; @@ -700,19 +701,19 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEBAK", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*m == 0) { - return 0; + return; } if (lsame_(job, "N")) { - return 0; + return; } if (*ilo == *ihi) { @@ -791,7 +792,7 @@ f"> */ } } - return 0; + return; /* End of ZGEBAK */ diff --git a/lapack-netlib/SRC/zgebak.f b/lapack-netlib/SRC/zgebak.f index 9ec610efb4..9a0f65a439 100644 --- a/lapack-netlib/SRC/zgebak.f +++ b/lapack-netlib/SRC/zgebak.f @@ -238,7 +238,7 @@ SUBROUTINE ZGEBAK( JOB, SIDE, N, ILO, IHI, SCALE, M, V, LDV, $ GO TO 40 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL ZSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -252,7 +252,7 @@ SUBROUTINE ZGEBAK( JOB, SIDE, N, ILO, IHI, SCALE, M, V, LDV, $ GO TO 50 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 50 CALL ZSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/zgebal.c b/lapack-netlib/SRC/zgebal.c index 719cdf5868..289e9c9a58 100644 --- a/lapack-netlib/SRC/zgebal.c +++ b/lapack-netlib/SRC/zgebal.c @@ -675,7 +675,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgebal_(char *job, integer *n, doublecomplex *a, integer +/* Subroutine */ void zgebal_(char *job, integer *n, doublecomplex *a, integer *lda, integer *ilo, integer *ihi, doublereal *scale, integer *info) { /* System generated locals */ @@ -688,14 +688,15 @@ f"> */ integer i__, j, k, l, m; doublereal r__, s; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zswap_(integer *, 
doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal sfmin1, sfmin2, sfmax1, sfmax2, ca; extern doublereal dznrm2_(integer *, doublecomplex *, integer *); doublereal ra; extern doublereal dlamch_(char *); extern logical disnan_(doublereal *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); extern integer izamax_(integer *, doublecomplex *, integer *); logical noconv; @@ -732,7 +733,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEBAL", &i__1, (ftnlen)6); - return 0; + return; } k = 1; @@ -898,7 +899,7 @@ f"> */ *info = -3; i__2 = -(*info); xerbla_("ZGEBAL", &i__2, (ftnlen)6); - return 0; + return; } f *= 2.; c__ *= 2.; @@ -960,7 +961,7 @@ f"> */ *ilo = k; *ihi = l; - return 0; + return; /* End of ZGEBAL */ diff --git a/lapack-netlib/SRC/zgebd2.c b/lapack-netlib/SRC/zgebd2.c index 556839b3b2..17282ba0bb 100644 --- a/lapack-netlib/SRC/zgebd2.c +++ b/lapack-netlib/SRC/zgebd2.c @@ -702,7 +702,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgebd2_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgebd2_(integer *m, integer *n, doublecomplex *a, integer *lda, doublereal *d__, doublereal *e, doublecomplex *tauq, doublecomplex *taup, doublecomplex *work, integer *info) { @@ -713,9 +713,11 @@ f"> */ /* Local variables */ integer i__; doublecomplex alpha; - extern /* Subroutine */ int zlarf_(char *, integer *, integer *, + extern /* Subroutine */ void zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *), xerbla_(char *, integer *, ftnlen), zlarfg_(integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *); + 
extern int xerbla_(char *, integer *, ftnlen); + extern void zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *), zlacgv_(integer *, doublecomplex *, integer *); @@ -753,7 +755,7 @@ f"> */ if (*info < 0) { i__1 = -(*info); xerbla_("ZGEBD2", &i__1, (ftnlen)6); - return 0; + return; } if (*m >= *n) { @@ -899,7 +901,7 @@ f"> */ /* L20: */ } } - return 0; + return; /* End of ZGEBD2 */ diff --git a/lapack-netlib/SRC/zgebrd.c b/lapack-netlib/SRC/zgebrd.c index 7561c8a929..cc2d94b650 100644 --- a/lapack-netlib/SRC/zgebrd.c +++ b/lapack-netlib/SRC/zgebrd.c @@ -721,7 +721,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgebrd_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgebrd_(integer *m, integer *n, doublecomplex *a, integer *lda, doublereal *d__, doublereal *e, doublecomplex *tauq, doublecomplex *taup, doublecomplex *work, integer *lwork, integer * info) @@ -733,14 +733,15 @@ f"> */ /* Local variables */ integer i__, j, nbmin, iinfo, minmn; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zgebd2_(integer *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); integer nb, nx, ws; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlabrd_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlabrd_( integer *, integer *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); @@ -797,9 +798,9 @@ f"> */ if (*info < 0) { i__1 = -(*info); xerbla_("ZGEBRD", &i__1, (ftnlen)6); - return 0; + 
return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -807,7 +808,7 @@ f"> */ minmn = f2cmin(*m,*n); if (minmn == 0) { work[1].r = 1., work[1].i = 0.; - return 0; + return; } ws = f2cmax(*m,*n); @@ -912,7 +913,7 @@ f"> */ zgebd2_(&i__2, &i__1, &a[i__ + i__ * a_dim1], lda, &d__[i__], &e[i__], & tauq[i__], &taup[i__], &work[1], &iinfo); work[1].r = (doublereal) ws, work[1].i = 0.; - return 0; + return; /* End of ZGEBRD */ diff --git a/lapack-netlib/SRC/zgecon.c b/lapack-netlib/SRC/zgecon.c index 27e4f784d8..d83bcf93f4 100644 --- a/lapack-netlib/SRC/zgecon.c +++ b/lapack-netlib/SRC/zgecon.c @@ -636,7 +636,7 @@ f"> */ /* > \ingroup complex16GEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgecon_(char *norm, integer *n, doublecomplex *a, +/* Subroutine */ void zgecon_(char *norm, integer *n, doublecomplex *a, integer *lda, doublereal *anorm, doublereal *rcond, doublecomplex * work, doublereal *rwork, integer *info) { @@ -649,7 +649,7 @@ f"> */ doublereal scale; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, doublecomplex *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *); doublereal sl; @@ -659,11 +659,11 @@ f"> */ doublereal ainvnm; extern integer izamax_(integer *, doublecomplex *, integer *); logical onenrm; - extern /* Subroutine */ int zdrscl_(integer *, doublereal *, + extern /* Subroutine */ void zdrscl_(integer *, doublereal *, doublecomplex *, integer *); char normin[1]; doublereal smlnum; - extern /* Subroutine */ int zlatrs_(char *, char *, char *, char *, + extern /* Subroutine */ void zlatrs_(char *, char *, char *, char *, integer *, doublecomplex *, integer *, doublecomplex *, doublereal *, doublereal *, integer *); @@ -701,7 +701,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); 
xerbla_("ZGECON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -709,9 +709,9 @@ f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm == 0.) { - return 0; + return; } smlnum = dlamch_("Safe minimum"); @@ -776,7 +776,7 @@ f"> */ } L20: - return 0; + return; /* End of ZGECON */ diff --git a/lapack-netlib/SRC/zgecon.f b/lapack-netlib/SRC/zgecon.f index 3d3127f9df..9cbfe35bcd 100644 --- a/lapack-netlib/SRC/zgecon.f +++ b/lapack-netlib/SRC/zgecon.f @@ -106,6 +106,7 @@ *> INFO is INTEGER *> = 0: successful exit *> < 0: if INFO = -i, the i-th argument had an illegal value +*> =-5: if ANORM is NAN or negative. *> \endverbatim * * Authors: @@ -153,10 +154,10 @@ SUBROUTINE ZGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, RWORK, INTEGER ISAVE( 3 ) * .. * .. External Functions .. - LOGICAL LSAME + LOGICAL LSAME, DISNAN INTEGER IZAMAX DOUBLE PRECISION DLAMCH - EXTERNAL LSAME, IZAMAX, DLAMCH + EXTERNAL LSAME, IZAMAX, DLAMCH, DISNAN * .. * .. External Subroutines .. EXTERNAL XERBLA, ZDRSCL, ZLACN2, ZLATRS @@ -182,7 +183,7 @@ SUBROUTINE ZGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, RWORK, INFO = -2 ELSE IF( LDA.LT.MAX( 1, N ) ) THEN INFO = -4 - ELSE IF( ANORM.LT.ZERO ) THEN + ELSE IF( ANORM.LT.ZERO .OR. 
DISNAN( ANORM ) ) THEN INFO = -5 END IF IF( INFO.NE.0 ) THEN diff --git a/lapack-netlib/SRC/zgeequ.c b/lapack-netlib/SRC/zgeequ.c index 06805f478c..b48f175c66 100644 --- a/lapack-netlib/SRC/zgeequ.c +++ b/lapack-netlib/SRC/zgeequ.c @@ -648,7 +648,7 @@ f"> */ /* > \ingroup complex16GEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgeequ_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgeequ_(integer *m, integer *n, doublecomplex *a, integer *lda, doublereal *r__, doublereal *c__, doublereal *rowcnd, doublereal *colcnd, doublereal *amax, integer *info) { @@ -694,7 +694,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEEQU", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -703,7 +703,7 @@ f"> */ *rowcnd = 1.; *colcnd = 1.; *amax = 0.; - return 0; + return; } /* Get machine constants. */ @@ -759,7 +759,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { if (r__[i__] == 0.) { *info = i__; - return 0; + return; } /* L50: */ } @@ -830,7 +830,7 @@ f"> */ for (j = 1; j <= i__1; ++j) { if (c__[j] == 0.) 
{ *info = *m + j; - return 0; + return; } /* L110: */ } @@ -853,7 +853,7 @@ f"> */ *colcnd = f2cmax(rcmin,smlnum) / f2cmin(rcmax,bignum); } - return 0; + return; /* End of ZGEEQU */ diff --git a/lapack-netlib/SRC/zgeequb.c b/lapack-netlib/SRC/zgeequb.c index d5f1df3cbf..6b35c89941 100644 --- a/lapack-netlib/SRC/zgeequb.c +++ b/lapack-netlib/SRC/zgeequb.c @@ -655,7 +655,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16GEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgeequb_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgeequb_(integer *m, integer *n, doublecomplex *a, integer *lda, doublereal *r__, doublereal *c__, doublereal *rowcnd, doublereal *colcnd, doublereal *amax, integer *info) { @@ -701,7 +701,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEEQUB", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible. */ @@ -710,7 +710,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rowcnd = 1.; *colcnd = 1.; *amax = 0.; - return 0; + return; } /* Get machine constants. Assume SMLNUM is a power of the radix. */ @@ -775,7 +775,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (i__ = 1; i__ <= i__1; ++i__) { if (r__[i__] == 0.) { *info = i__; - return 0; + return; } /* L50: */ } @@ -850,7 +850,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (j = 1; j <= i__1; ++j) { if (c__[j] == 0.) 
{ *info = *m + j; - return 0; + return; } /* L110: */ } @@ -873,7 +873,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *colcnd = f2cmax(rcmin,smlnum) / f2cmin(rcmax,bignum); } - return 0; + return; /* End of ZGEEQUB */ diff --git a/lapack-netlib/SRC/zgees.c b/lapack-netlib/SRC/zgees.c index 8476e3e3fb..5044173297 100644 --- a/lapack-netlib/SRC/zgees.c +++ b/lapack-netlib/SRC/zgees.c @@ -710,7 +710,7 @@ or GE matrices */ /* > \ingroup complex16GEeigen */ /* ===================================================================== */ -/* Subroutine */ int zgees_(char *jobvs, char *sort, L_fp select, integer *n, +/* Subroutine */ void zgees_(char *jobvs, char *sort, L_fp select, integer *n, doublecomplex *a, integer *lda, integer *sdim, doublecomplex *w, doublecomplex *vs, integer *ldvs, doublecomplex *work, integer *lwork, doublereal *rwork, logical *bwork, integer *info) @@ -725,22 +725,23 @@ or GE matrices */ doublereal s; integer icond, ieval; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), dlabad_(doublereal *, doublereal *); logical scalea; extern doublereal dlamch_(char *); doublereal cscale; - extern /* Subroutine */ int zgebak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgebak_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublecomplex *, integer *, integer *), zgebal_(char *, integer *, doublecomplex *, integer *, integer *, integer *, doublereal *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); doublereal bignum; - extern /* 
Subroutine */ int zgehrd_(integer *, integer *, integer *, + extern /* Subroutine */ void zgehrd_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, @@ -748,15 +749,15 @@ or GE matrices */ integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer minwrk, maxwrk; doublereal smlnum; - extern /* Subroutine */ int zhseqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zhseqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); integer hswork; - extern /* Subroutine */ int zunghr_(integer *, integer *, integer *, + extern /* Subroutine */ void zunghr_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); logical wantst, lquery, wantvs; - extern /* Subroutine */ int ztrsen_(char *, char *, logical *, integer *, + extern /* Subroutine */ void ztrsen_(char *, char *, logical *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, integer *); @@ -848,16 +849,16 @@ or GE matrices */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEES ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *sdim = 0; - return 0; + return; } /* Get machine constants */ @@ -975,7 +976,7 @@ or GE matrices */ } work[1].r = (doublereal) maxwrk, work[1].i = 0.; - return 0; + return; /* End of ZGEES */ diff --git a/lapack-netlib/SRC/zgees.f b/lapack-netlib/SRC/zgees.f index 40fe78d345..d673087bfb 100644 --- a/lapack-netlib/SRC/zgees.f +++ b/lapack-netlib/SRC/zgees.f @@ -282,7 +282,7 @@ SUBROUTINE ZGEES( JOBVS, SORT, SELECT, N, A, LDA, SDIM, W, VS, * 
CALL ZHSEQR( 'S', JOBVS, N, 1, N, A, LDA, W, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = DBLE( WORK( 1 ) ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, HSWORK ) diff --git a/lapack-netlib/SRC/zgeesx.c b/lapack-netlib/SRC/zgeesx.c index 72a245610f..a04f05d9fc 100644 --- a/lapack-netlib/SRC/zgeesx.c +++ b/lapack-netlib/SRC/zgeesx.c @@ -751,7 +751,7 @@ f"> */ /* > \ingroup complex16GEeigen */ /* ===================================================================== */ -/* Subroutine */ int zgeesx_(char *jobvs, char *sort, L_fp select, char * +/* Subroutine */ void zgeesx_(char *jobvs, char *sort, L_fp select, char * sense, integer *n, doublecomplex *a, integer *lda, integer *sdim, doublecomplex *w, doublecomplex *vs, integer *ldvs, doublereal * rconde, doublereal *rcondv, doublecomplex *work, integer *lwork, @@ -765,43 +765,44 @@ f"> */ doublereal anrm; integer ierr, itau, iwrk, lwrk, i__, icond, ieval; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), dlabad_(doublereal *, doublereal *); logical scalea; extern doublereal dlamch_(char *); doublereal cscale; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), zgebak_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublecomplex *, integer *, integer *), zgebal_(char *, integer *, doublecomplex *, integer *, integer *, integer *, doublereal *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); extern doublereal zlange_(char *, integer *, integer *, doublecomplex 
*, integer *, doublereal *); doublereal bignum; - extern /* Subroutine */ int zgehrd_(integer *, integer *, integer *, + extern /* Subroutine */ void zgehrd_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *); logical wantsb, wantse; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer minwrk, maxwrk; logical wantsn; doublereal smlnum; - extern /* Subroutine */ int zhseqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zhseqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); integer hswork; - extern /* Subroutine */ int zunghr_(integer *, integer *, integer *, + extern /* Subroutine */ void zunghr_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); logical wantst, lquery, wantsv, wantvs; - extern /* Subroutine */ int ztrsen_(char *, char *, logical *, integer *, + extern /* Subroutine */ void ztrsen_(char *, char *, logical *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, integer *); @@ -910,16 +911,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEESX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *sdim = 0; - return 0; + return; } /* Get machine constants */ @@ -1058,7 +1059,7 @@ f"> */ } work[1].r = (doublereal) maxwrk, work[1].i = 0.; - return 0; + return; /* End of ZGEESX */ diff --git a/lapack-netlib/SRC/zgeesx.f 
b/lapack-netlib/SRC/zgeesx.f index ca4f5c9135..bdd741b113 100644 --- a/lapack-netlib/SRC/zgeesx.f +++ b/lapack-netlib/SRC/zgeesx.f @@ -337,7 +337,7 @@ SUBROUTINE ZGEESX( JOBVS, SORT, SELECT, SENSE, N, A, LDA, SDIM, W, * CALL ZHSEQR( 'S', JOBVS, N, 1, N, A, LDA, W, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = DBLE( WORK( 1 ) ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, HSWORK ) diff --git a/lapack-netlib/SRC/zgeev.c b/lapack-netlib/SRC/zgeev.c index df74979fcb..7034c7c5b9 100644 --- a/lapack-netlib/SRC/zgeev.c +++ b/lapack-netlib/SRC/zgeev.c @@ -694,7 +694,7 @@ ices */ /* > \ingroup complex16GEeigen */ /* ===================================================================== */ -/* Subroutine */ int zgeev_(char *jobvl, char *jobvr, integer *n, +/* Subroutine */ void zgeev_(char *jobvl, char *jobvr, integer *n, doublecomplex *a, integer *lda, doublecomplex *w, doublecomplex *vl, integer *ldvl, doublecomplex *vr, integer *ldvr, doublecomplex *work, integer *lwork, doublereal *rwork, integer *info) @@ -711,13 +711,13 @@ ices */ doublereal anrm; integer ierr, itau, iwrk, nout, i__, k; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), dlabad_(doublereal *, doublereal *); extern doublereal dznrm2_(integer *, doublecomplex *, integer *); logical scalea; extern doublereal dlamch_(char *); doublereal cscale; - extern /* Subroutine */ int zgebak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgebak_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublecomplex *, integer *, integer *), zgebal_(char *, integer *, doublecomplex *, integer *, integer *, integer *, doublereal *, @@ -727,12 +727,12 @@ ices */ extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); logical select[1]; - extern /* Subroutine 
*/ int zdscal_(integer *, doublereal *, + extern /* Subroutine */ void zdscal_(integer *, doublereal *, doublecomplex *, integer *); doublereal bignum; extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); - extern /* Subroutine */ int zgehrd_(integer *, integer *, integer *, + extern /* Subroutine */ void zgehrd_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, @@ -742,14 +742,14 @@ ices */ logical wantvl; doublereal smlnum; integer hswork, irwork; - extern /* Subroutine */ int zhseqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zhseqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), zunghr_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); logical lquery, wantvr; integer ihi; - extern /* Subroutine */ int ztrevc3_(char *, char *, logical *, integer *, + extern /* Subroutine */ void ztrevc3_(char *, char *, logical *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, doublecomplex *, integer *, doublereal *, integer *, integer *); @@ -870,15 +870,15 @@ ices */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEEV ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1115,7 +1115,7 @@ ices */ } work[1].r = (doublereal) maxwrk, work[1].i = 0.; - return 0; + return; /* End of ZGEEV */ diff --git a/lapack-netlib/SRC/zgeevx.c b/lapack-netlib/SRC/zgeevx.c index d63ca45870..874b56e1d6 100644 --- a/lapack-netlib/SRC/zgeevx.c +++ b/lapack-netlib/SRC/zgeevx.c 
@@ -801,7 +801,7 @@ f"> */ /* > \ingroup complex16GEeigen */ /* ===================================================================== */ -/* Subroutine */ int zgeevx_(char *balanc, char *jobvl, char *jobvr, char * +/* Subroutine */ void zgeevx_(char *balanc, char *jobvl, char *jobvr, char * sense, integer *n, doublecomplex *a, integer *lda, doublecomplex *w, doublecomplex *vl, integer *ldvl, doublecomplex *vr, integer *ldvr, integer *ilo, integer *ihi, doublereal *scale, doublereal *abnrm, @@ -819,13 +819,13 @@ f"> */ doublereal anrm; integer ierr, itau, iwrk, nout, i__, k, icond; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), dlabad_(doublereal *, doublereal *); extern doublereal dznrm2_(integer *, doublecomplex *, integer *); logical scalea; extern doublereal dlamch_(char *); doublereal cscale; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), zgebak_(char *, char *, integer *, integer *, integer *, doublereal *, integer *, doublecomplex *, @@ -837,12 +837,12 @@ f"> */ extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); logical select[1]; - extern /* Subroutine */ int zdscal_(integer *, doublereal *, + extern /* Subroutine */ void zdscal_(integer *, doublereal *, doublecomplex *, integer *); doublereal bignum; extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); - extern /* Subroutine */ int zgehrd_(integer *, integer *, integer *, + extern /* Subroutine */ void zgehrd_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), zlascl_(char *, integer *, integer *, doublereal *, 
doublereal *, integer *, integer *, doublecomplex *, @@ -853,11 +853,11 @@ f"> */ integer hswork; logical wntsne; doublereal smlnum; - extern /* Subroutine */ int zhseqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zhseqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); logical lquery, wantvr; - extern /* Subroutine */ int ztrsna_(char *, char *, logical *, integer *, + extern /* Subroutine */ void ztrsna_(char *, char *, logical *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, doublereal *, integer *), zunghr_(integer *, integer *, integer *, @@ -865,7 +865,7 @@ f"> */ integer *, integer *); logical wntsnn, wntsnv; char job[1]; - extern /* Subroutine */ int ztrevc3_(char *, char *, logical *, integer *, + extern /* Subroutine */ void ztrevc3_(char *, char *, logical *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, doublecomplex *, integer *, doublereal *, integer *, integer *); @@ -1023,15 +1023,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEEVX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1291,7 +1291,7 @@ f"> */ } work[1].r = (doublereal) maxwrk, work[1].i = 0.; - return 0; + return; /* End of ZGEEVX */ diff --git a/lapack-netlib/SRC/zgehd2.c b/lapack-netlib/SRC/zgehd2.c index 52005e008a..5ce4b504ff 100644 --- a/lapack-netlib/SRC/zgehd2.c +++ b/lapack-netlib/SRC/zgehd2.c @@ -663,7 +663,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgehd2_(integer *n, integer *ilo, integer *ihi, +/* 
Subroutine */ void zgehd2_(integer *n, integer *ilo, integer *ihi, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex * work, integer *info) { @@ -674,9 +674,11 @@ f"> */ /* Local variables */ integer i__; doublecomplex alpha; - extern /* Subroutine */ int zlarf_(char *, integer *, integer *, + extern /* Subroutine */ void zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *), xerbla_(char *, integer *, ftnlen), zlarfg_(integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); @@ -712,7 +714,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEHD2", &i__1, (ftnlen)6); - return 0; + return; } i__1 = *ihi - 1; @@ -749,7 +751,7 @@ f"> */ /* L10: */ } - return 0; + return; /* End of ZGEHD2 */ diff --git a/lapack-netlib/SRC/zgehrd.c b/lapack-netlib/SRC/zgehrd.c index 751f5597f9..bb1bdc0db7 100644 --- a/lapack-netlib/SRC/zgehrd.c +++ b/lapack-netlib/SRC/zgehrd.c @@ -685,7 +685,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgehrd_(integer *n, integer *ilo, integer *ihi, +/* Subroutine */ void zgehrd_(integer *n, integer *ilo, integer *ihi, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex * work, integer *lwork, integer *info) { @@ -695,7 +695,7 @@ f"> */ /* Local variables */ integer i__, j, nbmin, iinfo; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), ztrmm_(char *, char *, char *, char *, @@ -713,7 +713,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); 
extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void zlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); @@ -770,9 +770,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEHRD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Set elements 1:ILO-1 and IHI:N-1 of TAU to zero */ @@ -795,7 +795,7 @@ f"> */ nh = *ihi - *ilo + 1; if (nh <= 1) { work[1].r = 1., work[1].i = 0.; - return 0; + return; } /* Determine the block size */ @@ -912,7 +912,7 @@ f"> */ zgehd2_(n, &i__, ihi, &a[a_offset], lda, &tau[1], &work[1], &iinfo); work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZGEHRD */ diff --git a/lapack-netlib/SRC/zgejsv.c b/lapack-netlib/SRC/zgejsv.c index 21549d4005..7158ff8d71 100644 --- a/lapack-netlib/SRC/zgejsv.c +++ b/lapack-netlib/SRC/zgejsv.c @@ -1087,7 +1087,7 @@ f"> */ /* > drmac@math.hr. Thank you. 
*/ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgejsv_(char *joba, char *jobu, char *jobv, char *jobr, +/* Subroutine */ void zgejsv_(char *joba, char *jobu, char *jobv, char *jobr, char *jobt, char *jobp, integer *m, integer *n, doublecomplex *a, integer *lda, doublereal *sva, doublecomplex *u, integer *ldu, doublecomplex *v, integer *ldv, doublecomplex *cwork, integer *lwork, @@ -1109,7 +1109,7 @@ f"> */ doublereal temp1; integer lwunmqrm, lwqp3, p, q; logical jracc; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); integer lwrk_zgesvju__, lwrk_zgesvjv__; extern logical lsame_(char *, char *); @@ -1122,23 +1122,23 @@ f"> */ doublereal epsln; logical rsvec; integer lwcon, lwlqf, lwqrf, n1; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); logical l2aber; - extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal condr1, condr2, uscal1, uscal2; logical l2kill, l2rank, l2tran, l2pert; - extern /* Subroutine */ int zgeqp3_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeqp3_(integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, doublecomplex *, integer * , doublereal *, integer *); extern doublereal dznrm2_(integer *, doublecomplex *, integer *); integer lrwqp3; extern doublereal dlamch_(char *); integer nr; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, 
doublereal *, integer *, integer *, doublereal *, integer *, integer *); extern integer idamax_(integer *, doublereal *, integer *); @@ -1147,47 +1147,47 @@ f"> */ doublereal aatmin, aatmax; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical noscal; - extern /* Subroutine */ int zdscal_(integer *, doublereal *, + extern /* Subroutine */ void zdscal_(integer *, doublereal *, doublecomplex *, integer *), zlacgv_(integer *, doublecomplex *, integer *), dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *); extern integer izamax_(integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zgelqf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgelqf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *); doublereal entrat; logical almort; doublecomplex cdummy[1]; - extern /* Subroutine */ int zgeqrf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeqrf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ); doublereal maxprj; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); logical errest; integer lrwcon; - extern /* Subroutine */ int zlapmr_(logical *, integer *, integer *, + extern /* Subroutine */ void zlapmr_(logical *, integer *, integer *, doublecomplex *, integer *, integer *); logical transp; integer minwrk, lwsvdj; - extern /* Subroutine */ int zpocon_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zpocon_(char *, integer *, doublecomplex *, integer *, doublereal *, 
doublereal *, doublecomplex *, doublereal *, integer *), zgesvj_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublereal *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, integer *, integer *); doublereal rdummy[1]; - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); logical lquery; extern /* Subroutine */ int zlaswp_(integer *, doublecomplex *, integer *, integer *, integer *, integer *, integer *); logical rowpiv; integer optwrk; - extern /* Subroutine */ int zungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void zungqr_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), zunmlq_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, @@ -1792,13 +1792,13 @@ f"> */ /* #:( */ i__1 = -(*info); xerbla_("ZGEJSV", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { cwork[1].r = (doublereal) optwrk, cwork[1].i = 0.; cwork[2].r = (doublereal) minwrk, cwork[2].i = 0.; rwork[1] = (doublereal) minrwrk; iwork[1] = f2cmax(4,miniwrk); - return 0; + return; } /* Quick return for void matrix (Y3K safe) */ @@ -1815,7 +1815,7 @@ f"> */ rwork[5] = 0.; rwork[6] = 0.; rwork[7] = 0.; - return 0; + return; } /* Determine whether the matrix U should be M x N or M x M */ @@ -1855,7 +1855,7 @@ f"> */ *info = -9; i__2 = -(*info); xerbla_("ZGEJSV", &i__2, (ftnlen)6); - return 0; + return; } aaqq = sqrt(aaqq); if (aapp < big / aaqq && noscal) { @@ -1917,7 +1917,7 @@ f"> */ iwork[2] = 0; iwork[3] = 0; iwork[4] = -1; - return 0; + return; } /* Issue warning if denormalized column norms detected. 
Override the */ @@ -1984,7 +1984,7 @@ f"> */ rwork[6] = 0.; rwork[7] = 0.; } - return 0; + return; } @@ -3554,6 +3554,6 @@ f"> */ iwork[4] = -1; } - return 0; + return; } /* zgejsv_ */ diff --git a/lapack-netlib/SRC/zgejsv.f b/lapack-netlib/SRC/zgejsv.f index 0c2226f9f0..5134ea8912 100644 --- a/lapack-netlib/SRC/zgejsv.f +++ b/lapack-netlib/SRC/zgejsv.f @@ -304,7 +304,7 @@ *> -> the minimal requirement is LWORK >= 3*N. *> -> For optimal performance, *> LWORK >= max(N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB, -*> where NB is the optimal block size for ZGEQP3, ZGEQRF, ZGELQ, +*> where NB is the optimal block size for ZGEQP3, ZGEQRF, ZGELQF, *> ZUNMLQ. In general, the optimal length LWORK is computed as *> LWORK >= max(N+LWORK(ZGEQP3), N+LWORK(ZGESVJ), *> N+LWORK(ZGELQF), 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMLQ)). @@ -313,7 +313,7 @@ *> -> the minimal requirement is LWORK >= 3*N. *> -> For optimal performance, *> LWORK >= max(N+(N+1)*NB, 2*N,2*N+N*NB)=2*N+N*NB, -*> where NB is the optimal block size for ZGEQP3, ZGEQRF, ZGELQ, +*> where NB is the optimal block size for ZGEQP3, ZGEQRF, ZGELQF, *> ZUNMLQ. In general, the optimal length LWORK is computed as *> LWORK >= max(N+LWORK(ZGEQP3), LWORK(ZPOCON), N+LWORK(ZGESVJ), *> N+LWORK(ZGELQF), 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMLQ)). 
@@ -349,7 +349,7 @@ *> *> \param[out] RWORK *> \verbatim -*> RWORK is DOUBLE PRECISION array, dimension (MAX(7,LWORK)) +*> RWORK is DOUBLE PRECISION array, dimension (MAX(7,LRWORK)) *> On exit, *> RWORK(1) = Determines the scaling factor SCALE = RWORK(2) / RWORK(1) *> such that SCALE*SVA(1:N) are the computed singular values @@ -707,11 +707,11 @@ SUBROUTINE ZGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, IF ( LQUERY ) THEN CALL ZGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1, $ RDUMMY, IERR ) - LWRK_ZGEQP3 = DBLE( CDUMMY(1) ) + LWRK_ZGEQP3 = INT( CDUMMY(1) ) CALL ZGEQRF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) - LWRK_ZGEQRF = DBLE( CDUMMY(1) ) + LWRK_ZGEQRF = INT( CDUMMY(1) ) CALL ZGELQF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) - LWRK_ZGELQF = DBLE( CDUMMY(1) ) + LWRK_ZGELQF = INT( CDUMMY(1) ) END IF MINWRK = 2 OPTWRK = 2 @@ -727,7 +727,7 @@ SUBROUTINE ZGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, IF ( LQUERY ) THEN CALL ZGESVJ( 'L', 'N', 'N', N, N, A, LDA, SVA, N, V, $ LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJ = DBLE( CDUMMY(1) ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_ZGEQP3, N**2+LWCON, $ N+LWRK_ZGEQRF, LWRK_ZGESVJ ) @@ -763,10 +763,10 @@ SUBROUTINE ZGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, IF ( LQUERY ) THEN CALL ZGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJ = DBLE( CDUMMY(1) ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) CALL ZUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_ZUNMLQ = DBLE( CDUMMY(1) ) + LWRK_ZUNMLQ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_ZGEQP3, LWCON, LWRK_ZGESVJ, $ N+LWRK_ZGELQF, 2*N+LWRK_ZGEQRF, @@ -802,10 +802,10 @@ SUBROUTINE ZGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, IF ( LQUERY ) THEN CALL ZGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJ = DBLE( CDUMMY(1) ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR 
) - LWRK_ZUNMQRM = DBLE( CDUMMY(1) ) + LWRK_ZUNMQRM = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = N + MAX( LWRK_ZGEQP3, LWCON, N+LWRK_ZGEQRF, $ LWRK_ZGESVJ, LWRK_ZUNMQRM ) @@ -864,26 +864,26 @@ SUBROUTINE ZGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, IF ( LQUERY ) THEN CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_ZUNMQRM = DBLE( CDUMMY(1) ) + LWRK_ZUNMQRM = INT( CDUMMY(1) ) CALL ZUNMQR( 'L', 'N', N, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_ZUNMQR = DBLE( CDUMMY(1) ) + LWRK_ZUNMQR = INT( CDUMMY(1) ) IF ( .NOT. JRACC ) THEN CALL ZGEQP3( N,N, A, LDA, IWORK, CDUMMY,CDUMMY, -1, $ RDUMMY, IERR ) - LWRK_ZGEQP3N = DBLE( CDUMMY(1) ) + LWRK_ZGEQP3N = INT( CDUMMY(1) ) CALL ZGESVJ( 'L', 'U', 'N', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJ = DBLE( CDUMMY(1) ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) CALL ZGESVJ( 'U', 'U', 'N', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJU = DBLE( CDUMMY(1) ) + LWRK_ZGESVJU = INT( CDUMMY(1) ) CALL ZGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJV = DBLE( CDUMMY(1) ) + LWRK_ZGESVJV = INT( CDUMMY(1) ) CALL ZUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_ZUNMLQ = DBLE( CDUMMY(1) ) + LWRK_ZUNMLQ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWCON, $ 2*N+N**2+LWCON, 2*N+LWRK_ZGEQRF, @@ -912,13 +912,13 @@ SUBROUTINE ZGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP, ELSE CALL ZGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJV = DBLE( CDUMMY(1) ) + LWRK_ZGESVJV = INT( CDUMMY(1) ) CALL ZUNMQR( 'L', 'N', N, N, N, CDUMMY, N, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_ZUNMQR = DBLE( CDUMMY(1) ) + LWRK_ZUNMQR = INT( CDUMMY(1) ) CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_ZUNMQRM = DBLE( CDUMMY(1) ) + LWRK_ZUNMQRM = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = 
MAX( N+LWRK_ZGEQP3, N+LWCON, $ 2*N+LWRK_ZGEQRF, 2*N+N**2, diff --git a/lapack-netlib/SRC/zgelq.c b/lapack-netlib/SRC/zgelq.c index 6bab268694..f876c6ed0d 100644 --- a/lapack-netlib/SRC/zgelq.c +++ b/lapack-netlib/SRC/zgelq.c @@ -681,7 +681,7 @@ static integer c__2 = 2; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgelq_(integer *m, integer *n, doublecomplex *a, integer +/* Subroutine */ void zgelq_(integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *t, integer *tsize, doublecomplex *work, integer * lwork, integer *info) { @@ -694,12 +694,12 @@ static integer c__2 = 2; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zgelqt_(integer *, integer *, integer *, + extern /* Subroutine */ void zgelqt_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); logical lminws, lquery; integer mintsz; - extern /* Subroutine */ int zlaswlq_(integer *, integer *, integer *, + extern /* Subroutine */ void zlaswlq_(integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -840,15 +840,15 @@ static integer c__2 = 2; if (*info != 0) { i__1 = -(*info); xerbla_("ZGELQ", &i__1, (ftnlen)5); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* The LQ Decomposition */ @@ -862,7 +862,7 @@ static integer c__2 = 2; work[1].r = (doublereal) lwreq, work[1].i = 0.; - return 0; + return; /* End of ZGELQ */ diff --git a/lapack-netlib/SRC/zgelq2.c b/lapack-netlib/SRC/zgelq2.c index ec413aaed5..b9dc8893bb 100644 --- a/lapack-netlib/SRC/zgelq2.c +++ b/lapack-netlib/SRC/zgelq2.c @@ -639,7 +639,7 @@ f"> */ /* 
> \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgelq2_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgelq2_(integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *work, integer *info) { /* System generated locals */ @@ -648,9 +648,11 @@ f"> */ /* Local variables */ integer i__, k; doublecomplex alpha; - extern /* Subroutine */ int zlarf_(char *, integer *, integer *, + extern /* Subroutine */ void zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *), xerbla_(char *, integer *, ftnlen), zlarfg_(integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *), zlacgv_(integer *, doublecomplex *, integer *); @@ -685,7 +687,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGELQ2", &i__1, (ftnlen)6); - return 0; + return; } k = f2cmin(*m,*n); @@ -721,7 +723,7 @@ f"> */ zlacgv_(&i__2, &a[i__ + i__ * a_dim1], lda); /* L10: */ } - return 0; + return; /* End of ZGELQ2 */ diff --git a/lapack-netlib/SRC/zgelqf.c b/lapack-netlib/SRC/zgelqf.c index a850007d6e..4373e46168 100644 --- a/lapack-netlib/SRC/zgelqf.c +++ b/lapack-netlib/SRC/zgelqf.c @@ -659,7 +659,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgelqf_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgelqf_(integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *work, integer *lwork, integer *info) { @@ -668,18 +668,18 @@ f"> */ /* Local variables */ integer i__, k, nbmin, iinfo; - extern /* Subroutine */ int zgelq2_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgelq2_(integer *, integer 
*, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer ib, nb, nx; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void zlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer ldwork; - extern /* Subroutine */ int zlarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zlarft_(char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer lwkopt; @@ -724,9 +724,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGELQF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -734,7 +734,7 @@ f"> */ k = f2cmin(*m,*n); if (k == 0) { work[1].r = 1., work[1].i = 0.; - return 0; + return; } nbmin = 2; @@ -819,7 +819,7 @@ f"> */ } work[1].r = (doublereal) iws, work[1].i = 0.; - return 0; + return; /* End of ZGELQF */ diff --git a/lapack-netlib/SRC/zgelqt.c b/lapack-netlib/SRC/zgelqt.c index 8af0c94f6b..e9bc1f8559 100644 --- a/lapack-netlib/SRC/zgelqt.c +++ b/lapack-netlib/SRC/zgelqt.c @@ -648,7 +648,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgelqt_(integer *m, integer *n, integer *mb, +/* Subroutine */ void zgelqt_(integer *m, integer *n, integer *mb, doublecomplex *a, integer *lda, doublecomplex *t, integer *ldt, doublecomplex *work, integer *info) { @@ -657,7 +657,8 @@ f"> */ /* Local variables */ integer i__, k, iinfo, ib; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlarfb_( + extern /* Subroutine */ int xerbla_(char *, 
integer *, ftnlen); + extern void zlarfb_( char *, char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zgelqt3_(integer *, integer *, @@ -701,14 +702,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGELQT", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ k = f2cmin(*m,*n); if (k == 0) { - return 0; + return; } /* Blocked loop of length K */ @@ -737,7 +738,7 @@ f"> */ i__ * a_dim1], lda, &work[1], &i__5); } } - return 0; + return; /* End of ZGELQT */ diff --git a/lapack-netlib/SRC/zgelqt3.c b/lapack-netlib/SRC/zgelqt3.c index 6b5b95ee3c..6732698c17 100644 --- a/lapack-netlib/SRC/zgelqt3.c +++ b/lapack-netlib/SRC/zgelqt3.c @@ -645,7 +645,7 @@ ompact WY representation of Q. */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgelqt3_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgelqt3_(integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *t, integer *ldt, integer *info) { /* System generated locals */ @@ -654,15 +654,16 @@ ompact WY representation of Q. 
*/ /* Local variables */ integer i__, j, iinfo; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer i1, j1, m1, m2; - extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrmm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *), - xerbla_(char *, integer *, ftnlen), zlarfg_(integer *, + doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); @@ -697,7 +698,7 @@ ompact WY representation of Q. */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGELQT3", &i__1, (ftnlen)7); - return 0; + return; } if (*m == 1) { @@ -811,7 +812,7 @@ ompact WY representation of Q. 
*/ } - return 0; + return; /* End of ZGELQT3 */ diff --git a/lapack-netlib/SRC/zgels.c b/lapack-netlib/SRC/zgels.c index c8723aefab..fdc1da60ae 100644 --- a/lapack-netlib/SRC/zgels.c +++ b/lapack-netlib/SRC/zgels.c @@ -697,7 +697,7 @@ static integer c__0 = 0; /* > \ingroup complex16GEsolve */ /* ===================================================================== */ -/* Subroutine */ int zgels_(char *trans, integer *m, integer *n, integer * +/* Subroutine */ void zgels_(char *trans, integer *m, integer *n, integer * nrhs, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *work, integer *lwork, integer *info) { @@ -713,7 +713,7 @@ static integer c__0 = 0; extern logical lsame_(char *, char *); integer wsize; doublereal rwork[1]; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); integer nb; extern doublereal dlamch_(char *); integer mn; @@ -724,7 +724,7 @@ static integer c__0 = 0; doublereal bignum; extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); - extern /* Subroutine */ int zgelqf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgelqf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *), zgeqrf_(integer *, integer *, doublecomplex *, integer *, @@ -733,11 +733,12 @@ static integer c__0 = 0; doublecomplex *, integer *); doublereal smlnum; logical lquery; - extern /* Subroutine */ int zunmlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunmlq_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, 
integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *, integer *), ztrtrs_(char *, char *, char *, integer *, + doublecomplex *, integer *, doublecomplex *, integer *, integer *); + extern int ztrtrs_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -840,9 +841,9 @@ static integer c__0 = 0; if (*info != 0) { i__1 = -(*info); xerbla_("ZGELS ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -852,7 +853,7 @@ static integer c__0 = 0; if (f2cmin(i__1,*nrhs) == 0) { i__1 = f2cmax(*m,*n); zlaset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); - return 0; + return; } /* Get machine parameters */ @@ -939,7 +940,7 @@ static integer c__0 = 0; , lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } scllen = *n; @@ -954,7 +955,7 @@ static integer c__0 = 0; a_offset], lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } /* B(N+1:M,1:NRHS) = ZERO */ @@ -1002,7 +1003,7 @@ static integer c__0 = 0; , lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } /* B(M+1:N,1:NRHS) = 0 */ @@ -1047,7 +1048,7 @@ static integer c__0 = 0; a_offset], lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } scllen = *m; @@ -1077,7 +1078,7 @@ static integer c__0 = 0; d__1 = (doublereal) wsize; work[1].r = d__1, work[1].i = 0.; - return 0; + return; /* End of ZGELS */ diff --git a/lapack-netlib/SRC/zgelsd.c b/lapack-netlib/SRC/zgelsd.c index cefb148109..f101cb3f0c 100644 --- a/lapack-netlib/SRC/zgelsd.c +++ b/lapack-netlib/SRC/zgelsd.c @@ -744,7 +744,7 @@ f"> */ /* > Osni Marques, LBNL/NERSC, USA \n */ /* ===================================================================== */ -/* Subroutine */ int zgelsd_(integer *m, integer *n, integer *nrhs, +/* Subroutine */ void zgelsd_(integer *m, integer *n, integer *nrhs, doublecomplex *a, integer *lda, 
doublecomplex *b, integer *ldb, doublereal *s, doublereal *rcond, integer *rank, doublecomplex *work, integer *lwork, doublereal *rwork, integer *iwork, integer *info) @@ -757,15 +757,16 @@ f"> */ integer itau, nlvl, iascl, ibscl; doublereal sfmin; integer minmn, maxmn, itaup, itauq, mnthr, nwork; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); integer ie, il; extern doublereal dlamch_(char *); integer mm; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dlaset_(char *, integer *, integer - *, doublereal *, doublereal *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen), zgebrd_(integer *, integer *, + *, doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zgebrd_(integer *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, integer *); @@ -774,7 +775,7 @@ f"> */ extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); doublereal bignum; - extern /* Subroutine */ int zgelqf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgelqf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ), zlalsd_(char *, integer *, integer *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublereal *, integer *, @@ -783,20 +784,20 @@ f"> */ integer *, integer *, doublecomplex *, integer *, integer *), zgeqrf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); integer ldwork; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char 
*, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); integer liwork, minwrk, maxwrk; doublereal smlnum; - extern /* Subroutine */ int zunmbr_(char *, char *, char *, integer *, + extern /* Subroutine */ void zunmbr_(char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer * ); integer lrwork; logical lquery; integer nrwork, smlsiz; - extern /* Subroutine */ int zunmlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunmlq_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, @@ -1002,16 +1003,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGELSD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible. */ if (*m == 0 || *n == 0) { *rank = 0; - return 0; + return; } /* Get machine parameters. 
*/ @@ -1307,7 +1308,7 @@ f"> */ work[1].r = (doublereal) maxwrk, work[1].i = 0.; iwork[1] = liwork; rwork[1] = (doublereal) lrwork; - return 0; + return; /* End of ZGELSD */ diff --git a/lapack-netlib/SRC/zgelss.c b/lapack-netlib/SRC/zgelss.c index c13d2a0e55..573963478a 100644 --- a/lapack-netlib/SRC/zgelss.c +++ b/lapack-netlib/SRC/zgelss.c @@ -696,7 +696,7 @@ f"> */ /* > \ingroup complex16GEsolve */ /* ===================================================================== */ -/* Subroutine */ int zgelss_(integer *m, integer *n, integer *nrhs, +/* Subroutine */ void zgelss_(integer *m, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublereal *s, doublereal *rcond, integer *rank, doublecomplex *work, integer *lwork, doublereal *rwork, integer *info) @@ -712,25 +712,26 @@ f"> */ chunk, lwork_zunmqr__; doublereal sfmin; integer minmn; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer maxmn, itaup, itauq, mnthr; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer iwork; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), dlabad_(doublereal *, doublereal *); integer bl, ie, il; extern doublereal dlamch_(char *); integer mm; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dlaset_(char 
*, integer *, integer - *, doublereal *, doublereal *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen), zgebrd_(integer *, integer *, + *, doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zgebrd_(integer *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, integer *); @@ -739,14 +740,14 @@ f"> */ extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); doublereal bignum; - extern /* Subroutine */ int zgelqf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgelqf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *), zgeqrf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), zdrscl_( integer *, doublereal *, doublecomplex *, integer *); integer ldwork; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *), zbdsqr_( @@ -754,17 +755,17 @@ f"> */ doublereal *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, integer *); integer minwrk, maxwrk; - extern /* Subroutine */ int zungbr_(char *, integer *, integer *, integer + extern /* Subroutine */ void zungbr_(char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); doublereal smlnum; integer irwork; - extern /* Subroutine */ int zunmbr_(char *, char *, char *, integer *, + extern /* Subroutine */ void zunmbr_(char 
*, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer * ); logical lquery; - extern /* Subroutine */ int zunmlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunmlq_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, @@ -969,16 +970,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGELSS", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { *rank = 0; - return 0; + return; } /* Get machine parameters */ @@ -1437,7 +1438,7 @@ f"> */ } L70: work[1].r = (doublereal) maxwrk, work[1].i = 0.; - return 0; + return; /* End of ZGELSS */ diff --git a/lapack-netlib/SRC/zgelss.f b/lapack-netlib/SRC/zgelss.f index e4aba64970..be53ba95b1 100644 --- a/lapack-netlib/SRC/zgelss.f +++ b/lapack-netlib/SRC/zgelss.f @@ -266,11 +266,11 @@ SUBROUTINE ZGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, * * Compute space needed for ZGEQRF CALL ZGEQRF( M, N, A, LDA, DUM(1), DUM(1), -1, INFO ) - LWORK_ZGEQRF = DBLE( DUM(1) ) + LWORK_ZGEQRF = INT( DUM(1) ) * Compute space needed for ZUNMQR CALL ZUNMQR( 'L', 'C', M, NRHS, N, A, LDA, DUM(1), B, $ LDB, DUM(1), -1, INFO ) - LWORK_ZUNMQR = DBLE( DUM(1) ) + LWORK_ZUNMQR = INT( DUM(1) ) MM = N MAXWRK = MAX( MAXWRK, N + N*ILAENV( 1, 'ZGEQRF', ' ', M, $ N, -1, -1 ) ) @@ -284,15 +284,15 @@ SUBROUTINE ZGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, * Compute space needed for ZGEBRD CALL ZGEBRD( MM, N, A, LDA, S, S, DUM(1), DUM(1), DUM(1), $ -1, INFO ) - LWORK_ZGEBRD = DBLE( DUM(1) ) + LWORK_ZGEBRD = INT( DUM(1) ) * Compute space needed for ZUNMBR CALL ZUNMBR( 'Q', 'L', 'C', MM, NRHS, N, A, LDA, DUM(1), $ B, LDB, 
DUM(1), -1, INFO ) - LWORK_ZUNMBR = DBLE( DUM(1) ) + LWORK_ZUNMBR = INT( DUM(1) ) * Compute space needed for ZUNGBR CALL ZUNGBR( 'P', N, N, N, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_ZUNGBR = DBLE( DUM(1) ) + LWORK_ZUNGBR = INT( DUM(1) ) * Compute total workspace needed MAXWRK = MAX( MAXWRK, 2*N + LWORK_ZGEBRD ) MAXWRK = MAX( MAXWRK, 2*N + LWORK_ZUNMBR ) @@ -310,23 +310,23 @@ SUBROUTINE ZGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, * Compute space needed for ZGELQF CALL ZGELQF( M, N, A, LDA, DUM(1), DUM(1), $ -1, INFO ) - LWORK_ZGELQF = DBLE( DUM(1) ) + LWORK_ZGELQF = INT( DUM(1) ) * Compute space needed for ZGEBRD CALL ZGEBRD( M, M, A, LDA, S, S, DUM(1), DUM(1), $ DUM(1), -1, INFO ) - LWORK_ZGEBRD = DBLE( DUM(1) ) + LWORK_ZGEBRD = INT( DUM(1) ) * Compute space needed for ZUNMBR CALL ZUNMBR( 'Q', 'L', 'C', M, NRHS, N, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_ZUNMBR = DBLE( DUM(1) ) + LWORK_ZUNMBR = INT( DUM(1) ) * Compute space needed for ZUNGBR CALL ZUNGBR( 'P', M, M, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_ZUNGBR = DBLE( DUM(1) ) + LWORK_ZUNGBR = INT( DUM(1) ) * Compute space needed for ZUNMLQ CALL ZUNMLQ( 'L', 'C', N, NRHS, M, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_ZUNMLQ = DBLE( DUM(1) ) + LWORK_ZUNMLQ = INT( DUM(1) ) * Compute total workspace needed MAXWRK = M + LWORK_ZGELQF MAXWRK = MAX( MAXWRK, 3*M + M*M + LWORK_ZGEBRD ) @@ -345,15 +345,15 @@ SUBROUTINE ZGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK, * Compute space needed for ZGEBRD CALL ZGEBRD( M, N, A, LDA, S, S, DUM(1), DUM(1), $ DUM(1), -1, INFO ) - LWORK_ZGEBRD = DBLE( DUM(1) ) + LWORK_ZGEBRD = INT( DUM(1) ) * Compute space needed for ZUNMBR CALL ZUNMBR( 'Q', 'L', 'C', M, NRHS, M, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_ZUNMBR = DBLE( DUM(1) ) + LWORK_ZUNMBR = INT( DUM(1) ) * Compute space needed for ZUNGBR CALL ZUNGBR( 'P', M, N, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_ZUNGBR = DBLE( DUM(1) ) + LWORK_ZUNGBR = INT( DUM(1) ) MAXWRK = 
2*M + LWORK_ZGEBRD MAXWRK = MAX( MAXWRK, 2*M + LWORK_ZUNMBR ) MAXWRK = MAX( MAXWRK, 2*M + LWORK_ZUNGBR ) diff --git a/lapack-netlib/SRC/zgelst.c b/lapack-netlib/SRC/zgelst.c new file mode 100644 index 0000000000..88e0dcc9cc --- /dev/null +++ b/lapack-netlib/SRC/zgelst.c @@ -0,0 +1,1116 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff 
*/ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + 
+static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 
1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief ZGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factori +zation with compact WY representation of Q. */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download ZGELST + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE ZGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* CHARACTER TRANS */ +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* COMPLEX*16 A( LDA, * ), B( LDB, * ), WORK( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > ZGELST solves overdetermined or underdetermined real linear systems */ +/* > involving an M-by-N matrix A, or its conjugate-transpose, using a QR */ +/* > or LQ factorization of A with compact WY representation of Q. */ +/* > It is assumed that A has full rank. */ +/* > */ +/* > The following options are provided: */ +/* > */ +/* > 1. 
If TRANS = 'N' and m >= n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A*X ||. */ +/* > */ +/* > 2. If TRANS = 'N' and m < n: find the minimum norm solution of */ +/* > an underdetermined system A * X = B. */ +/* > */ +/* > 3. If TRANS = 'C' and m >= n: find the minimum norm solution of */ +/* > an underdetermined system A**T * X = B. */ +/* > */ +/* > 4. If TRANS = 'C' and m < n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A**T * X ||. */ +/* > */ +/* > Several right hand side vectors b and solution vectors x can be */ +/* > handled in a single call; they are stored as the columns of the */ +/* > M-by-NRHS right hand side matrix B and the N-by-NRHS solution */ +/* > matrix X. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > = 'N': the linear system involves A; */ +/* > = 'C': the linear system involves A**H. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of right hand sides, i.e., the number of */ +/* > columns of the matrices B and X. NRHS >=0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] A */ +/* > \verbatim */ +/* > A is COMPLEX*16 array, dimension (LDA,N) */ +/* > On entry, the M-by-N matrix A. */ +/* > On exit, */ +/* > if M >= N, A is overwritten by details of its QR */ +/* > factorization as returned by ZGEQRT; */ +/* > if M < N, A is overwritten by details of its LQ */ +/* > factorization as returned by ZGELQT. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is COMPLEX*16 array, dimension (LDB,NRHS) */ +/* > On entry, the matrix B of right hand side vectors, stored */ +/* > columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS */ +/* > if TRANS = 'C'. */ +/* > On exit, if INFO = 0, B is overwritten by the solution */ +/* > vectors, stored columnwise: */ +/* > if TRANS = 'N' and m >= n, rows 1 to n of B contain the least */ +/* > squares solution vectors; the residual sum of squares for the */ +/* > solution in each column is given by the sum of squares of */ +/* > modulus of elements N+1 to M in that column; */ +/* > if TRANS = 'N' and m < n, rows 1 to N of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'C' and m >= n, rows 1 to M of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'C' and m < n, rows 1 to M of B contain the */ +/* > least squares solution vectors; the residual sum of squares */ +/* > for the solution in each column is given by the sum of */ +/* > squares of the modulus of elements M+1 to N in that column. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= MAX(1,M,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The dimension of the array WORK. */ +/* > LWORK >= f2cmax( 1, MN + f2cmax( MN, NRHS ) ). */ +/* > For optimal performance, */ +/* > LWORK >= f2cmax( 1, (MN + f2cmax( MN, NRHS ))*NB ). 
*/ +/* > where MN = f2cmin(M,N) and NB is the optimum block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal size of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > > 0: if INFO = i, the i-th diagonal element of the */ +/* > triangular factor of A is zero, so that A does not have */ +/* > full rank; the least squares solution could not be */ +/* > computed. */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup complex16GEsolve */ + +/* > \par Contributors: */ +/* ================== */ +/* > */ +/* > \verbatim */ +/* > */ +/* > November 2022, Igor Kozachenko, */ +/* > Computer Science Division, */ +/* > University of California, Berkeley */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* Subroutine */ void zgelst_(char *trans, integer *m, integer *n, integer * + nrhs, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, + doublecomplex *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3; + doublereal d__1; + + /* Local variables */ + doublereal anrm, bnrm; + integer brow; + logical tpsd; + integer i__, j, iascl, ibscl; + extern logical lsame_(char *, char *); + integer nbmin; + doublereal rwork[1]; + integer lwopt; + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); + integer nb; + extern doublereal dlamch_(char *); + integer mn; + extern /* Subroutine */ 
int xerbla_(char *, integer *, ftnlen); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + integer scllen; + doublereal bignum; + extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, + integer *, doublereal *); + extern /* Subroutine */ void zlascl_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, integer *, doublecomplex *, + integer *, integer *), zlaset_(char *, integer *, + integer *, doublecomplex *, doublecomplex *, doublecomplex *, + integer *); + integer mnnrhs; + extern /* Subroutine */ void zgelqt_(integer *, integer *, integer *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *); + doublereal smlnum; + extern /* Subroutine */ void zgeqrt_(integer *, integer *, integer *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *); + logical lquery; + extern /* Subroutine */ int ztrtrs_(char *, char *, char *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + integer *); + extern void zgemlqt_(char *, char *, + integer *, integer *, integer *, integer *, doublecomplex *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *), zgemqrt_(char *, + char *, integer *, integer *, integer *, integer *, doublecomplex + *, integer *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, integer *); + + +/* -- LAPACK driver routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. 
*/ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + mn = f2cmin(*m,*n); + lquery = *lwork == -1; + if (! (lsame_(trans, "N") || lsame_(trans, "C"))) { + *info = -1; + } else if (*m < 0) { + *info = -2; + } else if (*n < 0) { + *info = -3; + } else if (*nrhs < 0) { + *info = -4; + } else if (*lda < f2cmax(1,*m)) { + *info = -6; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = f2cmax(1,*m); + if (*ldb < f2cmax(i__1,*n)) { + *info = -8; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = 1, i__2 = mn + f2cmax(mn,*nrhs); + if (*lwork < f2cmax(i__1,i__2) && ! lquery) { + *info = -10; + } + } + } + +/* Figure out optimal block size and optimal workspace size */ + + if (*info == 0 || *info == -10) { + + tpsd = TRUE_; + if (lsame_(trans, "N")) { + tpsd = FALSE_; + } + + nb = ilaenv_(&c__1, "ZGELST", " ", m, n, &c_n1, &c_n1, (ftnlen)6, ( + ftnlen)1); + + mnnrhs = f2cmax(mn,*nrhs); +/* Computing MAX */ + i__1 = 1, i__2 = (mn + mnnrhs) * nb; + lwopt = f2cmax(i__1,i__2); + d__1 = (doublereal) lwopt; + work[1].r = d__1, work[1].i = 0.; + + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZGELST ", &i__1, 6); + return; + } else if (lquery) { + return; + } + +/* Quick return if possible */ + +/* Computing MIN */ + i__1 = f2cmin(*m,*n); + if (f2cmin(i__1,*nrhs) == 0) { + i__1 = f2cmax(*m,*n); + zlaset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); + d__1 = (doublereal) lwopt; + work[1].r = d__1, work[1].i = 0.; + return; + } + +/* *GEQRT and *GELQT routines cannot accept NB larger than f2cmin(M,N) */ + + if (nb > mn) { + nb = mn; + } + +/* Determine the block size from the supplied LWORK */ +/* ( at this stage we know that LWORK >= (minimum required workspace, */ +/* but it may be less than optimal) */ + +/* Computing MIN */ + i__1 = nb, i__2 = *lwork / (mn + mnnrhs); 
+ nb = f2cmin(i__1,i__2); + +/* The minimum value of NB, when blocked code is used */ + +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "ZGELST", " ", m, n, &c_n1, &c_n1, ( + ftnlen)6, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + if (nb < nbmin) { + nb = 1; + } + +/* Get machine parameters */ + + smlnum = dlamch_("S") / dlamch_("P"); + bignum = 1. / smlnum; + dlabad_(&smlnum, &bignum); + +/* Scale A, B if f2cmax element outside range [SMLNUM,BIGNUM] */ + + anrm = zlange_("M", m, n, &a[a_offset], lda, rwork); + iascl = 0; + if (anrm > 0. && anrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + zlascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, + info); + iascl = 1; + } else if (anrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + zlascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, + info); + iascl = 2; + } else if (anrm == 0.) { + +/* Matrix all zero. Return zero solution. */ + + i__1 = f2cmax(*m,*n); + zlaset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); + d__1 = (doublereal) lwopt; + work[1].r = d__1, work[1].i = 0.; + return; + } + + brow = *m; + if (tpsd) { + brow = *n; + } + bnrm = zlange_("M", &brow, nrhs, &b[b_offset], ldb, rwork); + ibscl = 0; + if (bnrm > 0. && bnrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + zlascl_("G", &c__0, &c__0, &bnrm, &smlnum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 1; + } else if (bnrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + zlascl_("G", &c__0, &c__0, &bnrm, &bignum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 2; + } + + if (*m >= *n) { + +/* M > N: */ +/* Compute the blocked QR factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least N, optimally N*NB. */ + + zgeqrt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! 
tpsd) { + +/* M > N, A is not transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A * X - B ||. */ + +/* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + zgemqrt_("Left", "Conjugate transpose", m, nrhs, n, &nb, &a[ + a_offset], lda, &work[1], &nb, &b[b_offset], ldb, &work[ + mn * nb + 1], info); + +/* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) */ + + ztrtrs_("Upper", "No transpose", "Non-unit", n, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return; + } + + scllen = *n; + + } else { + +/* M > N, A is transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A**T * X = B. */ + +/* Compute B := inv(R**T) * B in two row blocks of B. */ + +/* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) */ + + ztrtrs_("Upper", "Conjugate transpose", "Non-unit", n, nrhs, &a[ + a_offset], lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return; + } + +/* Block 2: Zero out all rows below the N-th row in B: */ +/* B(N+1:M,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = *n + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * b_dim1; + b[i__3].r = 0., b[i__3].i = 0.; + } + } + +/* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + zgemqrt_("Left", "No transpose", m, nrhs, n, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *m; + + } + + } else { + +/* M < N: */ +/* Compute the blocked LQ factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least M, optimally M*NB. */ + + zgelqt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! 
tpsd) { + +/* M < N, A is not transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A * X = B. */ + +/* Compute B := inv(L) * B in two row blocks of B. */ + +/* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) */ + + ztrtrs_("Lower", "No transpose", "Non-unit", m, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return; + } + +/* Block 2: Zero out all rows below the M-th row in B: */ +/* B(M+1:N,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *n; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * b_dim1; + b[i__3].r = 0., b[i__3].i = 0.; + } + } + +/* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + zgemlqt_("Left", "Conjugate transpose", n, nrhs, m, &nb, &a[ + a_offset], lda, &work[1], &nb, &b[b_offset], ldb, &work[ + mn * nb + 1], info); + + scllen = *n; + + } else { + +/* M < N, A is transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A**T * X - B ||. */ + +/* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. 
*/ + + zgemlqt_("Left", "No transpose", n, nrhs, m, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) */ + + ztrtrs_("Lower", "Conjugate transpose", "Non-unit", m, nrhs, &a[ + a_offset], lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return; + } + + scllen = *m; + + } + + } + +/* Undo scaling */ + + if (iascl == 1) { + zlascl_("G", &c__0, &c__0, &anrm, &smlnum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (iascl == 2) { + zlascl_("G", &c__0, &c__0, &anrm, &bignum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + if (ibscl == 1) { + zlascl_("G", &c__0, &c__0, &smlnum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (ibscl == 2) { + zlascl_("G", &c__0, &c__0, &bignum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + + d__1 = (doublereal) lwopt; + work[1].r = d__1, work[1].i = 0.; + + return; + +/* End of ZGELST */ + +} /* zgelst_ */ + diff --git a/lapack-netlib/SRC/zgelst.f b/lapack-netlib/SRC/zgelst.f new file mode 100644 index 0000000000..4dabdc91e6 --- /dev/null +++ b/lapack-netlib/SRC/zgelst.f @@ -0,0 +1,533 @@ +*> \brief ZGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factorization with compact WY representation of Q. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZGELST + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, +* INFO ) +* +* .. Scalar Arguments .. +* CHARACTER TRANS +* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), B( LDB, * ), WORK( * ) +* .. 
+* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGELST solves overdetermined or underdetermined real linear systems +*> involving an M-by-N matrix A, or its conjugate-transpose, using a QR +*> or LQ factorization of A with compact WY representation of Q. +*> It is assumed that A has full rank. +*> +*> The following options are provided: +*> +*> 1. If TRANS = 'N' and m >= n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A*X ||. +*> +*> 2. If TRANS = 'N' and m < n: find the minimum norm solution of +*> an underdetermined system A * X = B. +*> +*> 3. If TRANS = 'C' and m >= n: find the minimum norm solution of +*> an underdetermined system A**T * X = B. +*> +*> 4. If TRANS = 'C' and m < n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A**T * X ||. +*> +*> Several right hand side vectors b and solution vectors x can be +*> handled in a single call; they are stored as the columns of the +*> M-by-NRHS right hand side matrix B and the N-by-NRHS solution +*> matrix X. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': the linear system involves A; +*> = 'C': the linear system involves A**H. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrices B and X. NRHS >=0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. 
+*> On exit, +*> if M >= N, A is overwritten by details of its QR +*> factorization as returned by ZGEQRT; +*> if M < N, A is overwritten by details of its LQ +*> factorization as returned by ZGELQT. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (LDB,NRHS) +*> On entry, the matrix B of right hand side vectors, stored +*> columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS +*> if TRANS = 'C'. +*> On exit, if INFO = 0, B is overwritten by the solution +*> vectors, stored columnwise: +*> if TRANS = 'N' and m >= n, rows 1 to n of B contain the least +*> squares solution vectors; the residual sum of squares for the +*> solution in each column is given by the sum of squares of +*> modulus of elements N+1 to M in that column; +*> if TRANS = 'N' and m < n, rows 1 to N of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'C' and m >= n, rows 1 to M of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'C' and m < n, rows 1 to M of B contain the +*> least squares solution vectors; the residual sum of squares +*> for the solution in each column is given by the sum of +*> squares of the modulus of elements M+1 to N in that column. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= MAX(1,M,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= max( 1, MN + max( MN, NRHS ) ). +*> For optimal performance, +*> LWORK >= max( 1, (MN + max( MN, NRHS ))*NB ). +*> where MN = min(M,N) and NB is the optimum block size. 
+*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the i-th diagonal element of the +*> triangular factor of A is zero, so that A does not have +*> full rank; the least squares solution could not be +*> computed. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16GEsolve +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2022, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, + $ INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + COMPLEX*16 CZERO + PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, TPSD + INTEGER BROW, I, IASCL, IBSCL, J, LWOPT, MN, MNNRHS, + $ NB, NBMIN, SCLLEN + DOUBLE PRECISION ANRM, BIGNUM, BNRM, SMLNUM +* .. +* .. Local Arrays .. + DOUBLE PRECISION RWORK( 1 ) +* .. +* .. 
External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, ZLANGE + EXTERNAL LSAME, ILAENV, DLAMCH, ZLANGE +* .. +* .. External Subroutines .. + EXTERNAL ZGELQT, ZGEQRT, ZGEMLQT, ZGEMQRT, DLABAD, + $ ZLASCL, ZLASET, ZTRTRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments. +* + INFO = 0 + MN = MIN( M, N ) + LQUERY = ( LWORK.EQ.-1 ) + IF( .NOT.( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'C' ) ) ) THEN + INFO = -1 + ELSE IF( M.LT.0 ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDB.LT.MAX( 1, M, N ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.MAX( 1, MN+MAX( MN, NRHS ) ) .AND. .NOT.LQUERY ) + $ THEN + INFO = -10 + END IF +* +* Figure out optimal block size and optimal workspace size +* + IF( INFO.EQ.0 .OR. INFO.EQ.-10 ) THEN +* + TPSD = .TRUE. + IF( LSAME( TRANS, 'N' ) ) + $ TPSD = .FALSE. 
+* + NB = ILAENV( 1, 'ZGELST', ' ', M, N, -1, -1 ) +* + MNNRHS = MAX( MN, NRHS ) + LWOPT = MAX( 1, (MN+MNNRHS)*NB ) + WORK( 1 ) = DBLE( LWOPT ) +* + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGELST ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + CALL ZLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB ) + WORK( 1 ) = DBLE( LWOPT ) + RETURN + END IF +* +* *GEQRT and *GELQT routines cannot accept NB larger than min(M,N) +* + IF( NB.GT.MN ) NB = MN +* +* Determine the block size from the supplied LWORK +* ( at this stage we know that LWORK >= (minimum required workspace, +* but it may be less than optimal) +* + NB = MIN( NB, LWORK/( MN + MNNRHS ) ) +* +* The minimum value of NB, when blocked code is used +* + NBMIN = MAX( 2, ILAENV( 2, 'ZGELST', ' ', M, N, -1, -1 ) ) +* + IF( NB.LT.NBMIN ) THEN + NB = 1 + END IF +* +* Get machine parameters +* + SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) + BIGNUM = ONE / SMLNUM + CALL DLABAD( SMLNUM, BIGNUM ) +* +* Scale A, B if max element outside range [SMLNUM,BIGNUM] +* + ANRM = ZLANGE( 'M', M, N, A, LDA, RWORK ) + IASCL = 0 + IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL ZLASCL( 'G', 0, 0, ANRM, SMLNUM, M, N, A, LDA, INFO ) + IASCL = 1 + ELSE IF( ANRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL ZLASCL( 'G', 0, 0, ANRM, BIGNUM, M, N, A, LDA, INFO ) + IASCL = 2 + ELSE IF( ANRM.EQ.ZERO ) THEN +* +* Matrix all zero. Return zero solution. +* + CALL ZLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB ) + WORK( 1 ) = DBLE( LWOPT ) + RETURN + END IF +* + BROW = M + IF( TPSD ) + $ BROW = N + BNRM = ZLANGE( 'M', BROW, NRHS, B, LDB, RWORK ) + IBSCL = 0 + IF( BNRM.GT.ZERO .AND. 
BNRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL ZLASCL( 'G', 0, 0, BNRM, SMLNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 1 + ELSE IF( BNRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL ZLASCL( 'G', 0, 0, BNRM, BIGNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 2 + END IF +* + IF( M.GE.N ) THEN +* +* M > N: +* Compute the blocked QR factorization of A, +* using the compact WY representation of Q, +* workspace at least N, optimally N*NB. +* + CALL ZGEQRT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M > N, A is not transposed: +* Overdetermined system of equations, +* least-squares problem, min || A * X - B ||. +* +* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL ZGEMQRT( 'Left', 'Conjugate transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* +* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) +* + CALL ZTRTRS( 'Upper', 'No transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = N +* + ELSE +* +* M > N, A is transposed: +* Underdetermined system of equations, +* minimum norm solution of A**T * X = B. +* +* Compute B := inv(R**T) * B in two row blocks of B. +* +* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) +* + CALL ZTRTRS( 'Upper', 'Conjugate transpose', 'Non-unit', + $ N, NRHS, A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the N-th row in B: +* B(N+1:M,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = N + 1, M + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. 
+* + CALL ZGEMQRT( 'Left', 'No transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = M +* + END IF +* + ELSE +* +* M < N: +* Compute the blocked LQ factorization of A, +* using the compact WY representation of Q, +* workspace at least M, optimally M*NB. +* + CALL ZGELQT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M < N, A is not transposed: +* Underdetermined system of equations, +* minimum norm solution of A * X = B. +* +* Compute B := inv(L) * B in two row blocks of B. +* +* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) +* + CALL ZTRTRS( 'Lower', 'No transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the M-th row in B: +* B(M+1:N,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = M + 1, N + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL ZGEMLQT( 'Left', 'Conjugate transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = N +* + ELSE +* +* M < N, A is transposed: +* Overdetermined system of equations, +* least-squares problem, min || A**T * X - B ||. +* +* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. 
+* + CALL ZGEMLQT( 'Left', 'No transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1), INFO ) +* +* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) +* + CALL ZTRTRS( 'Lower', 'Conjugate transpose', 'Non-unit', + $ M, NRHS, A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = M +* + END IF +* + END IF +* +* Undo scaling +* + IF( IASCL.EQ.1 ) THEN + CALL ZLASCL( 'G', 0, 0, ANRM, SMLNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IASCL.EQ.2 ) THEN + CALL ZLASCL( 'G', 0, 0, ANRM, BIGNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF + IF( IBSCL.EQ.1 ) THEN + CALL ZLASCL( 'G', 0, 0, SMLNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IBSCL.EQ.2 ) THEN + CALL ZLASCL( 'G', 0, 0, BIGNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF +* + WORK( 1 ) = DBLE( LWOPT ) +* + RETURN +* +* End of ZGELST +* + END diff --git a/lapack-netlib/SRC/zgelsy.c b/lapack-netlib/SRC/zgelsy.c index ef3242ab11..673ea76c89 100644 --- a/lapack-netlib/SRC/zgelsy.c +++ b/lapack-netlib/SRC/zgelsy.c @@ -727,7 +727,7 @@ f"> */ /* > G. Quintana-Orti, Depto. 
de Informatica, Universidad Jaime I, Spain \n */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgelsy_(integer *m, integer *n, integer *nrhs, +/* Subroutine */ void zgelsy_(integer *m, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, integer *jpvt, doublereal *rcond, integer *rank, doublecomplex *work, integer *lwork, doublereal *rwork, integer *info) @@ -743,7 +743,7 @@ f"> */ doublecomplex c1, c2; doublereal wsize; doublecomplex s1, s2; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), ztrsm_(char *, char *, char *, char * , integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), @@ -762,16 +762,16 @@ f"> */ extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); doublereal bignum; - extern /* Subroutine */ int zlascl_(char *, integer *, integer *, + extern /* Subroutine */ void zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *); integer nb1, nb2, nb3, nb4; - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); doublereal sminpr, smaxpr, smlnum; integer lwkopt; logical lquery; - extern /* Subroutine */ int zunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), zunmrz_(char *, char *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, @@ -853,9 +853,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); 
xerbla_("ZGELSY", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -864,7 +864,7 @@ f"> */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*nrhs) == 0) { *rank = 0; - return 0; + return; } /* Get machine parameters */ @@ -1081,7 +1081,7 @@ f"> */ z__1.r = (doublereal) lwkopt, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; /* End of ZGELSY */ diff --git a/lapack-netlib/SRC/zgemlq.c b/lapack-netlib/SRC/zgemlq.c index 6ecb8e0a06..dc1616d7ab 100644 --- a/lapack-netlib/SRC/zgemlq.c +++ b/lapack-netlib/SRC/zgemlq.c @@ -673,7 +673,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgemlq_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void zgemlq_(char *side, char *trans, integer *m, integer *n, integer *k, doublecomplex *a, integer *lda, doublecomplex *t, integer *tsize, doublecomplex *c__, integer *ldc, doublecomplex *work, integer *lwork, integer *info) @@ -683,7 +683,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ logical left, tran; - extern /* Subroutine */ int zlamswlq_(char *, char *, integer *, integer * + extern /* Subroutine */ void zlamswlq_(char *, char *, integer *, integer * , integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -692,7 +692,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ integer mb, nb, mn, lw, nblcks; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical notran, lquery; - extern /* Subroutine */ int zgemlqt_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemlqt_(char *, char *, integer *, integer *, integer *, integer *, 
doublecomplex *, integer *, doublecomplex * , integer *, doublecomplex *, integer *, doublecomplex *, integer *); @@ -774,9 +774,9 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEMLQ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -784,7 +784,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Computing MIN */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*k) == 0) { - return 0; + return; } /* Computing MAX */ @@ -800,7 +800,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ work[1].r = (doublereal) lw, work[1].i = 0.; - return 0; + return; /* End of ZGEMLQ */ diff --git a/lapack-netlib/SRC/zgemlqt.c b/lapack-netlib/SRC/zgemlqt.c index ed9d9db352..2487a49b03 100644 --- a/lapack-netlib/SRC/zgemlqt.c +++ b/lapack-netlib/SRC/zgemlqt.c @@ -676,7 +676,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup doubleGEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgemlqt_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void zgemlqt_(char *side, char *trans, integer *m, integer *n, integer *k, integer *mb, doublecomplex *v, integer *ldv, doublecomplex *t, integer *ldt, doublecomplex *c__, integer *ldc, doublecomplex *work, integer *info) @@ -691,7 +691,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern logical lsame_(char *, char *); logical right; integer ib, kf; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlarfb_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlarfb_( char *, char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, 
integer *); @@ -756,12 +757,12 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEMLQT", &i__1, (ftnlen)7); - return 0; + return; } if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && notran) { @@ -822,7 +823,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of ZGEMLQT */ diff --git a/lapack-netlib/SRC/zgemqr.c b/lapack-netlib/SRC/zgemqr.c index 24744b593f..600c94145a 100644 --- a/lapack-netlib/SRC/zgemqr.c +++ b/lapack-netlib/SRC/zgemqr.c @@ -675,7 +675,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgemqr_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void zgemqr_(char *side, char *trans, integer *m, integer *n, integer *k, doublecomplex *a, integer *lda, doublecomplex *t, integer *tsize, doublecomplex *c__, integer *ldc, doublecomplex *work, integer *lwork, integer *info) @@ -685,7 +685,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ logical left, tran; - extern /* Subroutine */ int zlamtsqr_(char *, char *, integer *, integer * + extern /* Subroutine */ void zlamtsqr_(char *, char *, integer *, integer * , integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -694,7 +694,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ integer mb, nb, mn, lw, nblcks; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical notran, lquery; - extern /* Subroutine */ int zgemqrt_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemqrt_(char *, char *, integer *, integer *, integer *, integer *, 
doublecomplex *, integer *, doublecomplex * , integer *, doublecomplex *, integer *, doublecomplex *, integer *); @@ -776,9 +776,9 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEMQR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -786,7 +786,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Computing MIN */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*k) == 0) { - return 0; + return; } /* Computing MAX */ @@ -802,7 +802,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ work[1].r = (doublereal) lw, work[1].i = 0.; - return 0; + return; /* End of ZGEMQR */ diff --git a/lapack-netlib/SRC/zgemqrt.c b/lapack-netlib/SRC/zgemqrt.c index 7b1f9fd70c..0ac74d4c5a 100644 --- a/lapack-netlib/SRC/zgemqrt.c +++ b/lapack-netlib/SRC/zgemqrt.c @@ -676,7 +676,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16GEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgemqrt_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void zgemqrt_(char *side, char *trans, integer *m, integer *n, integer *k, integer *nb, doublecomplex *v, integer *ldv, doublecomplex *t, integer *ldt, doublecomplex *c__, integer *ldc, doublecomplex *work, integer *info) @@ -691,7 +691,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern logical lsame_(char *, char *); logical right; integer ib, kf; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlarfb_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlarfb_( char *, char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex 
*, integer *); @@ -758,12 +759,12 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEMQRT", &i__1, (ftnlen)7); - return 0; + return; } if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && tran) { @@ -824,7 +825,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of ZGEMQRT */ diff --git a/lapack-netlib/SRC/zgeql2.c b/lapack-netlib/SRC/zgeql2.c index 5040dc9a56..b62412cfe2 100644 --- a/lapack-netlib/SRC/zgeql2.c +++ b/lapack-netlib/SRC/zgeql2.c @@ -637,7 +637,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgeql2_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgeql2_(integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *work, integer *info) { /* System generated locals */ @@ -647,9 +647,11 @@ f"> */ /* Local variables */ integer i__, k; doublecomplex alpha; - extern /* Subroutine */ int zlarf_(char *, integer *, integer *, + extern /* Subroutine */ void zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *), xerbla_(char *, integer *, ftnlen), zlarfg_(integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); @@ -683,7 +685,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEQL2", &i__1, (ftnlen)6); - return 0; + return; } k = f2cmin(*m,*n); @@ -712,7 +714,7 @@ f"> */ a[i__1].r = alpha.r, a[i__1].i = alpha.i; /* L10: */ } - return 0; + return; /* End of ZGEQL2 */ diff --git a/lapack-netlib/SRC/zgeqlf.c b/lapack-netlib/SRC/zgeqlf.c index 9ad9262d58..b562fc9a26 100644 --- a/lapack-netlib/SRC/zgeqlf.c +++ 
b/lapack-netlib/SRC/zgeqlf.c @@ -654,7 +654,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgeqlf_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgeqlf_(integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *work, integer *lwork, integer *info) { @@ -663,18 +663,18 @@ f"> */ /* Local variables */ integer i__, k, nbmin, iinfo; - extern /* Subroutine */ int zgeql2_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeql2_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer ib, nb, ki, kk, mu, nu, nx; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void zlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer ldwork; - extern /* Subroutine */ int zlarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zlarft_(char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer lwkopt; @@ -730,15 +730,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEQLF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (k == 0) { - return 0; + return; } nbmin = 2; @@ -830,7 +830,7 @@ f"> */ } work[1].r = (doublereal) iws, work[1].i = 0.; - return 0; + return; /* End of ZGEQLF */ diff --git a/lapack-netlib/SRC/zgeqp3.c b/lapack-netlib/SRC/zgeqp3.c index 36a1460bed..f2b957e7aa 100644 --- a/lapack-netlib/SRC/zgeqp3.c +++ b/lapack-netlib/SRC/zgeqp3.c @@ -674,7 +674,7 @@ 
f"> */ /* > X. Sun, Computer Science Dept., Duke University, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgeqp3_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgeqp3_(integer *m, integer *n, doublecomplex *a, integer *lda, integer *jpvt, doublecomplex *tau, doublecomplex *work, integer *lwork, doublereal *rwork, integer *info) { @@ -684,7 +684,7 @@ f"> */ /* Local variables */ integer nfxd, j, nbmin, minmn, minws; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlaqp2_(integer *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, doublereal *, doublereal *, doublecomplex *); @@ -694,17 +694,17 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zgeqrf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeqrf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ); integer topbmn, sminmn; - extern /* Subroutine */ int zlaqps_(integer *, integer *, integer *, + extern /* Subroutine */ void zlaqps_(integer *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, doublereal *, doublereal *, doublecomplex *, doublecomplex *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int zunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); integer fjb, iws; @@ -764,9 +764,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEQP3", &i__1, 
(ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Move initial columns up front. */ @@ -922,7 +922,7 @@ f"> */ z__1.r = (doublereal) lwkopt, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; /* End of ZGEQP3 */ diff --git a/lapack-netlib/SRC/zgeqr.c b/lapack-netlib/SRC/zgeqr.c index 11577befbe..93af2ac2bb 100644 --- a/lapack-netlib/SRC/zgeqr.c +++ b/lapack-netlib/SRC/zgeqr.c @@ -683,7 +683,7 @@ static integer c__2 = 2; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgeqr_(integer *m, integer *n, doublecomplex *a, integer +/* Subroutine */ void zgeqr_(integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *t, integer *tsize, doublecomplex *work, integer * lwork, integer *info) { @@ -697,12 +697,12 @@ static integer c__2 = 2; extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); logical lminws; - extern /* Subroutine */ int zgeqrt_(integer *, integer *, integer *, + extern /* Subroutine */ void zgeqrt_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); logical lquery; integer mintsz; - extern /* Subroutine */ int zlatsqr_(integer *, integer *, integer *, + extern /* Subroutine */ void zlatsqr_(integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -831,15 +831,15 @@ static integer c__2 = 2; if (*info != 0) { i__1 = -(*info); xerbla_("ZGEQR", &i__1, (ftnlen)5); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* The QR Decomposition */ @@ -856,7 +856,7 @@ static integer c__2 = 2; i__1 = f2cmax(i__2,i__3); work[1].r = (doublereal) i__1, work[1].i = 0.; - return 0; + return; /* End of ZGEQR */ diff --git 
a/lapack-netlib/SRC/zgeqr2.c b/lapack-netlib/SRC/zgeqr2.c index ae4783eaee..cf368ad46c 100644 --- a/lapack-netlib/SRC/zgeqr2.c +++ b/lapack-netlib/SRC/zgeqr2.c @@ -644,7 +644,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgeqr2_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgeqr2_(integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *work, integer *info) { /* System generated locals */ @@ -654,9 +654,11 @@ f"> */ /* Local variables */ integer i__, k; doublecomplex alpha; - extern /* Subroutine */ int zlarf_(char *, integer *, integer *, + extern /* Subroutine */ void zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *), xerbla_(char *, integer *, ftnlen), zlarfg_(integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); @@ -690,7 +692,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEQR2", &i__1, (ftnlen)6); - return 0; + return; } k = f2cmin(*m,*n); @@ -723,7 +725,7 @@ f"> */ } /* L10: */ } - return 0; + return; /* End of ZGEQR2 */ diff --git a/lapack-netlib/SRC/zgeqr2p.c b/lapack-netlib/SRC/zgeqr2p.c index 38901a4f97..4e409ec782 100644 --- a/lapack-netlib/SRC/zgeqr2p.c +++ b/lapack-netlib/SRC/zgeqr2p.c @@ -648,7 +648,7 @@ l elements using an unblocked algorithm. */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgeqr2p_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgeqr2p_(integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *work, integer *info) { /* System generated locals */ @@ -658,9 +658,11 @@ l elements using an unblocked algorithm. 
*/ /* Local variables */ integer i__, k; doublecomplex alpha; - extern /* Subroutine */ int zlarf_(char *, integer *, integer *, + extern /* Subroutine */ void zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *), xerbla_(char *, integer *, ftnlen), zlarfgp_(integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zlarfgp_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); @@ -694,7 +696,7 @@ l elements using an unblocked algorithm. */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEQR2P", &i__1, (ftnlen)7); - return 0; + return; } k = f2cmin(*m,*n); @@ -727,7 +729,7 @@ l elements using an unblocked algorithm. */ } /* L10: */ } - return 0; + return; /* End of ZGEQR2P */ diff --git a/lapack-netlib/SRC/zgeqrf.c b/lapack-netlib/SRC/zgeqrf.c index 9bfe34428d..9f2facb994 100644 --- a/lapack-netlib/SRC/zgeqrf.c +++ b/lapack-netlib/SRC/zgeqrf.c @@ -661,7 +661,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgeqrf_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgeqrf_(integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *work, integer *lwork, integer *info) { @@ -670,18 +670,18 @@ f"> */ /* Local variables */ integer i__, k, nbmin, iinfo; - extern /* Subroutine */ int zgeqr2_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeqr2_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer ib, nb, nx; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void zlarfb_(char *, 
char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer ldwork; - extern /* Subroutine */ int zlarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zlarft_(char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer lwkopt; @@ -726,9 +726,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEQRF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -736,7 +736,7 @@ f"> */ k = f2cmin(*m,*n); if (k == 0) { work[1].r = 1., work[1].i = 0.; - return 0; + return; } nbmin = 2; @@ -821,7 +821,7 @@ f"> */ } work[1].r = (doublereal) iws, work[1].i = 0.; - return 0; + return; /* End of ZGEQRF */ diff --git a/lapack-netlib/SRC/zgeqrfp.c b/lapack-netlib/SRC/zgeqrfp.c index 0f2a955439..5432a10a3b 100644 --- a/lapack-netlib/SRC/zgeqrfp.c +++ b/lapack-netlib/SRC/zgeqrfp.c @@ -665,7 +665,7 @@ static integer c__2 = 2; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgeqrfp_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgeqrfp_(integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *work, integer *lwork, integer *info) { @@ -677,17 +677,17 @@ static integer c__2 = 2; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void zlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer ldwork; - extern /* Subroutine */ int zlarft_(char 
*, char *, integer *, integer *, + extern /* Subroutine */ void zlarft_(char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int zgeqr2p_(integer *, integer *, doublecomplex * + extern /* Subroutine */ void zgeqr2p_(integer *, integer *, doublecomplex * , integer *, doublecomplex *, doublecomplex *, integer *); integer iws; @@ -729,9 +729,9 @@ static integer c__2 = 2; if (*info != 0) { i__1 = -(*info); xerbla_("ZGEQRFP", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -739,7 +739,7 @@ static integer c__2 = 2; k = f2cmin(*m,*n); if (k == 0) { work[1].r = 1., work[1].i = 0.; - return 0; + return; } nbmin = 2; @@ -824,7 +824,7 @@ static integer c__2 = 2; } work[1].r = (doublereal) iws, work[1].i = 0.; - return 0; + return; /* End of ZGEQRFP */ diff --git a/lapack-netlib/SRC/zgeqrt.c b/lapack-netlib/SRC/zgeqrt.c index 005d01be11..c3919be55c 100644 --- a/lapack-netlib/SRC/zgeqrt.c +++ b/lapack-netlib/SRC/zgeqrt.c @@ -650,7 +650,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgeqrt_(integer *m, integer *n, integer *nb, +/* Subroutine */ void zgeqrt_(integer *m, integer *n, integer *nb, doublecomplex *a, integer *lda, doublecomplex *t, integer *ldt, doublecomplex *work, integer *info) { @@ -659,7 +659,8 @@ f"> */ /* Local variables */ integer i__, k, iinfo, ib; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlarfb_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlarfb_( char *, char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zgeqrt2_(integer *, integer *, @@ -704,14 +705,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); 
xerbla_("ZGEQRT", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ k = f2cmin(*m,*n); if (k == 0) { - return 0; + return; } /* Blocked loop of length K */ @@ -746,7 +747,7 @@ f"> */ ib) * a_dim1], lda, &work[1], &i__5); } } - return 0; + return; /* End of ZGEQRT */ diff --git a/lapack-netlib/SRC/zgeqrt2.c b/lapack-netlib/SRC/zgeqrt2.c index b394114786..847db1b822 100644 --- a/lapack-netlib/SRC/zgeqrt2.c +++ b/lapack-netlib/SRC/zgeqrt2.c @@ -643,7 +643,7 @@ presentation of Q. */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgeqrt2_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgeqrt2_(integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *t, integer *ldt, integer *info) { /* System generated locals */ @@ -653,14 +653,15 @@ presentation of Q. */ /* Local variables */ integer i__, k; doublecomplex alpha; - extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgerc_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), ztrmv_(char *, char *, char *, integer *, doublecomplex *, - integer *, doublecomplex *, integer *), - xerbla_(char *, integer *, ftnlen), zlarfg_(integer *, + integer *, doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); doublecomplex aii; @@ -698,7 +699,7 @@ presentation of Q. */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEQRT2", &i__1, (ftnlen)7); - return 0; + return; } k = f2cmin(*m,*n); @@ -780,6 +781,6 @@ presentation of Q. 
*/ /* End of ZGEQRT2 */ - return 0; + return; } /* zgeqrt2_ */ diff --git a/lapack-netlib/SRC/zgeqrt3.c b/lapack-netlib/SRC/zgeqrt3.c index d9e957d6cf..020558deb9 100644 --- a/lapack-netlib/SRC/zgeqrt3.c +++ b/lapack-netlib/SRC/zgeqrt3.c @@ -647,7 +647,7 @@ ompact WY representation of Q. */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgeqrt3_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgeqrt3_(integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *t, integer *ldt, integer *info) { /* System generated locals */ @@ -656,15 +656,16 @@ ompact WY representation of Q. */ /* Local variables */ integer i__, j, iinfo; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer i1, j1, n1, n2; - extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrmm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *), - xerbla_(char *, integer *, ftnlen), zlarfg_(integer *, + doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); @@ -699,7 +700,7 @@ ompact WY representation of Q. */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGEQRT3", &i__1, (ftnlen)7); - return 0; + return; } if (*n == 1) { @@ -809,7 +810,7 @@ ompact WY representation of Q. 
*/ } - return 0; + return; /* End of ZGEQRT3 */ diff --git a/lapack-netlib/SRC/zgerfs.c b/lapack-netlib/SRC/zgerfs.c index b2d308e583..a53dd4f8f7 100644 --- a/lapack-netlib/SRC/zgerfs.c +++ b/lapack-netlib/SRC/zgerfs.c @@ -699,7 +699,7 @@ f"> */ /* > \ingroup complex16GEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgerfs_(char *trans, integer *n, integer *nrhs, +/* Subroutine */ void zgerfs_(char *trans, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *af, integer *ldaf, integer *ipiv, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *ferr, doublereal *berr, doublecomplex *work, @@ -718,7 +718,7 @@ f"> */ doublereal s; extern logical lsame_(char *, char *); integer isave[3], count; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, @@ -792,7 +792,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGERFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -804,7 +804,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } if (notran) { @@ -1030,7 +1030,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of ZGERFS */ diff --git a/lapack-netlib/SRC/zgerq2.c b/lapack-netlib/SRC/zgerq2.c index 97805c94f8..ccafd6293a 100644 --- a/lapack-netlib/SRC/zgerq2.c +++ b/lapack-netlib/SRC/zgerq2.c @@ -633,7 +633,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgerq2_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgerq2_(integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *work, integer *info) { /* System generated 
locals */ @@ -642,9 +642,11 @@ f"> */ /* Local variables */ integer i__, k; doublecomplex alpha; - extern /* Subroutine */ int zlarf_(char *, integer *, integer *, + extern /* Subroutine */ void zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *), xerbla_(char *, integer *, ftnlen), zlarfg_(integer *, doublecomplex *, doublecomplex *, + integer *, doublecomplex *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *), zlacgv_(integer *, doublecomplex *, integer *); @@ -679,7 +681,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGERQ2", &i__1, (ftnlen)6); - return 0; + return; } k = f2cmin(*m,*n); @@ -710,7 +712,7 @@ f"> */ zlacgv_(&i__1, &a[*m - k + i__ + a_dim1], lda); /* L10: */ } - return 0; + return; /* End of ZGERQ2 */ diff --git a/lapack-netlib/SRC/zgerqf.c b/lapack-netlib/SRC/zgerqf.c index 8caf1ceadf..840416f507 100644 --- a/lapack-netlib/SRC/zgerqf.c +++ b/lapack-netlib/SRC/zgerqf.c @@ -654,7 +654,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgerqf_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgerqf_(integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *work, integer *lwork, integer *info) { @@ -663,18 +663,18 @@ f"> */ /* Local variables */ integer i__, k, nbmin, iinfo; - extern /* Subroutine */ int zgerq2_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgerq2_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer ib, nb, ki, kk, mu, nu, nx; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zlarfb_(char *, 
char *, char *, char *, + extern /* Subroutine */ void zlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer ldwork; - extern /* Subroutine */ int zlarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zlarft_(char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer lwkopt; @@ -730,15 +730,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGERQF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (k == 0) { - return 0; + return; } nbmin = 2; @@ -829,7 +829,7 @@ f"> */ } work[1].r = (doublereal) iws, work[1].i = 0.; - return 0; + return; /* End of ZGERQF */ diff --git a/lapack-netlib/SRC/zgesc2.c b/lapack-netlib/SRC/zgesc2.c index e880737661..ae7884dd77 100644 --- a/lapack-netlib/SRC/zgesc2.c +++ b/lapack-netlib/SRC/zgesc2.c @@ -631,7 +631,7 @@ f"> */ /* > Umea University, S-901 87 Umea, Sweden. 
*/ /* ===================================================================== */ -/* Subroutine */ int zgesc2_(integer *n, doublecomplex *a, integer *lda, +/* Subroutine */ void zgesc2_(integer *n, doublecomplex *a, integer *lda, doublecomplex *rhs, integer *ipiv, integer *jpiv, doublereal *scale) { /* System generated locals */ @@ -642,7 +642,7 @@ f"> */ /* Local variables */ doublecomplex temp; integer i__, j; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), dlabad_(doublereal *, doublereal *); extern doublereal dlamch_(char *); doublereal bignum; @@ -746,7 +746,7 @@ f"> */ i__1 = *n - 1; zlaswp_(&c__1, &rhs[1], lda, &c__1, &i__1, &jpiv[1], &c_n1); - return 0; + return; /* End of ZGESC2 */ diff --git a/lapack-netlib/SRC/zgesdd.c b/lapack-netlib/SRC/zgesdd.c index b081ed4824..898d5c87ac 100644 --- a/lapack-netlib/SRC/zgesdd.c +++ b/lapack-netlib/SRC/zgesdd.c @@ -742,7 +742,7 @@ f"> */ /* > California at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgesdd_(char *jobz, integer *m, integer *n, +/* Subroutine */ void zgesdd_(char *jobz, integer *m, integer *n, doublecomplex *a, integer *lda, doublereal *s, doublecomplex *u, integer *ldu, doublecomplex *vt, integer *ldvt, doublecomplex *work, integer *lwork, doublereal *rwork, integer *iwork, integer *info) @@ -764,7 +764,7 @@ f"> */ lwork_zunmbr_qln_nn__, i__; extern logical lsame_(char *, char *); integer chunk, minmn; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); @@ -772,29 +772,29 @@ f"> */ logical wntqa; integer nwork; logical wntqn, wntqo, wntqs; - extern /* Subroutine */ int zlacp2_(char *, integer 
*, integer *, + extern /* Subroutine */ void zlacp2_(char *, integer *, integer *, doublereal *, integer *, doublecomplex *, integer *); integer mnthr1, mnthr2, ie; - extern /* Subroutine */ int dbdsdc_(char *, char *, integer *, doublereal + extern /* Subroutine */ void dbdsdc_(char *, char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer il; extern doublereal dlamch_(char *); integer ir, iu; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); integer lwork_zungbr_p_mn__, lwork_zungbr_p_nn__, lwork_zungbr_q_mn__, lwork_zungbr_q_mm__; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; - extern /* Subroutine */ int zgebrd_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgebrd_(integer *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, integer *); extern logical disnan_(doublereal *); extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); - extern /* Subroutine */ int zgelqf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgelqf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ), zlacrm_(integer *, integer *, doublecomplex *, integer *, doublereal *, integer *, doublecomplex *, integer *, doublereal *) @@ -806,18 +806,18 @@ f"> */ integer *, doublecomplex *, doublecomplex *, integer *, integer * ); integer ldwrkl; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlaset_(char *, integer *, 
integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); integer ldwrkr, minwrk, ldwrku, maxwrk; - extern /* Subroutine */ int zungbr_(char *, integer *, integer *, integer + extern /* Subroutine */ void zungbr_(char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); integer ldwkvt; doublereal smlnum; logical wntqas; - extern /* Subroutine */ int zunmbr_(char *, char *, char *, integer *, + extern /* Subroutine */ void zunmbr_(char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer * ), zunglq_(integer *, integer *, integer * @@ -825,7 +825,7 @@ f"> */ integer *, integer *); logical lquery; integer nrwork; - extern /* Subroutine */ int zungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void zungqr_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); integer blk; @@ -1285,15 +1285,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGESDD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1307,7 +1307,7 @@ f"> */ anrm = zlange_("M", m, n, &a[a_offset], lda, dum); if (disnan_(&anrm)) { *info = -4; - return 0; + return; } iscl = 0; if (anrm > 0. 
&& anrm < smlnum) { @@ -2969,7 +2969,7 @@ f"> */ work[1].r = (doublereal) maxwrk, work[1].i = 0.; - return 0; + return; /* End of ZGESDD */ diff --git a/lapack-netlib/SRC/zgesv.c b/lapack-netlib/SRC/zgesv.c index ead824bc1c..53813cd015 100644 --- a/lapack-netlib/SRC/zgesv.c +++ b/lapack-netlib/SRC/zgesv.c @@ -640,7 +640,8 @@ iver) */ integer a_dim1, a_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zgetrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zgetrf_( integer *, integer *, doublecomplex *, integer *, integer *, integer *), zgetrs_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); diff --git a/lapack-netlib/SRC/zgesvd.c b/lapack-netlib/SRC/zgesvd.c index 45769457f1..0d16703571 100644 --- a/lapack-netlib/SRC/zgesvd.c +++ b/lapack-netlib/SRC/zgesvd.c @@ -732,7 +732,7 @@ f"> */ /* > \ingroup complex16GEsing */ /* ===================================================================== */ -/* Subroutine */ int zgesvd_(char *jobu, char *jobvt, integer *m, integer *n, +/* Subroutine */ void zgesvd_(char *jobu, char *jobvt, integer *m, integer *n, doublecomplex *a, integer *lda, doublereal *s, doublecomplex *u, integer *ldu, doublecomplex *vt, integer *ldvt, doublecomplex *work, integer *lwork, doublereal *rwork, integer *info) @@ -751,7 +751,7 @@ f"> */ lwork_zgeqrf__; extern logical lsame_(char *, char *); integer chunk, minmn; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); @@ -760,10 +760,11 @@ f"> */ integer ie; extern doublereal dlamch_(char *); integer ir, iu; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void 
dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, - integer *, integer *), xerbla_(char *, integer *, ftnlen), - zgebrd_(integer *, integer *, doublecomplex *, integer *, + integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zgebrd_(integer *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, integer *); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, @@ -771,7 +772,7 @@ f"> */ extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); doublereal bignum; - extern /* Subroutine */ int zgelqf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgelqf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *), zgeqrf_(integer *, integer *, doublecomplex *, integer *, @@ -781,24 +782,24 @@ f"> */ integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); integer ldwrkr; - extern /* Subroutine */ int zbdsqr_(char *, integer *, integer *, integer + extern /* Subroutine */ void zbdsqr_(char *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, integer *); integer minwrk, ldwrku, maxwrk; - extern /* Subroutine */ int zungbr_(char *, integer *, integer *, integer + extern /* Subroutine */ void zungbr_(char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); doublereal smlnum; integer irwork; - extern /* Subroutine */ int zunmbr_(char *, char *, char *, integer *, + extern /* Subroutine */ void zunmbr_(char *, char *, char *, integer *, integer *, integer *, 
doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer * ), zunglq_(integer *, integer *, integer * , doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); logical lquery, wntuas, wntvas; - extern /* Subroutine */ int zungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void zungqr_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); integer blk, lwork_zungbr_p__, lwork_zungbr_q__, ncu; @@ -1338,15 +1339,15 @@ f"> */ if (*info != 0) { i__2 = -(*info); xerbla_("ZGESVD", &i__2, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Get machine constants */ @@ -4722,7 +4723,7 @@ f"> */ work[1].r = (doublereal) maxwrk, work[1].i = 0.; - return 0; + return; /* End of ZGESVD */ diff --git a/lapack-netlib/SRC/zgesvdq.c b/lapack-netlib/SRC/zgesvdq.c index 21855194a7..ea91ef2697 100644 --- a/lapack-netlib/SRC/zgesvdq.c +++ b/lapack-netlib/SRC/zgesvdq.c @@ -932,7 +932,7 @@ static logical c_false = FALSE_; /* > \ingroup complex16GEsing */ /* ===================================================================== */ -/* Subroutine */ int zgesvdq_(char *joba, char *jobp, char *jobr, char *jobu, +/* Subroutine */ void zgesvdq_(char *joba, char *jobp, char *jobr, char *jobu, char *jobv, integer *m, integer *n, doublecomplex *a, integer *lda, doublereal *s, doublecomplex *u, integer *ldu, doublecomplex *v, integer *ldv, integer *numrank, integer *iwork, integer *liwork, @@ -967,30 +967,31 @@ static logical c_false = FALSE_; logical dntwu, dntwv, wntuf, wntva; integer lwunq; logical wntur, wntus, wntvr; - extern /* Subroutine */ int zgeqp3_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeqp3_(integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, doublecomplex *, 
integer * , doublereal *, integer *); extern doublereal dznrm2_(integer *, doublecomplex *, integer *); integer lwsvd2, lwunq2; extern doublereal dlamch_(char *); integer nr; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); extern integer idamax_(integer *, doublereal *, integer *); doublereal sconda; - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, - doublereal *, doublereal *, doublereal *, integer *), - xerbla_(char *, integer *, ftnlen), zdscal_(integer *, doublereal + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, + doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zdscal_(integer *, doublereal *, doublecomplex *, integer *); extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); - extern /* Subroutine */ int zgelqf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgelqf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *); doublecomplex cdummy[1]; - extern /* Subroutine */ int zgeqrf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeqrf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ), zgesvd_(char *, char *, integer *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *, @@ -1001,7 +1002,7 @@ static logical c_false = FALSE_; doublecomplex *, doublecomplex *, doublecomplex *, integer *); integer minwrk; logical rtrans; - extern /* Subroutine */ int zlapmt_(logical *, integer *, integer *, + extern /* Subroutine */ void 
zlapmt_(logical *, integer *, integer *, doublecomplex *, integer *, integer *), zpocon_(char *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, doublereal *, integer *); @@ -1012,7 +1013,7 @@ static logical c_false = FALSE_; integer *, integer *, integer *, integer *); integer optwrk; logical rowprm; - extern /* Subroutine */ int zunmlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunmlq_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, @@ -1386,7 +1387,7 @@ static logical c_false = FALSE_; if (*info != 0) { i__1 = -(*info); xerbla_("ZGESVDQ", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { /* Return optimal workspace */ @@ -1395,13 +1396,13 @@ static logical c_false = FALSE_; cwork[1].r = (doublereal) optwrk, cwork[1].i = 0.; cwork[2].r = (doublereal) minwrk, cwork[2].i = 0.; rwork[1] = (doublereal) rminwrk; - return 0; + return; } /* Quick return if the matrix is void. */ if (*m == 0 || *n == 0) { - return 0; + return; } big = dlamch_("O"); @@ -1418,7 +1419,7 @@ static logical c_false = FALSE_; *info = -8; i__2 = -(*info); xerbla_("ZGESVDQ", &i__2, (ftnlen)7); - return 0; + return; } /* L1904: */ } @@ -1472,7 +1473,7 @@ static logical c_false = FALSE_; rwork[1] = -1.; } rwork[2] = -1.; - return 0; + return; } if (rwork[1] > big / sqrt((doublereal) (*m))) { @@ -1496,7 +1497,7 @@ static logical c_false = FALSE_; *info = -8; i__1 = -(*info); xerbla_("ZGESVDQ", &i__1, (ftnlen)7); - return 0; + return; } if (rtmp > big / sqrt((doublereal) (*m))) { /* matrix by 1/sqrt(M) if too large entry detected */ @@ -2323,7 +2324,7 @@ static logical c_false = FALSE_; /* full row rank triangular (trapezoidal) factor of A. 
*/ *numrank = nr; - return 0; + return; /* End of ZGESVDQ */ diff --git a/lapack-netlib/SRC/zgesvdx.c b/lapack-netlib/SRC/zgesvdx.c index c53117a69d..dfd525a95f 100644 --- a/lapack-netlib/SRC/zgesvdx.c +++ b/lapack-netlib/SRC/zgesvdx.c @@ -786,7 +786,7 @@ static integer c_n1 = -1; /* > \ingroup complex16GEsing */ /* ===================================================================== */ -/* Subroutine */ int zgesvdx_(char *jobu, char *jobvt, char *range, integer * +/* Subroutine */ void zgesvdx_(char *jobu, char *jobvt, char *range, integer * m, integer *n, doublecomplex *a, integer *lda, doublereal *vl, doublereal *vu, integer *il, integer *iu, integer *ns, doublereal *s, doublecomplex *u, integer *ldu, doublecomplex *vt, integer *ldvt, @@ -815,43 +815,44 @@ static integer c_n1 = -1; logical wantu; integer id, ie; extern doublereal dlamch_(char *); - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), zgebrd_(integer *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); doublereal bignum, abstol; - extern /* Subroutine */ int zgelqf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgelqf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *); char rngtgk[1]; - extern /* Subroutine */ int zgeqrf_(integer 
*, integer *, doublecomplex *, + extern /* Subroutine */ void zgeqrf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ); integer itempr; - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *), zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer minwrk, maxwrk; doublereal smlnum; - extern /* Subroutine */ int zunmbr_(char *, char *, char *, integer *, + extern /* Subroutine */ void zunmbr_(char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer * ); logical lquery, wantvt; - extern /* Subroutine */ int zunmlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunmlq_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); doublereal dum[1], eps; - extern /* Subroutine */ int dbdsvdx_(char *, char *, char *, integer *, + extern /* Subroutine */ void dbdsvdx_(char *, char *, char *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *); @@ -1052,15 +1053,15 @@ static integer c_n1 = -1; if (*info != 0) { i__2 = -(*info); xerbla_("ZGESVDX", &i__2, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Set singular values indices accord to RANGE='A'. 
*/ @@ -1504,7 +1505,7 @@ static integer c_n1 = -1; z__1.r = d__1, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; /* End of ZGESVDX */ diff --git a/lapack-netlib/SRC/zgesvj.c b/lapack-netlib/SRC/zgesvj.c index b352266631..b94112af01 100644 --- a/lapack-netlib/SRC/zgesvj.c +++ b/lapack-netlib/SRC/zgesvj.c @@ -868,7 +868,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgesvj_(char *joba, char *jobu, char *jobv, integer *m, +/* Subroutine */ void zgesvj_(char *joba, char *jobu, char *jobv, integer *m, integer *n, doublecomplex *a, integer *lda, doublereal *sva, integer * mv, doublecomplex *v, integer *ldv, doublecomplex *cwork, integer * lwork, doublereal *rwork, integer *lrwork, integer *info) @@ -887,7 +887,7 @@ f"> */ doublereal bigtheta; doublecomplex ompq; integer pskipped; - extern /* Subroutine */ int zrot_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zrot_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *); doublereal aapp0, aapq1, temp1; integer i__, p, q; @@ -901,14 +901,14 @@ f"> */ doublecomplex *, integer *, doublecomplex *, integer *); logical lower, upper, rotok; integer n2, n4; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal rootsfmin; extern doublereal dznrm2_(integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zgsvj0_(char *, integer *, integer *, + extern /* Subroutine */ void zgsvj0_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublereal *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, 
doublereal *, integer *, doublecomplex *, integer *, integer *), zgsvj1_(char *, integer *, integer *, integer *, @@ -919,25 +919,25 @@ f"> */ doublereal cs; extern doublereal dlamch_(char *); doublereal sn; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); extern integer idamax_(integer *, doublereal *, integer *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer ijblsk, swband; - extern /* Subroutine */ int zdscal_(integer *, doublereal *, + extern /* Subroutine */ void zdscal_(integer *, doublereal *, doublecomplex *, integer *); integer blskip; doublereal mxaapq; - extern /* Subroutine */ int zlascl_(char *, integer *, integer *, + extern /* Subroutine */ void zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *); doublereal thsign; - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); doublereal mxsinj; integer ir1; - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); integer emptsw; logical lquery; @@ -1021,18 +1021,18 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGESVJ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { i__1 = *m + *n; cwork[1].r = (doublereal) i__1, cwork[1].i = 0.; rwork[1] = (doublereal) f2cmax(*n,6); - return 0; + return; } /* #:) Quick return for void matrix */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Set numerical parameters */ @@ -1074,7 +1074,7 @@ f"> */ *info = -4; i__1 = -(*info); xerbla_("ZGESVJ", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize the 
right singular vector matrix. */ @@ -1112,7 +1112,7 @@ f"> */ *info = -6; i__2 = -(*info); xerbla_("ZGESVJ", &i__2, (ftnlen)6); - return 0; + return; } aaqq = sqrt(aaqq); if (aapp < big / aaqq && noscale) { @@ -1142,7 +1142,7 @@ f"> */ *info = -6; i__2 = -(*info); xerbla_("ZGESVJ", &i__2, (ftnlen)6); - return 0; + return; } aaqq = sqrt(aaqq); if (aapp < big / aaqq && noscale) { @@ -1172,7 +1172,7 @@ f"> */ *info = -6; i__2 = -(*info); xerbla_("ZGESVJ", &i__2, (ftnlen)6); - return 0; + return; } aaqq = sqrt(aaqq); if (aapp < big / aaqq && noscale) { @@ -1228,7 +1228,7 @@ f"> */ rwork[4] = 0.; rwork[5] = 0.; rwork[6] = 0.; - return 0; + return; } /* #:) Quick return for one-column matrix */ @@ -1248,7 +1248,7 @@ f"> */ rwork[4] = 0.; rwork[5] = 0.; rwork[6] = 0.; - return 0; + return; } /* Protect small singular values from underflow, and try to */ @@ -2272,6 +2272,6 @@ f"> */ /* MXSINJ is the largest absolute value of the sines of Jacobi angles */ /* in the last sweep */ - return 0; + return; } /* zgesvj_ */ diff --git a/lapack-netlib/SRC/zgesvx.c b/lapack-netlib/SRC/zgesvx.c index 58ff9f6c1c..7a8538c351 100644 --- a/lapack-netlib/SRC/zgesvx.c +++ b/lapack-netlib/SRC/zgesvx.c @@ -857,7 +857,7 @@ f"> */ /* > \ingroup complex16GEsolve */ /* ===================================================================== */ -/* Subroutine */ int zgesvx_(char *fact, char *trans, integer *n, integer * +/* Subroutine */ void zgesvx_(char *fact, char *trans, integer *n, integer * nrhs, doublecomplex *a, integer *lda, doublecomplex *af, integer * ldaf, integer *ipiv, char *equed, doublereal *r__, doublereal *c__, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, @@ -884,7 +884,7 @@ f"> */ extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); doublereal bignum; - extern /* Subroutine */ int zlaqge_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zlaqge_(integer *, integer *, doublecomplex *, integer *, 
doublereal *, doublereal *, doublereal *, doublereal * , doublereal *, char *), zgecon_(char *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, @@ -892,16 +892,18 @@ f"> */ integer infequ; logical colequ; doublereal rowcnd; - extern /* Subroutine */ int zgeequ_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeequ_(integer *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublereal *, doublereal * , doublereal *, integer *); logical notran; - extern /* Subroutine */ int zgerfs_(char *, integer *, integer *, + extern /* Subroutine */ void zgerfs_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, doublereal *, - integer *), zgetrf_(integer *, integer *, doublecomplex *, - integer *, integer *, integer *), zlacpy_(char *, integer *, + integer *); + extern int zgetrf_(integer *, integer *, doublecomplex *, + integer *, integer *, integer *); + extern void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern doublereal zlantr_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublereal *); @@ -1034,7 +1036,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGESVX", &i__1, (ftnlen)6); - return 0; + return; } if (equil) { @@ -1115,7 +1117,7 @@ f"> */ } rwork[1] = rpvgrw; *rcond = 0.; - return 0; + return; } } @@ -1208,7 +1210,7 @@ f"> */ } rwork[1] = rpvgrw; - return 0; + return; /* End of ZGESVX */ diff --git a/lapack-netlib/SRC/zgesvxx.c b/lapack-netlib/SRC/zgesvxx.c index 9a9e875b4f..ed19f0d395 100644 --- a/lapack-netlib/SRC/zgesvxx.c +++ b/lapack-netlib/SRC/zgesvxx.c @@ -1045,7 +1045,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16GEsolve */ /* ===================================================================== */ -/* 
Subroutine */ int zgesvxx_(char *fact, char *trans, integer *n, integer * +/* Subroutine */ void zgesvxx_(char *fact, char *trans, integer *n, integer * nrhs, doublecomplex *a, integer *lda, doublecomplex *af, integer * ldaf, integer *ipiv, char *equed, doublereal *r__, doublereal *c__, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, @@ -1073,22 +1073,22 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ logical nofact; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; - extern /* Subroutine */ int zlaqge_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zlaqge_(integer *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublereal *, doublereal * , doublereal *, char *); integer infequ; logical colequ; doublereal rowcnd; logical notran; - extern /* Subroutine */ int zgetrf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgetrf_(integer *, integer *, doublecomplex *, integer *, integer *, integer *), zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal smlnum; - extern /* Subroutine */ int zgetrs_(char *, integer *, integer *, + extern /* Subroutine */ void zgetrs_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); logical rowequ; - extern /* Subroutine */ int zlascl2_(integer *, integer *, doublereal *, + extern /* Subroutine */ void zlascl2_(integer *, integer *, doublereal *, doublecomplex *, integer *), zgeequb_(integer *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *), zgerfsx_( @@ -1233,7 +1233,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZGESVXX", &i__1, (ftnlen)7); - return 0; + return; } if (equil) { @@ -1299,7 +1299,7 @@ static inline void 
zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rpvgrw = zla_gerpvgrw_(n, info, &a[a_offset], lda, &af[ af_offset], ldaf); - return 0; + return; } } @@ -1330,7 +1330,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ zlascl2_(n, nrhs, &r__[1], &x[x_offset], ldx); } - return 0; + return; /* End of ZGESVXX */ diff --git a/lapack-netlib/SRC/zgetc2.c b/lapack-netlib/SRC/zgetc2.c index 0d024477a4..80ff7ab604 100644 --- a/lapack-netlib/SRC/zgetc2.c +++ b/lapack-netlib/SRC/zgetc2.c @@ -626,7 +626,7 @@ f"> */ /* > Umea University, S-901 87 Umea, Sweden. */ /* ===================================================================== */ -/* Subroutine */ int zgetc2_(integer *n, doublecomplex *a, integer *lda, +/* Subroutine */ void zgetc2_(integer *n, doublecomplex *a, integer *lda, integer *ipiv, integer *jpiv, integer *info) { /* System generated locals */ @@ -637,7 +637,7 @@ f"> */ /* Local variables */ doublereal smin, xmax; integer i__, j; - extern /* Subroutine */ int zgeru_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeru_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), dlabad_(doublereal *, @@ -670,7 +670,7 @@ f"> */ /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Set constants to control overflow */ @@ -691,7 +691,7 @@ f"> */ z__1.r = smlnum, z__1.i = 0.; a[i__1].r = z__1.r, a[i__1].i = z__1.i; } - return 0; + return; } /* Factorize A using complete pivoting. 
*/ @@ -772,7 +772,7 @@ f"> */ ipiv[*n] = *n; jpiv[*n] = *n; - return 0; + return; /* End of ZGETC2 */ diff --git a/lapack-netlib/SRC/zgetf2.c b/lapack-netlib/SRC/zgetf2.c index 1609b5d025..fdc8954288 100644 --- a/lapack-netlib/SRC/zgetf2.c +++ b/lapack-netlib/SRC/zgetf2.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup complex16GEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgetf2_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgetf2_(integer *m, integer *n, doublecomplex *a, integer *lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -633,7 +633,7 @@ f"> */ /* Local variables */ integer i__, j; doublereal sfmin; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zgeru_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, @@ -673,13 +673,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGETF2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Compute machine safe minimum */ @@ -740,7 +740,7 @@ f"> */ } /* L10: */ } - return 0; + return; /* End of ZGETF2 */ diff --git a/lapack-netlib/SRC/zgetrf.c b/lapack-netlib/SRC/zgetrf.c index 42dcee4304..d4f52efd3b 100644 --- a/lapack-netlib/SRC/zgetrf.c +++ b/lapack-netlib/SRC/zgetrf.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup complex16GEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgetrf_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgetrf_(integer *m, integer *n, doublecomplex *a, integer *lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -632,7 +632,7 @@ f"> */ /* Local variables */ integer i__, j, iinfo; - extern /* Subroutine */ int 
zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), ztrsm_(char *, char *, char *, char *, @@ -642,7 +642,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zlaswp_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlaswp_(integer *, doublecomplex *, integer *, integer *, integer *, integer *, integer *), zgetrf2_(integer *, integer *, doublecomplex *, integer *, integer *, integer *); @@ -676,13 +676,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGETRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Determine the block size for this environment. 
*/ @@ -761,7 +761,7 @@ f"> */ /* L20: */ } } - return 0; + return; /* End of ZGETRF */ diff --git a/lapack-netlib/SRC/zgetrf2.c b/lapack-netlib/SRC/zgetrf2.c index 44e1df00b7..679b10374e 100644 --- a/lapack-netlib/SRC/zgetrf2.c +++ b/lapack-netlib/SRC/zgetrf2.c @@ -625,7 +625,7 @@ static integer c__1 = 1; /* > \ingroup complex16GEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgetrf2_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zgetrf2_(integer *m, integer *n, doublecomplex *a, integer *lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -636,13 +636,13 @@ static integer c__1 = 1; doublecomplex temp; integer i__, iinfo; doublereal sfmin; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer n1, n2; - extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); extern doublereal dlamch_(char *); @@ -681,13 +681,13 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("ZGETRF2", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } if (*m == 1) { @@ -807,7 +807,7 @@ static integer c__1 = 1; zlaswp_(&n1, &a[a_dim1 + 1], lda, &i__1, &i__2, &ipiv[1], &c__1); } - return 0; + return; /* End of ZGETRF2 */ diff --git a/lapack-netlib/SRC/zgetri.c b/lapack-netlib/SRC/zgetri.c index fb0a721bd3..e7a46e719a 100644 --- a/lapack-netlib/SRC/zgetri.c +++ b/lapack-netlib/SRC/zgetri.c @@ -630,7 +630,7 @@ f"> */ /* > \ingroup 
complex16GEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgetri_(integer *n, doublecomplex *a, integer *lda, +/* Subroutine */ void zgetri_(integer *n, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *work, integer *lwork, integer *info) { /* System generated locals */ @@ -639,7 +639,7 @@ f"> */ /* Local variables */ integer i__, j, nbmin; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zgemv_(char *, integer *, integer *, @@ -695,15 +695,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGETRI", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form inv(U). If INFO > 0 from ZTRTRI, then U is singular, */ @@ -711,7 +711,7 @@ f"> */ ztrtri_("Upper", "Non-unit", n, &a[a_offset], lda, info); if (*info > 0) { - return 0; + return; } nbmin = 2; @@ -816,7 +816,7 @@ f"> */ } work[1].r = (doublereal) iws, work[1].i = 0.; - return 0; + return; /* End of ZGETRI */ diff --git a/lapack-netlib/SRC/zgetrs.c b/lapack-netlib/SRC/zgetrs.c index d81cf4dc2e..d9e10c3460 100644 --- a/lapack-netlib/SRC/zgetrs.c +++ b/lapack-netlib/SRC/zgetrs.c @@ -636,7 +636,7 @@ f"> */ /* > \ingroup complex16GEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgetrs_(char *trans, integer *n, integer *nrhs, +/* Subroutine */ void zgetrs_(char *trans, integer *n, integer *nrhs, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *b, integer *ldb, integer *info) { @@ -645,12 +645,12 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int ztrsm_(char *, char *, char 
*, char *, + extern /* Subroutine */ void ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *), - xerbla_(char *, integer *, ftnlen); + doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical notran; - extern /* Subroutine */ int zlaswp_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlaswp_(integer *, doublecomplex *, integer *, integer *, integer *, integer *, integer *); @@ -692,13 +692,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGETRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (notran) { @@ -737,7 +737,7 @@ f"> */ zlaswp_(nrhs, &b[b_offset], ldb, &c__1, n, &ipiv[1], &c_n1); } - return 0; + return; /* End of ZGETRS */ diff --git a/lapack-netlib/SRC/zgetsls.c b/lapack-netlib/SRC/zgetsls.c index 9d2e9e6d53..a63df472f1 100644 --- a/lapack-netlib/SRC/zgetsls.c +++ b/lapack-netlib/SRC/zgetsls.c @@ -674,7 +674,7 @@ static integer c__0 = 0; /* > \ingroup complex16GEsolve */ /* ===================================================================== */ -/* Subroutine */ int zgetsls_(char *trans, integer *m, integer *n, integer * +/* Subroutine */ void zgetsls_(char *trans, integer *m, integer *n, integer * nrhs, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *work, integer *lwork, integer *info) { @@ -689,13 +689,13 @@ static integer c__0 = 0; integer brow, tszm, tszo, info2, i__, j, iascl, ibscl; extern logical lsame_(char *, char *); integer minmn, maxmn; - extern /* Subroutine */ int zgelq_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgelq_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), zgeqr_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); 
doublecomplex workq[1]; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); extern doublereal dlamch_(char *); doublecomplex tq[5]; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -703,7 +703,7 @@ static integer c__0 = 0; doublereal bignum; extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); - extern /* Subroutine */ int zlascl_(char *, integer *, integer *, + extern /* Subroutine */ void zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *), zgemlq_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex * @@ -826,7 +826,7 @@ static integer c__0 = 0; xerbla_("ZGETSLS", &i__1, (ftnlen)7); d__1 = (doublereal) wsizeo; work[1].r = d__1, work[1].i = 0.; - return 0; + return; } if (lquery) { if (*lwork == -1) { @@ -837,7 +837,7 @@ static integer c__0 = 0; r__1 = (real) wsizem; work[1].r = r__1, work[1].i = 0.f; } - return 0; + return; } if (*lwork < wsizeo) { lw1 = tszm; @@ -854,7 +854,7 @@ static integer c__0 = 0; if (f2cmin(i__1,*nrhs) == 0) { i__1 = f2cmax(*m,*n); zlaset_("FULL", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); - return 0; + return; } /* Get machine parameters */ @@ -932,7 +932,7 @@ static integer c__0 = 0; ztrtrs_("U", "N", "N", n, nrhs, &a[a_offset], lda, &b[b_offset], ldb, info); if (*info > 0) { - return 0; + return; } scllen = *n; } else { @@ -945,7 +945,7 @@ static integer c__0 = 0; ldb, info); if (*info > 0) { - return 0; + return; } /* B(N+1:M,1:NRHS) = CZERO */ @@ -989,7 +989,7 @@ static integer c__0 = 0; ldb, info); if (*info > 0) { - return 0; + return; } /* B(M+1:N,1:NRHS) = 0 */ @@ -1031,7 +1031,7 @@ static integer c__0 = 0; ldb, info); if (*info > 0) { - return 0; + return; } scllen = *m; @@ -1060,7 +1060,7 @@ static integer c__0 = 0; L50: d__1 = (doublereal) (tszo + lwo); work[1].r = 
d__1, work[1].i = 0.; - return 0; + return; /* End of ZGETSLS */ diff --git a/lapack-netlib/SRC/zgetsqrhrt.c b/lapack-netlib/SRC/zgetsqrhrt.c index 093ee20ec2..6b426465fb 100644 --- a/lapack-netlib/SRC/zgetsqrhrt.c +++ b/lapack-netlib/SRC/zgetsqrhrt.c @@ -689,7 +689,7 @@ hrt.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgetsqrhrt_(integer *m, integer *n, integer *mb1, +/* Subroutine */ void zgetsqrhrt_(integer *m, integer *n, integer *mb1, integer *nb1, integer *nb2, doublecomplex *a, integer *lda, doublecomplex *t, integer *ldt, doublecomplex *work, integer *lwork, integer *info) @@ -701,16 +701,17 @@ hrt.f"> */ /* Local variables */ integer ldwt, lworkopt, i__, j, iinfo; - extern /* Subroutine */ int zungtsqr_row_(integer *, integer *, integer * + extern /* Subroutine */ void zungtsqr_row_(integer *, integer *, integer * , integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zunhr_col_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *) - , xerbla_(char *, integer *, ftnlen); + ; + extern int xerbla_(char *, integer *, ftnlen); logical lquery; integer lw1, lw2, num_all_row_blocks__, lwt, nb1local, nb2local; - extern /* Subroutine */ int zlatsqr_(integer *, integer *, integer *, + extern /* Subroutine */ void zlatsqr_(integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -812,11 +813,11 @@ hrt.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGETSQRHRT", &i__1, (ftnlen)10); - return 0; + return; } else if (lquery) { z__1.r = (doublereal) lworkopt, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; } /* Quick return if possible */ @@ -824,7 +825,7 @@ hrt.f"> */ if 
(f2cmin(*m,*n) == 0) { z__1.r = (doublereal) lworkopt, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; } nb2local = f2cmin(*nb2,*n); @@ -895,7 +896,7 @@ hrt.f"> */ z__1.r = (doublereal) lworkopt, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; /* End of ZGETSQRHRT */ diff --git a/lapack-netlib/SRC/zggbak.c b/lapack-netlib/SRC/zggbak.c index 091fa1d008..735ae22f99 100644 --- a/lapack-netlib/SRC/zggbak.c +++ b/lapack-netlib/SRC/zggbak.c @@ -656,7 +656,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zggbak_(char *job, char *side, integer *n, integer *ilo, +/* Subroutine */ void zggbak_(char *job, char *side, integer *n, integer *ilo, integer *ihi, doublereal *lscale, doublereal *rscale, integer *m, doublecomplex *v, integer *ldv, integer *info) { @@ -667,9 +667,10 @@ f"> */ integer i__, k; extern logical lsame_(char *, char *); logical leftv; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, - doublecomplex *, integer *), xerbla_(char *, integer *, ftnlen), - zdscal_(integer *, doublereal *, doublecomplex *, integer *); + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, + doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zdscal_(integer *, doublereal *, doublecomplex *, integer *); logical rightv; @@ -719,19 +720,19 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGGBAK", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*m == 0) { - return 0; + return; } if (lsame_(job, "N")) { - return 0; + return; } if (*ilo == *ihi) { @@ -836,7 +837,7 @@ f"> */ L110: - return 0; + return; /* End of ZGGBAK */ diff --git a/lapack-netlib/SRC/zggbal.c b/lapack-netlib/SRC/zggbal.c index 4d18110855..673ad9d14a 100644 --- a/lapack-netlib/SRC/zggbal.c +++ 
b/lapack-netlib/SRC/zggbal.c @@ -691,7 +691,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zggbal_(char *job, integer *n, doublecomplex *a, integer +/* Subroutine */ void zggbal_(char *job, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, integer *ilo, integer *ihi, doublereal *lscale, doublereal *rscale, doublereal *work, integer * info) @@ -710,15 +710,15 @@ f"> */ doublereal coef2, coef5; integer i__, j, k, l, m; doublereal gamma, t, alpha; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); doublereal sfmin, sfmax; integer iflow; - extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void daxpy_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer kount; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer jc; doublereal ta, tb, tc; @@ -727,7 +727,8 @@ f"> */ doublereal ew; integer nr; doublereal pgamma; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); integer lsfmin; extern integer izamax_(integer *, doublecomplex *, integer *); @@ -773,7 +774,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGGBAL", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -781,7 +782,7 @@ f"> */ if (*n == 0) { *ilo = 1; *ihi = *n; - return 0; + return; } if (*n == 1) { @@ -789,7 +790,7 @@ f"> */ *ihi = *n; lscale[1] = 1.; rscale[1] = 1.; - return 0; + return; } if (lsame_(job, "N")) { @@ -801,7 +802,7 @@ 
f"> */ rscale[i__] = 1.; /* L10: */ } - return 0; + return; } k = 1; @@ -946,11 +947,11 @@ f"> */ rscale[i__] = 1.; /* L195: */ } - return 0; + return; } if (*ilo == *ihi) { - return 0; + return; } /* Balance the submatrix in rows ILO to IHI. */ @@ -1213,7 +1214,7 @@ f"> */ /* L380: */ } - return 0; + return; /* End of ZGGBAL */ diff --git a/lapack-netlib/SRC/zgges.c b/lapack-netlib/SRC/zgges.c index 2ac9479a1e..e8858d7e12 100644 --- a/lapack-netlib/SRC/zgges.c +++ b/lapack-netlib/SRC/zgges.c @@ -784,7 +784,7 @@ or GE matrices */ /* > \ingroup complex16GEeigen */ /* ===================================================================== */ -/* Subroutine */ int zgges_(char *jobvsl, char *jobvsr, char *sort, L_fp +/* Subroutine */ void zgges_(char *jobvsl, char *jobvsr, char *sort, L_fp selctg, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, integer *sdim, doublecomplex *alpha, doublecomplex * beta, doublecomplex *vsl, integer *ldvsl, doublecomplex *vsr, integer @@ -804,9 +804,9 @@ or GE matrices */ integer ileft, icols; logical cursl, ilvsl, ilvsr; integer irwrk, irows; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); extern doublereal dlamch_(char *); - extern /* Subroutine */ int zggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zggbak_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublecomplex *, integer *, integer *), zggbal_(char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer * @@ -819,21 +819,21 @@ or GE matrices */ integer *, doublereal *); doublereal bignum; integer ijobvl, iright; - extern /* Subroutine */ int zgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgghrd_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, 
integer *, integer * ), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *); integer ijobvr; - extern /* Subroutine */ int zgeqrf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeqrf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ); doublereal anrmto; integer lwkmin; logical lastsl; doublereal bnrmto; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *), zhgeqz_( @@ -850,7 +850,7 @@ or GE matrices */ doublereal smlnum; logical wantst, lquery; integer lwkopt; - extern /* Subroutine */ int zungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void zungqr_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, @@ -972,16 +972,16 @@ or GE matrices */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGGES ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *sdim = 0; - return 0; + return; } /* Get machine constants */ @@ -1193,7 +1193,7 @@ or GE matrices */ work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZGGES */ diff --git a/lapack-netlib/SRC/zgges3.c b/lapack-netlib/SRC/zgges3.c index 3c1103761e..ac9ce770fa 100644 --- a/lapack-netlib/SRC/zgges3.c +++ b/lapack-netlib/SRC/zgges3.c @@ -783,7 +783,7 @@ f"> */ /* > \ingroup complex16GEeigen */ /* ===================================================================== */ -/* Subroutine */ int zgges3_(char *jobvsl, char *jobvsr, char *sort, L_fp +/* 
Subroutine */ void zgges3_(char *jobvsl, char *jobvsr, char *sort, L_fp selctg, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, integer *sdim, doublecomplex *alpha, doublecomplex * beta, doublecomplex *vsl, integer *ldvsl, doublecomplex *vsr, integer @@ -804,13 +804,13 @@ f"> */ integer ileft, icols; logical cursl, ilvsl, ilvsr; integer irwrk, irows; - extern /* Subroutine */ int zgghd3_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgghd3_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), dlabad_( doublereal *, doublereal *); extern doublereal dlamch_(char *); - extern /* Subroutine */ int zggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zggbak_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublecomplex *, integer *, integer *), zggbal_(char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer * @@ -821,16 +821,16 @@ f"> */ integer *, doublereal *); doublereal bignum; integer ijobvl, iright; - extern /* Subroutine */ int zlascl_(char *, integer *, integer *, + extern /* Subroutine */ void zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *); integer ijobvr; - extern /* Subroutine */ int zgeqrf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeqrf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ); doublereal anrmto, bnrmto; logical lastsl; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, 
integer *), zhgeqz_( @@ -847,7 +847,7 @@ f"> */ doublereal smlnum; logical wantst, lquery; integer lwkopt; - extern /* Subroutine */ int zungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void zungqr_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, @@ -987,16 +987,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGGES3 ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *sdim = 0; - return 0; + return; } /* Get machine constants */ @@ -1202,7 +1202,7 @@ f"> */ z__1.r = (doublereal) lwkopt, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; /* End of ZGGES3 */ diff --git a/lapack-netlib/SRC/zggesx.c b/lapack-netlib/SRC/zggesx.c index a735cf09ee..ea98b0e1ab 100644 --- a/lapack-netlib/SRC/zggesx.c +++ b/lapack-netlib/SRC/zggesx.c @@ -843,7 +843,7 @@ f"> */ /* > \ingroup complex16GEeigen */ /* ===================================================================== */ -/* Subroutine */ int zggesx_(char *jobvsl, char *jobvsr, char *sort, L_fp +/* Subroutine */ void zggesx_(char *jobvsl, char *jobvsr, char *sort, L_fp selctg, char *sense, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, integer *sdim, doublecomplex *alpha, doublecomplex *beta, doublecomplex *vsl, integer *ldvsl, @@ -863,10 +863,10 @@ f"> */ integer ileft, icols; logical cursl, ilvsl, ilvsr; integer irwrk, irows; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); extern doublereal dlamch_(char *); doublereal pl, pr; - extern /* Subroutine */ int zggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zggbak_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal 
*, integer *, doublecomplex *, integer *, integer *), zggbal_(char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer * @@ -879,7 +879,7 @@ f"> */ integer *, doublereal *); doublereal bignum; integer ijobvl, iright; - extern /* Subroutine */ int zgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgghrd_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer * ), zlascl_(char *, integer *, integer *, @@ -890,14 +890,14 @@ f"> */ integer liwmin; logical wantse, lastsl; doublereal anrmto, bnrmto; - extern /* Subroutine */ int zgeqrf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeqrf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ); integer maxwrk; logical wantsn; integer minwrk; doublereal smlnum; - extern /* Subroutine */ int zhgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void zhgeqz_(char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, @@ -906,7 +906,7 @@ f"> */ , integer *), zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); logical wantst, lquery, wantsv; - extern /* Subroutine */ int ztgsen_(integer *, logical *, logical *, + extern /* Subroutine */ void ztgsen_(integer *, logical *, logical *, logical *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, doublereal *, @@ -1068,16 +1068,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGGESX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if 
possible */ if (*n == 0) { *sdim = 0; - return 0; + return; } /* Get machine constants */ @@ -1315,7 +1315,7 @@ f"> */ work[1].r = (doublereal) maxwrk, work[1].i = 0.; iwork[1] = liwmin; - return 0; + return; /* End of ZGGESX */ diff --git a/lapack-netlib/SRC/zggev.c b/lapack-netlib/SRC/zggev.c index 56c6679466..a93883a730 100644 --- a/lapack-netlib/SRC/zggev.c +++ b/lapack-netlib/SRC/zggev.c @@ -734,7 +734,7 @@ ices */ /* > \ingroup complex16GEeigen */ /* ===================================================================== */ -/* Subroutine */ int zggev_(char *jobvl, char *jobvr, integer *n, +/* Subroutine */ void zggev_(char *jobvl, char *jobvr, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *alpha, doublecomplex *beta, doublecomplex *vl, integer *ldvl, doublecomplex *vr, integer *ldvr, doublecomplex *work, integer @@ -754,11 +754,11 @@ ices */ integer iwrk; extern logical lsame_(char *, char *); integer ileft, icols, irwrk, irows; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); integer jc, in; extern doublereal dlamch_(char *); integer jr; - extern /* Subroutine */ int zggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zggbak_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublecomplex *, integer *, integer *), zggbal_(char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer * @@ -773,20 +773,20 @@ ices */ extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); integer ijobvl, iright; - extern /* Subroutine */ int zgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgghrd_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer * ), zlascl_(char *, integer *, 
integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *); integer ijobvr; - extern /* Subroutine */ int zgeqrf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeqrf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ); doublereal anrmto; integer lwkmin; doublereal bnrmto; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *), ztgevc_( @@ -801,7 +801,7 @@ ices */ doublereal smlnum; integer lwkopt; logical lquery; - extern /* Subroutine */ int zungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void zungqr_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, @@ -920,15 +920,15 @@ ices */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGGEV ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1171,7 +1171,7 @@ ices */ } work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZGGEV */ diff --git a/lapack-netlib/SRC/zggev3.c b/lapack-netlib/SRC/zggev3.c index 6df791b846..084d205176 100644 --- a/lapack-netlib/SRC/zggev3.c +++ b/lapack-netlib/SRC/zggev3.c @@ -733,7 +733,7 @@ f"> */ /* > \ingroup complex16GEeigen */ /* ===================================================================== */ -/* Subroutine */ int zggev3_(char *jobvl, char *jobvr, integer *n, +/* Subroutine */ void zggev3_(char *jobvl, char *jobvr, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer 
*ldb, doublecomplex *alpha, doublecomplex *beta, doublecomplex *vl, integer *ldvl, doublecomplex *vr, integer *ldvr, doublecomplex *work, integer @@ -753,7 +753,7 @@ f"> */ integer iwrk; extern logical lsame_(char *, char *); integer ileft, icols, irwrk, irows; - extern /* Subroutine */ int zgghd3_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgghd3_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), dlabad_( @@ -761,7 +761,7 @@ f"> */ integer jc, in; extern doublereal dlamch_(char *); integer jr; - extern /* Subroutine */ int zggbak_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zggbak_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublecomplex *, integer *, integer *), zggbal_(char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer * @@ -774,15 +774,15 @@ f"> */ extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); integer ijobvl, iright; - extern /* Subroutine */ int zlascl_(char *, integer *, integer *, + extern /* Subroutine */ void zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *); integer ijobvr; - extern /* Subroutine */ int zgeqrf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeqrf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ); doublereal anrmto, bnrmto; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *), ztgevc_( @@ -797,7 +797,7 @@ f"> */ 
doublereal smlnum; integer lwkopt; logical lquery; - extern /* Subroutine */ int zungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void zungqr_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, @@ -939,15 +939,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGGEV3 ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1185,7 +1185,7 @@ f"> */ z__1.r = (doublereal) lwkopt, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; /* End of ZGGEV3 */ diff --git a/lapack-netlib/SRC/zggevx.c b/lapack-netlib/SRC/zggevx.c index 7c66fb7dbd..b04cf7fd6b 100644 --- a/lapack-netlib/SRC/zggevx.c +++ b/lapack-netlib/SRC/zggevx.c @@ -888,7 +888,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zggevx_(char *balanc, char *jobvl, char *jobvr, char * +/* Subroutine */ void zggevx_(char *balanc, char *jobvl, char *jobvr, char * sense, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *alpha, doublecomplex *beta, doublecomplex *vl, integer *ldvl, doublecomplex *vr, integer *ldvr, @@ -913,11 +913,11 @@ f"> */ integer icols; logical noscl; integer irows; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); integer jc, in; extern doublereal dlamch_(char *); integer jr; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), zggbak_(char *, char *, integer *, integer *, 
integer *, doublereal *, doublereal *, integer *, @@ -935,7 +935,7 @@ f"> */ extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); integer ijobvl; - extern /* Subroutine */ int zgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgghrd_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer * ), zlascl_(char *, integer *, integer *, @@ -943,13 +943,13 @@ f"> */ integer *, integer *); integer ijobvr; logical wantsb; - extern /* Subroutine */ int zgeqrf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeqrf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ); doublereal anrmto; logical wantse; doublereal bnrmto; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *), ztgevc_( @@ -962,7 +962,7 @@ f"> */ *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *, integer *); integer minwrk; - extern /* Subroutine */ int zhgeqz_(char *, char *, char *, integer *, + extern /* Subroutine */ void zhgeqz_(char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, @@ -971,7 +971,7 @@ f"> */ logical wantsn; doublereal smlnum; logical lquery, wantsv; - extern /* Subroutine */ int zungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void zungqr_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), 
zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, @@ -1115,15 +1115,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGGEVX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1431,7 +1431,7 @@ f"> */ } work[1].r = (doublereal) maxwrk, work[1].i = 0.; - return 0; + return; /* End of ZGGEVX */ diff --git a/lapack-netlib/SRC/zggglm.c b/lapack-netlib/SRC/zggglm.c index 87e2a9af4e..cc2dcd8fd3 100644 --- a/lapack-netlib/SRC/zggglm.c +++ b/lapack-netlib/SRC/zggglm.c @@ -699,7 +699,7 @@ f"> */ /* > \ingroup complex16OTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int zggglm_(integer *n, integer *m, integer *p, +/* Subroutine */ void zggglm_(integer *n, integer *m, integer *p, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *d__, doublecomplex *x, doublecomplex *y, doublecomplex *work, integer *lwork, integer *info) @@ -710,7 +710,7 @@ f"> */ /* Local variables */ integer lopt, i__; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, @@ -719,17 +719,18 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zggqrf_(integer *, integer *, integer *, + extern /* Subroutine */ void zggqrf_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *) ; integer lwkmin, nb1, nb2, 
nb3, nb4, lwkopt; logical lquery; - extern /* Subroutine */ int zunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), zunmrq_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *, integer *), ztrtrs_(char *, char *, char *, integer *, + doublecomplex *, integer *, doublecomplex *, integer *, integer *); + extern int ztrtrs_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -804,9 +805,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGGGLM", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -822,7 +823,7 @@ f"> */ i__2 = i__; y[i__2].r = 0., y[i__2].i = 0.; } - return 0; + return; } /* Compute the GQR factorization of matrices A and B: */ @@ -863,7 +864,7 @@ f"> */ if (*info > 0) { *info = 1; - return 0; + return; } i__1 = *n - *m; @@ -894,7 +895,7 @@ f"> */ if (*info > 0) { *info = 2; - return 0; + return; } /* Copy D to X */ @@ -917,7 +918,7 @@ f"> */ i__1 = *m + np + f2cmax(i__2,i__3); work[1].r = (doublereal) i__1, work[1].i = 0.; - return 0; + return; /* End of ZGGGLM */ diff --git a/lapack-netlib/SRC/zggglm.f b/lapack-netlib/SRC/zggglm.f index 6c24131aa3..62b4acdec3 100644 --- a/lapack-netlib/SRC/zggglm.f +++ b/lapack-netlib/SRC/zggglm.f @@ -289,7 +289,7 @@ SUBROUTINE ZGGGLM( N, M, P, A, LDA, B, LDB, D, X, Y, WORK, LWORK, * CALL ZGGQRF( N, M, P, A, LDA, WORK, B, LDB, WORK( M+1 ), $ WORK( M+NP+1 ), LWORK-M-NP, INFO ) - LOPT = DBLE( WORK( M+NP+1 ) ) + LOPT = INT( WORK( M+NP+1 ) ) * * Update left-hand-side vector d = Q**H*d = ( d1 ) M * ( d2 ) N-M diff --git a/lapack-netlib/SRC/zgghd3.c b/lapack-netlib/SRC/zgghd3.c index 
0347a460f5..ff20280f30 100644 --- a/lapack-netlib/SRC/zgghd3.c +++ b/lapack-netlib/SRC/zgghd3.c @@ -745,7 +745,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgghd3_(char *compq, char *compz, integer *n, integer * +/* Subroutine */ void zgghd3_(char *compq, char *compz, integer *n, integer * ilo, integer *ihi, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *q, integer *ldq, doublecomplex *z__, integer *ldz, doublecomplex *work, integer *lwork, integer *info) @@ -760,7 +760,7 @@ f"> */ integer cola, jcol, ierr; doublecomplex temp; integer jrow, topq, ppwo; - extern /* Subroutine */ int zrot_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zrot_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *); doublecomplex temp1, temp2, temp3; doublereal c__; @@ -769,33 +769,33 @@ f"> */ extern logical lsame_(char *, char *); integer nbmin; doublecomplex ctemp; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer nblst; logical initq; doublecomplex c1, c2; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical wantq; integer j0; logical initz; - extern /* Subroutine */ int zunm22_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunm22_(char *, char *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *) ; logical wantz; doublecomplex s1, s2; - extern 
/* Subroutine */ int ztrmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ztrmv_(char *, char *, char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); char compq2[1], compz2[1]; integer nb, jj, nh, nx, pw; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zgghrd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgghrd_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer * ), zlaset_(char *, integer *, integer *, @@ -873,9 +873,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGGHD3", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Initialize Q and Z if desired. */ @@ -900,7 +900,7 @@ f"> */ nh = *ihi - *ilo + 1; if (nh <= 1) { work[1].r = 1., work[1].i = 0.; - return 0; + return; } /* Determine the blocksize. 
*/ @@ -1757,7 +1757,7 @@ f"> */ z__1.r = (doublereal) lwkopt, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; /* End of ZGGHD3 */ diff --git a/lapack-netlib/SRC/zgghrd.c b/lapack-netlib/SRC/zgghrd.c index 4321aaf2ff..62ba9a31f5 100644 --- a/lapack-netlib/SRC/zgghrd.c +++ b/lapack-netlib/SRC/zgghrd.c @@ -718,7 +718,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zgghrd_(char *compq, char *compz, integer *n, integer * +/* Subroutine */ void zgghrd_(char *compq, char *compz, integer *n, integer * ilo, integer *ihi, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *q, integer *ldq, doublecomplex *z__, integer *ldz, integer *info) @@ -730,7 +730,7 @@ f"> */ /* Local variables */ integer jcol, jrow; - extern /* Subroutine */ int zrot_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zrot_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *); doublereal c__; doublecomplex s; @@ -738,7 +738,7 @@ f"> */ doublecomplex ctemp; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer icompq, icompz; - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *), zlartg_(doublecomplex *, doublecomplex *, doublereal *, doublecomplex *, doublecomplex *); logical ilq, ilz; @@ -823,7 +823,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGGHRD", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize Q and Z if desired. 
*/ @@ -838,7 +838,7 @@ f"> */ /* Quick return if possible */ if (*n <= 1) { - return 0; + return; } /* Zero out lower triangle of B */ @@ -904,7 +904,7 @@ f"> */ /* L40: */ } - return 0; + return; /* End of ZGGHRD */ diff --git a/lapack-netlib/SRC/zgglse.c b/lapack-netlib/SRC/zgglse.c index e0c3e3b5e3..47b30742c5 100644 --- a/lapack-netlib/SRC/zgglse.c +++ b/lapack-netlib/SRC/zgglse.c @@ -694,7 +694,7 @@ f"> */ /* > \ingroup complex16OTHERsolve */ /* ===================================================================== */ -/* Subroutine */ int zgglse_(integer *m, integer *n, integer *p, +/* Subroutine */ void zgglse_(integer *m, integer *n, integer *p, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *c__, doublecomplex *d__, doublecomplex *x, doublecomplex *work, integer *lwork, integer *info) @@ -705,7 +705,7 @@ f"> */ /* Local variables */ integer lopt; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, @@ -717,17 +717,17 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zggrqf_(integer *, integer *, integer *, + extern /* Subroutine */ void zggrqf_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *) ; integer lwkmin, nb1, nb2, nb3, nb4, lwkopt; logical lquery; - extern /* Subroutine */ int zunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, 
doublecomplex *, integer *, integer *), zunmrq_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *, integer *), ztrtrs_(char *, char *, char *, integer *, + doublecomplex *, integer *, doublecomplex *, integer *, integer *); extern int ztrtrs_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -802,15 +802,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGGLSE", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Compute the GRQ factorization of matrices B and A: */ @@ -848,7 +848,7 @@ f"> */ if (*info > 0) { *info = 1; - return 0; + return; } /* Put the solution in X */ @@ -873,7 +873,7 @@ f"> */ if (*info > 0) { *info = 2; - return 0; + return; } /* Put the solutions in X */ @@ -914,7 +914,7 @@ f"> */ i__1 = *p + mn + f2cmax(i__2,i__3); work[1].r = (doublereal) i__1, work[1].i = 0.; - return 0; + return; /* End of ZGGLSE */ diff --git a/lapack-netlib/SRC/zgglse.f b/lapack-netlib/SRC/zgglse.f index e5869a7d40..cc558bc407 100644 --- a/lapack-netlib/SRC/zgglse.f +++ b/lapack-netlib/SRC/zgglse.f @@ -276,7 +276,7 @@ SUBROUTINE ZGGLSE( M, N, P, A, LDA, B, LDB, C, D, X, WORK, LWORK, * CALL ZGGRQF( P, M, N, B, LDB, WORK, A, LDA, WORK( P+1 ), $ WORK( P+MN+1 ), LWORK-P-MN, INFO ) - LOPT = DBLE( WORK( P+MN+1 ) ) + LOPT = INT( WORK( P+MN+1 ) ) * * Update c = Z**H *c = ( c1 ) N-P * ( c2 ) M+P-N diff --git a/lapack-netlib/SRC/zggqrf.c b/lapack-netlib/SRC/zggqrf.c index 7962a60eae..afe06bcfca 100644 --- a/lapack-netlib/SRC/zggqrf.c +++ b/lapack-netlib/SRC/zggqrf.c @@ -728,7 +728,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zggqrf_(integer *n, integer *m, integer *p, +/* Subroutine */ void zggqrf_(integer *n, 
integer *m, integer *p, doublecomplex *a, integer *lda, doublecomplex *taua, doublecomplex *b, integer *ldb, doublecomplex *taub, doublecomplex *work, integer * lwork, integer *info) @@ -741,13 +741,13 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zgeqrf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeqrf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ), zgerqf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); integer nb1, nb2, nb3, lwkopt; logical lquery; - extern /* Subroutine */ int zunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -809,9 +809,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGGQRF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* QR factorization of N-by-M matrix A: A = Q*R */ @@ -836,7 +836,7 @@ f"> */ i__1 = f2cmax(i__2,i__3); work[1].r = (doublereal) i__1, work[1].i = 0.; - return 0; + return; /* End of ZGGQRF */ diff --git a/lapack-netlib/SRC/zggqrf.f b/lapack-netlib/SRC/zggqrf.f index 93b1dc0fc6..0388b08743 100644 --- a/lapack-netlib/SRC/zggqrf.f +++ b/lapack-netlib/SRC/zggqrf.f @@ -276,7 +276,7 @@ SUBROUTINE ZGGQRF( N, M, P, A, LDA, TAUA, B, LDB, TAUB, WORK, * QR factorization of N-by-M matrix A: A = Q*R * CALL ZGEQRF( N, M, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = DBLE( WORK( 1 ) ) + LOPT = INT( WORK( 1 ) ) * * Update B := Q**H*B. 
* diff --git a/lapack-netlib/SRC/zggrqf.c b/lapack-netlib/SRC/zggrqf.c index 7c12e53871..789c1c856f 100644 --- a/lapack-netlib/SRC/zggrqf.c +++ b/lapack-netlib/SRC/zggrqf.c @@ -727,7 +727,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zggrqf_(integer *m, integer *p, integer *n, +/* Subroutine */ void zggrqf_(integer *m, integer *p, integer *n, doublecomplex *a, integer *lda, doublecomplex *taua, doublecomplex *b, integer *ldb, doublecomplex *taub, doublecomplex *work, integer * lwork, integer *info) @@ -740,13 +740,13 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zgeqrf_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeqrf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer * ), zgerqf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); integer nb1, nb2, nb3, lwkopt; logical lquery; - extern /* Subroutine */ int zunmrq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunmrq_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -808,9 +808,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGGRQF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* RQ factorization of M-by-N matrix A: A = R*Q */ @@ -837,7 +837,7 @@ f"> */ i__1 = f2cmax(i__2,i__3); work[1].r = (doublereal) i__1, work[1].i = 0.; - return 0; + return; /* End of ZGGRQF */ diff --git a/lapack-netlib/SRC/zggrqf.f b/lapack-netlib/SRC/zggrqf.f index a2d4a9d553..be912c7726 100644 --- a/lapack-netlib/SRC/zggrqf.f +++ b/lapack-netlib/SRC/zggrqf.f @@ 
-275,7 +275,7 @@ SUBROUTINE ZGGRQF( M, P, N, A, LDA, TAUA, B, LDB, TAUB, WORK, * RQ factorization of M-by-N matrix A: A = R*Q * CALL ZGERQF( M, N, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = DBLE( WORK( 1 ) ) + LOPT = INT( WORK( 1 ) ) * * Update B := B*Q**H * diff --git a/lapack-netlib/SRC/zggsvd3.c b/lapack-netlib/SRC/zggsvd3.c index bae5f7532c..a8bc8de191 100644 --- a/lapack-netlib/SRC/zggsvd3.c +++ b/lapack-netlib/SRC/zggsvd3.c @@ -865,7 +865,7 @@ static integer c__1 = 1; /* > ZGGSVD3 replaces the deprecated subroutine ZGGSVD. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zggsvd3_(char *jobu, char *jobv, char *jobq, integer *m, +/* Subroutine */ void zggsvd3_(char *jobu, char *jobv, char *jobq, integer *m, integer *n, integer *p, integer *k, integer *l, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublereal *alpha, doublereal *beta, doublecomplex *u, integer *ldu, doublecomplex *v, @@ -885,14 +885,14 @@ static integer c__1 = 1; integer ncallmycycle, i__, j; extern logical lsame_(char *, char *); doublereal anorm, bnorm; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); logical wantq, wantu, wantv; extern doublereal dlamch_(char *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); - extern /* Subroutine */ int ztgsja_(char *, char *, char *, integer *, + extern /* Subroutine */ void ztgsja_(char *, char *, char *, integer *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublecomplex *, integer *, @@ -900,7 +900,7 @@ static integer c__1 = 1; doublecomplex *, integer *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int 
zggsvp3_(char *, char *, char *, integer *, + extern /* Subroutine */ void zggsvp3_(char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, @@ -997,10 +997,10 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("ZGGSVD3", &i__1, (ftnlen)7); - return 0; + return; } if (lquery) { - return 0; + return; } /* Compute the Frobenius norm of matrices A and B */ @@ -1063,7 +1063,7 @@ static integer c__1 = 1; z__1.r = (doublereal) lwkopt, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; /* End of ZGGSVD3 */ diff --git a/lapack-netlib/SRC/zggsvp3.c b/lapack-netlib/SRC/zggsvp3.c index b639c03cc4..e78bbe74ea 100644 --- a/lapack-netlib/SRC/zggsvp3.c +++ b/lapack-netlib/SRC/zggsvp3.c @@ -791,7 +791,7 @@ static integer c_n1 = -1; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zggsvp3_(char *jobu, char *jobv, char *jobq, integer *m, +/* Subroutine */ void zggsvp3_(char *jobu, char *jobv, char *jobq, integer *m, integer *p, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublereal *tola, doublereal *tolb, integer *k, integer *l, doublecomplex *u, integer *ldu, doublecomplex *v, integer @@ -808,7 +808,7 @@ static integer c_n1 = -1; integer i__, j; extern logical lsame_(char *, char *); logical wantq, wantu, wantv; - extern /* Subroutine */ int zgeqp3_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeqp3_(integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, doublecomplex *, integer * , doublereal *, integer *), zgeqr2_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, @@ -820,11 +820,13 @@ static integer c_n1 = -1; *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), 
zunmr2_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *), xerbla_(char *, integer *, ftnlen), zlacpy_(char *, + doublecomplex *, integer *, doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); logical forwrd; - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *), zlapmt_(logical *, integer *, integer *, doublecomplex *, integer *, integer *); integer lwkopt; @@ -930,10 +932,10 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("ZGGSVP3", &i__1, (ftnlen)7); - return 0; + return; } if (lquery) { - return 0; + return; } /* QR with column pivoting of B: B*P = V*( S11 S12 ) */ @@ -1189,7 +1191,7 @@ static integer c_n1 = -1; z__1.r = (doublereal) lwkopt, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; /* End of ZGGSVP3 */ diff --git a/lapack-netlib/SRC/zgsvj0.c b/lapack-netlib/SRC/zgsvj0.c index bbe87c27b2..c300fcd93d 100644 --- a/lapack-netlib/SRC/zgsvj0.c +++ b/lapack-netlib/SRC/zgsvj0.c @@ -732,7 +732,7 @@ f"> */ /* > drmac@math.hr. Thank you. 
*/ /* ===================================================================== */ -/* Subroutine */ int zgsvj0_(char *jobv, integer *m, integer *n, +/* Subroutine */ void zgsvj0_(char *jobv, integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *d__, doublereal *sva, integer *mv, doublecomplex *v, integer *ldv, doublereal *eps, doublereal *sfmin, doublereal *tol, integer *nsweep, doublecomplex * @@ -752,7 +752,7 @@ f"> */ doublereal bigtheta; doublecomplex ompq; integer pskipped; - extern /* Subroutine */ int zrot_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zrot_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *); doublereal aapp0, aapq1, temp1; integer i__, p, q; @@ -763,7 +763,7 @@ f"> */ extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); logical rotok; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, @@ -775,12 +775,12 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer ijblsk, swband, blskip; doublereal mxaapq; - extern /* Subroutine */ int zlascl_(char *, integer *, integer *, + extern /* Subroutine */ void zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *); doublereal thsign, mxsinj; integer ir1; - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); integer emptsw, notrot, iswrot, jbc; doublereal big; @@ -843,7 +843,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGSVJ0", &i__1, 
(ftnlen)6); - return 0; + return; } if (rsvec) { @@ -1663,6 +1663,6 @@ f"> */ /* L5991: */ } - return 0; + return; } /* zgsvj0_ */ diff --git a/lapack-netlib/SRC/zgsvj1.c b/lapack-netlib/SRC/zgsvj1.c index ec930a9627..fa1fc77c1d 100644 --- a/lapack-netlib/SRC/zgsvj1.c +++ b/lapack-netlib/SRC/zgsvj1.c @@ -751,7 +751,7 @@ f"> */ /* > Zlatko Drmac (Zagreb, Croatia) */ /* ===================================================================== */ -/* Subroutine */ int zgsvj1_(char *jobv, integer *m, integer *n, integer *n1, +/* Subroutine */ void zgsvj1_(char *jobv, integer *m, integer *n, integer *n1, doublecomplex *a, integer *lda, doublecomplex *d__, doublereal *sva, integer *mv, doublecomplex *v, integer *ldv, doublereal *eps, doublereal *sfmin, doublereal *tol, integer *nsweep, doublecomplex * @@ -772,7 +772,7 @@ f"> */ doublereal bigtheta; doublecomplex ompq; integer pskipped; - extern /* Subroutine */ int zrot_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zrot_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *); doublereal aapp0, aapq1, temp1; integer i__, p, q; @@ -783,7 +783,7 @@ f"> */ extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); logical rotok; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, @@ -795,11 +795,11 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer ijblsk, swband, blskip; doublereal mxaapq; - extern /* Subroutine */ int zlascl_(char *, integer *, integer *, + extern /* Subroutine */ void zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex 
*, integer *, integer *); doublereal thsign, mxsinj; - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); integer emptsw, notrot, iswrot, jbc; doublereal big; @@ -862,7 +862,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGSVJ1", &i__1, (ftnlen)6); - return 0; + return; } if (rsvec) { @@ -1347,6 +1347,6 @@ f"> */ } - return 0; + return; } /* zgsvj1_ */ diff --git a/lapack-netlib/SRC/zgtcon.c b/lapack-netlib/SRC/zgtcon.c index 5a81f69bac..bc5732d5ec 100644 --- a/lapack-netlib/SRC/zgtcon.c +++ b/lapack-netlib/SRC/zgtcon.c @@ -653,7 +653,7 @@ f"> */ /* > \ingroup complex16GTcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgtcon_(char *norm, integer *n, doublecomplex *dl, +/* Subroutine */ void zgtcon_(char *norm, integer *n, doublecomplex *dl, doublecomplex *d__, doublecomplex *du, doublecomplex *du2, integer * ipiv, doublereal *anorm, doublereal *rcond, doublecomplex *work, integer *info) @@ -665,12 +665,12 @@ f"> */ integer kase, kase1, i__; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, - doublecomplex *, doublereal *, integer *, integer *), xerbla_( - char *, integer *, ftnlen); + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, + doublecomplex *, doublereal *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; logical onenrm; - extern /* Subroutine */ int zgttrs_(char *, integer *, integer *, + extern /* Subroutine */ void zgttrs_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex * , integer *, doublecomplex *, integer *, integer *); @@ -707,7 +707,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGTCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ 
@@ -715,9 +715,9 @@ f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm == 0.) { - return 0; + return; } /* Check that D(1:N) is non-zero. */ @@ -726,7 +726,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__; if (d__[i__2].r == 0. && d__[i__2].i == 0.) { - return 0; + return; } /* L10: */ } @@ -763,7 +763,7 @@ f"> */ *rcond = 1. / ainvnm / *anorm; } - return 0; + return; /* End of ZGTCON */ diff --git a/lapack-netlib/SRC/zgtrfs.c b/lapack-netlib/SRC/zgtrfs.c index e47e86005e..a0d1374e65 100644 --- a/lapack-netlib/SRC/zgtrfs.c +++ b/lapack-netlib/SRC/zgtrfs.c @@ -724,7 +724,7 @@ f"> */ /* > \ingroup complex16GTcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgtrfs_(char *trans, integer *n, integer *nrhs, +/* Subroutine */ void zgtrfs_(char *trans, integer *n, integer *nrhs, doublecomplex *dl, doublecomplex *d__, doublecomplex *du, doublecomplex *dlf, doublecomplex *df, doublecomplex *duf, doublecomplex *du2, integer *ipiv, doublecomplex *b, integer *ldb, @@ -745,7 +745,7 @@ f"> */ doublereal s; extern logical lsame_(char *, char *); integer isave[3], count; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), zlacn2_( integer *, doublecomplex *, doublecomplex *, doublereal *, @@ -753,14 +753,15 @@ f"> */ extern doublereal dlamch_(char *); integer nz; doublereal safmin; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlagtm_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlagtm_( char *, integer *, integer *, doublereal *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *); logical notran; char transn[1], transt[1]; 
doublereal lstres; - extern /* Subroutine */ int zgttrs_(char *, integer *, integer *, + extern /* Subroutine */ void zgttrs_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex * , integer *, doublecomplex *, integer *, integer *); doublereal eps; @@ -815,7 +816,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGTRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -827,7 +828,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } if (notran) { @@ -1132,7 +1133,7 @@ f"> */ /* L110: */ } - return 0; + return; /* End of ZGTRFS */ diff --git a/lapack-netlib/SRC/zgtsv.c b/lapack-netlib/SRC/zgtsv.c index d99f679489..8a4da397a9 100644 --- a/lapack-netlib/SRC/zgtsv.c +++ b/lapack-netlib/SRC/zgtsv.c @@ -634,7 +634,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16GTsolve */ /* ===================================================================== */ -/* Subroutine */ int zgtsv_(integer *n, integer *nrhs, doublecomplex *dl, +/* Subroutine */ void zgtsv_(integer *n, integer *nrhs, doublecomplex *dl, doublecomplex *d__, doublecomplex *du, doublecomplex *b, integer *ldb, integer *info) { @@ -678,11 +678,11 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZGTSV ", &i__1, (ftnlen)6); - return 0; + return; } if (*n == 0) { - return 0; + return; } i__1 = *n - 1; @@ -699,7 +699,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* solution can not be found. */ *info = k; - return 0; + return; } } else /* if(complicated condition) */ { i__2 = k; @@ -787,7 +787,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ i__1 = *n; if (d__[i__1].r == 0. && d__[i__1].i == 0.) { *info = *n; - return 0; + return; } /* Back solve with the matrix U from the factorization. 
*/ @@ -828,7 +828,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* L50: */ } - return 0; + return; /* End of ZGTSV */ diff --git a/lapack-netlib/SRC/zgtsvx.c b/lapack-netlib/SRC/zgtsvx.c index 6978634e51..0d8dec2a19 100644 --- a/lapack-netlib/SRC/zgtsvx.c +++ b/lapack-netlib/SRC/zgtsvx.c @@ -805,7 +805,7 @@ f"> */ /* > \ingroup complex16GTsolve */ /* ===================================================================== */ -/* Subroutine */ int zgtsvx_(char *fact, char *trans, integer *n, integer * +/* Subroutine */ void zgtsvx_(char *fact, char *trans, integer *n, integer * nrhs, doublecomplex *dl, doublecomplex *d__, doublecomplex *du, doublecomplex *dlf, doublecomplex *df, doublecomplex *duf, doublecomplex *du2, integer *ipiv, doublecomplex *b, integer *ldb, @@ -820,7 +820,7 @@ f"> */ char norm[1]; extern logical lsame_(char *, char *); doublereal anorm; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern doublereal dlamch_(char *); logical nofact; @@ -828,7 +828,7 @@ f"> */ extern doublereal zlangt_(char *, integer *, doublecomplex *, doublecomplex *, doublecomplex *); logical notran; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zgtcon_(char *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, doublereal *, @@ -894,7 +894,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZGTSVX", &i__1, (ftnlen)6); - return 0; + return; } if (nofact) { @@ -914,7 +914,7 @@ f"> */ if (*info > 0) { *rcond = 0.; - return 0; + return; } } @@ -951,7 +951,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of ZGTSVX */ diff --git a/lapack-netlib/SRC/zgttrf.c b/lapack-netlib/SRC/zgttrf.c index 
323424af10..4dd8d5e98d 100644 --- a/lapack-netlib/SRC/zgttrf.c +++ b/lapack-netlib/SRC/zgttrf.c @@ -634,7 +634,7 @@ f"> */ /* > \ingroup complex16GTcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgttrf_(integer *n, doublecomplex *dl, doublecomplex * +/* Subroutine */ void zgttrf_(integer *n, doublecomplex *dl, doublecomplex * d__, doublecomplex *du, doublecomplex *du2, integer *ipiv, integer * info) { @@ -671,13 +671,13 @@ f"> */ *info = -1; i__1 = -(*info); xerbla_("ZGTTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Initialize IPIV(i) = i and DU2(i) = 0 */ @@ -813,7 +813,7 @@ f"> */ } L50: - return 0; + return; /* End of ZGTTRF */ diff --git a/lapack-netlib/SRC/zgttrs.c b/lapack-netlib/SRC/zgttrs.c index e550c7e1e2..99ae1cb14c 100644 --- a/lapack-netlib/SRC/zgttrs.c +++ b/lapack-netlib/SRC/zgttrs.c @@ -651,7 +651,7 @@ f"> */ /* > \ingroup complex16GTcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgttrs_(char *trans, integer *n, integer *nrhs, +/* Subroutine */ void zgttrs_(char *trans, integer *n, integer *nrhs, doublecomplex *dl, doublecomplex *d__, doublecomplex *du, doublecomplex *du2, integer *ipiv, doublecomplex *b, integer *ldb, integer *info) @@ -661,10 +661,10 @@ f"> */ /* Local variables */ integer j, jb, nb; - extern /* Subroutine */ int zgtts2_(integer *, integer *, integer *, + extern /* Subroutine */ void zgtts2_(integer *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex * - , integer *, doublecomplex *, integer *), xerbla_(char *, integer - *, ftnlen); + , integer *, doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer itrans; @@ -707,13 +707,13 @@ f"> */ if (*info 
!= 0) { i__1 = -(*info); xerbla_("ZGTTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* Decode TRANS */ @@ -756,6 +756,6 @@ f"> */ /* End of ZGTTRS */ - return 0; + return; } /* zgttrs_ */ diff --git a/lapack-netlib/SRC/zgtts2.c b/lapack-netlib/SRC/zgtts2.c index c1df66d969..235432563b 100644 --- a/lapack-netlib/SRC/zgtts2.c +++ b/lapack-netlib/SRC/zgtts2.c @@ -639,7 +639,7 @@ f"> */ /* > \ingroup complex16GTcomputational */ /* ===================================================================== */ -/* Subroutine */ int zgtts2_(integer *itrans, integer *n, integer *nrhs, +/* Subroutine */ void zgtts2_(integer *itrans, integer *n, integer *nrhs, doublecomplex *dl, doublecomplex *d__, doublecomplex *du, doublecomplex *du2, integer *ipiv, doublecomplex *b, integer *ldb) { @@ -675,7 +675,7 @@ f"> */ /* Function Body */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (*itrans == 0) { @@ -1143,6 +1143,6 @@ f"> */ /* End of ZGTTS2 */ - return 0; + return; } /* zgtts2_ */ diff --git a/lapack-netlib/SRC/zhb2st_kernels.c b/lapack-netlib/SRC/zhb2st_kernels.c index d465485d6d..a7734ef644 100644 --- a/lapack-netlib/SRC/zhb2st_kernels.c +++ b/lapack-netlib/SRC/zhb2st_kernels.c @@ -680,7 +680,7 @@ kernels.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zhb2st_kernels_(char *uplo, logical *wantz, integer * +/* Subroutine */ void zhb2st_kernels_(char *uplo, logical *wantz, integer * ttype, integer *st, integer *ed, integer *sweep, integer *n, integer * nb, integer *ib, doublecomplex *a, integer *lda, doublecomplex *v, doublecomplex *tau, integer *ldvt, doublecomplex *work) @@ -695,10 +695,10 @@ kernels.f"> */ extern logical lsame_(char *, char *); logical upper; integer j1, j2, lm, ln, ajeter; - extern /* Subroutine */ int zlarfg_(integer *, doublecomplex *, + extern /* Subroutine */ void zlarfg_(integer 
*, doublecomplex *, doublecomplex *, integer *, doublecomplex *); integer ofdpos; - extern /* Subroutine */ int zlarfx_(char *, integer *, integer *, + extern /* Subroutine */ void zlarfx_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, doublecomplex *), zlarfy_(char *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, @@ -918,7 +918,7 @@ kernels.f"> */ } } - return 0; + return; /* END OF ZHB2ST_KERNELS */ diff --git a/lapack-netlib/SRC/zhbev.c b/lapack-netlib/SRC/zhbev.c index ea576ff114..7d5b2ecf1c 100644 --- a/lapack-netlib/SRC/zhbev.c +++ b/lapack-netlib/SRC/zhbev.c @@ -666,7 +666,7 @@ atrices */ /* > \ingroup complex16OTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int zhbev_(char *jobz, char *uplo, integer *n, integer *kd, +/* Subroutine */ void zhbev_(char *jobz, char *uplo, integer *n, integer *kd, doublecomplex *ab, integer *ldab, doublereal *w, doublecomplex *z__, integer *ldz, doublecomplex *work, doublereal *rwork, integer *info) { @@ -679,7 +679,7 @@ atrices */ doublereal anrm; integer imax; doublereal rmin, rmax; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); @@ -692,7 +692,7 @@ atrices */ doublecomplex *, integer *, doublereal *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *), zhbtrd_(char *, char *, integer *, integer *, @@ -700,7 +700,7 @@ atrices */ doublecomplex *, integer *, doublecomplex *, integer *); integer indrwk; 
doublereal smlnum; - extern /* Subroutine */ int zsteqr_(char *, integer *, doublereal *, + extern /* Subroutine */ void zsteqr_(char *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublereal *, integer *); doublereal eps; @@ -749,13 +749,13 @@ atrices */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHBEV ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -770,7 +770,7 @@ atrices */ i__1 = z_dim1 + 1; z__[i__1].r = 1., z__[i__1].i = 0.; } - return 0; + return; } /* Get machine constants. */ @@ -831,7 +831,7 @@ atrices */ dscal_(&imax, &d__1, &w[1], &c__1); } - return 0; + return; /* End of ZHBEV */ diff --git a/lapack-netlib/SRC/zhbev_2stage.c b/lapack-netlib/SRC/zhbev_2stage.c index 9f25278605..986b0382e4 100644 --- a/lapack-netlib/SRC/zhbev_2stage.c +++ b/lapack-netlib/SRC/zhbev_2stage.c @@ -729,7 +729,7 @@ stage.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zhbev_2stage_(char *jobz, char *uplo, integer *n, +/* Subroutine */ void zhbev_2stage_(char *jobz, char *uplo, integer *n, integer *kd, doublecomplex *ab, integer *ldab, doublereal *w, doublecomplex *z__, integer *ldz, doublecomplex *work, integer *lwork, doublereal *rwork, integer *info) @@ -744,12 +744,12 @@ stage.f"> */ integer *, integer *, integer *); doublereal anrm; integer imax; - extern /* Subroutine */ int zhetrd_hb2st_(char *, char *, char *, + extern /* Subroutine */ void zhetrd_hb2st_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); doublereal rmin, rmax; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); @@ -765,14 +765,14 @@ 
stage.f"> */ doublecomplex *, integer *, doublereal *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *); integer indwrk, indrwk, llwork; doublereal smlnum; logical lquery; - extern /* Subroutine */ int zsteqr_(char *, integer *, doublereal *, + extern /* Subroutine */ void zsteqr_(char *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublereal *, integer *); doublereal eps; integer indhous; @@ -844,15 +844,15 @@ stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHBEV_2STAGE ", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -867,7 +867,7 @@ stage.f"> */ i__1 = z_dim1 + 1; z__[i__1].r = 1., z__[i__1].i = 0.; } - return 0; + return; } /* Get machine constants. 
*/ @@ -937,7 +937,7 @@ stage.f"> */ work[1].r = (doublereal) lwmin, work[1].i = 0.; - return 0; + return; /* End of ZHBEV_2STAGE */ diff --git a/lapack-netlib/SRC/zhbevd.c b/lapack-netlib/SRC/zhbevd.c index 93a2ea78f6..00fd613f7a 100644 --- a/lapack-netlib/SRC/zhbevd.c +++ b/lapack-netlib/SRC/zhbevd.c @@ -731,7 +731,7 @@ f"> */ /* > \ingroup complex16OTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int zhbevd_(char *jobz, char *uplo, integer *n, integer *kd, +/* Subroutine */ void zhbevd_(char *jobz, char *uplo, integer *n, integer *kd, doublecomplex *ab, integer *ldab, doublereal *w, doublecomplex *z__, integer *ldz, doublecomplex *work, integer *lwork, doublereal *rwork, integer *lrwork, integer *iwork, integer *liwork, integer *info) @@ -746,12 +746,12 @@ f"> */ integer imax; doublereal rmin, rmax; integer llwk2; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); @@ -767,7 +767,7 @@ f"> */ doublecomplex *, integer *, doublereal *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *), zstedc_(char *, integer *, doublereal *, @@ -777,7 +777,7 @@ f"> */ doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, 
integer *, doublecomplex *, integer *); integer indwrk, liwmin; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer lrwmin; doublereal smlnum; @@ -864,15 +864,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHBEVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -882,7 +882,7 @@ f"> */ i__1 = z_dim1 + 1; z__[i__1].r = 1., z__[i__1].i = 0.; } - return 0; + return; } /* Get machine constants. */ @@ -952,7 +952,7 @@ f"> */ work[1].r = (doublereal) lwmin, work[1].i = 0.; rwork[1] = (doublereal) lrwmin; iwork[1] = liwmin; - return 0; + return; /* End of ZHBEVD */ diff --git a/lapack-netlib/SRC/zhbevd_2stage.c b/lapack-netlib/SRC/zhbevd_2stage.c index 5f0deecd7b..5a866844be 100644 --- a/lapack-netlib/SRC/zhbevd_2stage.c +++ b/lapack-netlib/SRC/zhbevd_2stage.c @@ -779,7 +779,7 @@ static integer c__1 = 1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zhbevd_2stage_(char *jobz, char *uplo, integer *n, +/* Subroutine */ void zhbevd_2stage_(char *jobz, char *uplo, integer *n, integer *kd, doublecomplex *ab, integer *ldab, doublereal *w, doublecomplex *z__, integer *ldz, doublecomplex *work, integer *lwork, doublereal *rwork, integer *lrwork, integer *iwork, integer *liwork, @@ -795,18 +795,18 @@ static integer c__1 = 1; integer *, integer *, integer *); doublereal anrm; integer imax; - extern /* Subroutine */ int zhetrd_hb2st_(char *, char *, char *, + extern /* Subroutine */ void zhetrd_hb2st_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); doublereal rmin, rmax; integer llwk2; - extern /* Subroutine */ 
int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); integer iinfo, indwk, lhtrd; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); @@ -822,7 +822,7 @@ static integer c__1 = 1; doublecomplex *, integer *, doublereal *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *), zstedc_(char *, integer *, doublereal *, @@ -830,7 +830,7 @@ static integer c__1 = 1; integer *, doublereal *, integer *, integer *, integer *, integer *); integer indrwk, liwmin; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer lrwmin, llwork; doublereal smlnum; @@ -924,15 +924,15 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("ZHBEVD_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -942,7 +942,7 @@ static integer c__1 = 1; i__1 = z_dim1 + 1; z__[i__1].r = 1., z__[i__1].i = 0.; } - return 0; + return; } /* Get machine constants. 
*/ @@ -1017,7 +1017,7 @@ static integer c__1 = 1; work[1].r = (doublereal) lwmin, work[1].i = 0.; rwork[1] = (doublereal) lrwmin; iwork[1] = liwmin; - return 0; + return; /* End of ZHBEVD_2STAGE */ diff --git a/lapack-netlib/SRC/zhbevx.c b/lapack-netlib/SRC/zhbevx.c index 2de6c2290c..6203b310dc 100644 --- a/lapack-netlib/SRC/zhbevx.c +++ b/lapack-netlib/SRC/zhbevx.c @@ -782,7 +782,7 @@ f"> */ /* > \ingroup complex16OTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int zhbevx_(char *jobz, char *range, char *uplo, integer *n, +/* Subroutine */ void zhbevx_(char *jobz, char *range, char *uplo, integer *n, integer *kd, doublecomplex *ab, integer *ldab, doublecomplex *q, integer *ldq, doublereal *vl, doublereal *vu, integer *il, integer * iu, doublereal *abstol, integer *m, doublereal *w, doublecomplex *z__, @@ -802,20 +802,20 @@ f"> */ logical test; doublecomplex ctmp1; integer itmp1, i__, j, indee; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); integer iinfo; char order[1]; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); logical lower; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical wantz; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer jj; @@ -829,7 +829,7 @@ f"> */ extern /* Subroutine */ int 
xerbla_(char *, integer *, ftnlen); doublereal abstll, bignum; integer indiwk, indisp; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *), dstebz_(char *, char *, integer *, doublereal @@ -840,11 +840,11 @@ f"> */ integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublecomplex *, integer *); integer indrwk, indwrk; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer nsplit; doublereal smlnum; - extern /* Subroutine */ int zstein_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void zstein_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, doublecomplex *, integer *, doublereal *, integer *, integer *, integer *), zsteqr_(char *, integer *, doublereal *, doublereal *, @@ -923,14 +923,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHBEVX", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -955,7 +955,7 @@ f"> */ z__[i__1].r = 1., z__[i__1].i = 0.; } } - return 0; + return; } /* Get machine constants. 
*/ @@ -1131,7 +1131,7 @@ f"> */ } } - return 0; + return; /* End of ZHBEVX */ diff --git a/lapack-netlib/SRC/zhbevx_2stage.c b/lapack-netlib/SRC/zhbevx_2stage.c index 6e16b15ebd..64e97ebd3a 100644 --- a/lapack-netlib/SRC/zhbevx_2stage.c +++ b/lapack-netlib/SRC/zhbevx_2stage.c @@ -845,7 +845,7 @@ static integer c__1 = 1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zhbevx_2stage_(char *jobz, char *range, char *uplo, +/* Subroutine */ void zhbevx_2stage_(char *jobz, char *range, char *uplo, integer *n, integer *kd, doublecomplex *ab, integer *ldab, doublecomplex *q, integer *ldq, doublereal *vl, doublereal *vu, integer *il, integer *iu, doublereal *abstol, integer *m, doublereal * @@ -864,7 +864,7 @@ static integer c__1 = 1; integer *, integer *, integer *); doublereal anrm; integer imax; - extern /* Subroutine */ int zhetrd_hb2st_(char *, char *, char *, + extern /* Subroutine */ void zhetrd_hb2st_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -872,23 +872,23 @@ static integer c__1 = 1; logical test; doublecomplex ctmp1; integer itmp1, i__, j, indee; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); integer iinfo; char order[1]; integer lhtrd; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer lwmin; logical lower; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer 
lwtrd; logical wantz; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer ib, jj; @@ -902,7 +902,7 @@ static integer c__1 = 1; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal abstll, bignum; integer indiwk, indisp; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *), dstebz_(char *, char *, integer *, doublereal @@ -910,15 +910,15 @@ static integer c__1 = 1; doublereal *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); integer indrwk, indwrk; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer nsplit, llwork; doublereal smlnum; - extern /* Subroutine */ int zstein_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void zstein_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, doublecomplex *, integer *, doublereal *, integer *, integer *, integer *); logical lquery; - extern /* Subroutine */ int zsteqr_(char *, integer *, doublereal *, + extern /* Subroutine */ void zsteqr_(char *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublereal *, integer *); doublereal eps, vll, vuu; integer indhous; @@ -1018,16 +1018,16 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("ZHBEVX_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ *m = 0; if 
(*n == 0) { - return 0; + return; } if (*n == 1) { @@ -1052,7 +1052,7 @@ static integer c__1 = 1; z__[i__1].r = 1., z__[i__1].i = 0.; } } - return 0; + return; } /* Get machine constants. */ @@ -1237,7 +1237,7 @@ static integer c__1 = 1; work[1].r = (doublereal) lwmin, work[1].i = 0.; - return 0; + return; /* End of ZHBEVX_2STAGE */ diff --git a/lapack-netlib/SRC/zhbgst.c b/lapack-netlib/SRC/zhbgst.c index 58ff95ab0d..e37dd82b40 100644 --- a/lapack-netlib/SRC/zhbgst.c +++ b/lapack-netlib/SRC/zhbgst.c @@ -679,7 +679,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zhbgst_(char *vect, char *uplo, integer *n, integer *ka, +/* Subroutine */ void zhbgst_(char *vect, char *uplo, integer *n, integer *ka, integer *kb, doublecomplex *ab, integer *ldab, doublecomplex *bb, integer *ldbb, doublecomplex *x, integer *ldx, doublecomplex *work, doublereal *rwork, integer *info) @@ -692,40 +692,41 @@ f"> */ /* Local variables */ integer inca; - extern /* Subroutine */ int zrot_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zrot_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *); integer i__, j, k, l, m; doublecomplex t; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgerc_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer i0, i1; logical upper; integer i2, j1, j2; - extern /* Subroutine */ int zgeru_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeru_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); logical wantx; - extern /* Subroutine */ int zlar2v_(integer *, doublecomplex *, + extern /* Subroutine */ void zlar2v_(integer *, 
doublecomplex *, doublecomplex *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *); doublecomplex ra; integer nr, nx; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); logical update; - extern /* Subroutine */ int zlacgv_(integer *, doublecomplex *, integer *) + extern /* Subroutine */ void zlacgv_(integer *, doublecomplex *, integer *) ; integer ka1, kb1; - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *), zlartg_(doublecomplex *, doublecomplex *, doublereal *, doublecomplex *, doublecomplex *); doublecomplex ra1; - extern /* Subroutine */ int zlargv_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlargv_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, integer *); integer j1t, j2t; - extern /* Subroutine */ int zlartv_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlartv_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *); doublereal bii; @@ -782,13 +783,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHBGST", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } inca = *ldab * ka1; @@ -1818,14 +1819,14 @@ f"> */ --i__; i0 = m + 1; if (*ka == 0) { - return 0; + return; } goto L490; } } else { i__ -= *ka; if (i__ < 2) { - return 0; + return; } } diff --git a/lapack-netlib/SRC/zhbgv.c b/lapack-netlib/SRC/zhbgv.c index b3d0a2ae46..459f11cab6 100644 --- a/lapack-netlib/SRC/zhbgv.c +++ b/lapack-netlib/SRC/zhbgv.c @@ -691,7 +691,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup 
complex16OTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int zhbgv_(char *jobz, char *uplo, integer *n, integer *ka, +/* Subroutine */ void zhbgv_(char *jobz, char *uplo, integer *n, integer *ka, integer *kb, doublecomplex *ab, integer *ldab, doublecomplex *bb, integer *ldbb, doublereal *w, doublecomplex *z__, integer *ldz, doublecomplex *work, doublereal *rwork, integer *info) @@ -705,13 +705,14 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern logical lsame_(char *, char *); integer iinfo; logical upper, wantz; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dsterf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dsterf_( integer *, doublereal *, doublereal *, integer *), zhbtrd_(char *, char *, integer *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublecomplex *, integer *); integer indwrk; - extern /* Subroutine */ int zhbgst_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zhbgst_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublereal *, integer *), zpbstf_(char *, integer *, integer *, @@ -770,13 +771,13 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZHBGV ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a split Cholesky factorization of B. */ @@ -784,7 +785,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ zpbstf_(uplo, n, kb, &bb[bb_offset], ldbb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem. 
*/ @@ -812,7 +813,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ zsteqr_(jobz, n, &w[1], &rwork[inde], &z__[z_offset], ldz, &rwork[ indwrk], info); } - return 0; + return; /* End of ZHBGV */ diff --git a/lapack-netlib/SRC/zhbgvd.c b/lapack-netlib/SRC/zhbgvd.c index 685c5ef0eb..e7dd111f5e 100644 --- a/lapack-netlib/SRC/zhbgvd.c +++ b/lapack-netlib/SRC/zhbgvd.c @@ -764,7 +764,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int zhbgvd_(char *jobz, char *uplo, integer *n, integer *ka, +/* Subroutine */ void zhbgvd_(char *jobz, char *uplo, integer *n, integer *ka, integer *kb, doublecomplex *ab, integer *ldab, doublecomplex *bb, integer *ldbb, doublereal *w, doublecomplex *z__, integer *ldz, doublecomplex *work, integer *lwork, doublereal *rwork, integer * @@ -779,7 +779,7 @@ f"> */ integer llwk2; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); @@ -788,7 +788,8 @@ f"> */ integer llrwk; logical wantz; integer indwk2; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dsterf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dsterf_( integer *, doublereal *, doublereal *, integer *), zstedc_(char *, integer *, doublereal *, doublereal *, doublecomplex *, integer * , doublecomplex *, integer *, doublereal *, integer *, integer *, @@ -796,13 +797,13 @@ f"> */ integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublecomplex *, integer *); integer indwrk, liwmin; - extern /* Subroutine */ int zhbgst_(char *, char *, integer *, integer *, + 
extern /* Subroutine */ void zhbgst_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublereal *, integer *), zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer lrwmin; - extern /* Subroutine */ int zpbstf_(char *, integer *, integer *, + extern /* Subroutine */ void zpbstf_(char *, integer *, integer *, doublecomplex *, integer *, integer *); logical lquery; @@ -891,15 +892,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHBGVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a split Cholesky factorization of B. */ @@ -907,7 +908,7 @@ f"> */ zpbstf_(uplo, n, kb, &bb[bb_offset], ldbb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem. */ @@ -945,7 +946,7 @@ f"> */ work[1].r = (doublereal) lwmin, work[1].i = 0.; rwork[1] = (doublereal) lrwmin; iwork[1] = liwmin; - return 0; + return; /* End of ZHBGVD */ diff --git a/lapack-netlib/SRC/zhbgvx.c b/lapack-netlib/SRC/zhbgvx.c index 0e2f75f943..75d981600b 100644 --- a/lapack-netlib/SRC/zhbgvx.c +++ b/lapack-netlib/SRC/zhbgvx.c @@ -813,7 +813,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. 
of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int zhbgvx_(char *jobz, char *range, char *uplo, integer *n, +/* Subroutine */ void zhbgvx_(char *jobz, char *range, char *uplo, integer *n, integer *ka, integer *kb, doublecomplex *ab, integer *ldab, doublecomplex *bb, integer *ldbb, doublecomplex *q, integer *ldq, doublereal *vl, doublereal *vu, integer *il, integer *iu, doublereal * @@ -833,12 +833,12 @@ f"> */ extern logical lsame_(char *, char *); integer iinfo; char order[1]; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical upper, wantz; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer jj; @@ -847,7 +847,7 @@ f"> */ logical valeig; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer indiwk, indisp; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *), dstebz_(char *, char *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, @@ -856,13 +856,13 @@ f"> */ integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublecomplex *, integer *); integer indrwk, indwrk; - extern /* Subroutine */ int zhbgst_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zhbgst_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, 
integer *, doublecomplex *, integer *, doublecomplex *, doublereal *, integer *), zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer nsplit; - extern /* Subroutine */ int zpbstf_(char *, integer *, integer *, + extern /* Subroutine */ void zpbstf_(char *, integer *, integer *, doublecomplex *, integer *, integer *), zstein_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, doublecomplex *, integer *, doublereal *, integer *, @@ -949,14 +949,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHBGVX", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } /* Form a split Cholesky factorization of B. */ @@ -964,7 +964,7 @@ f"> */ zpbstf_(uplo, n, kb, &bb[bb_offset], ldbb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem. */ @@ -1092,7 +1092,7 @@ f"> */ } } - return 0; + return; /* End of ZHBGVX */ diff --git a/lapack-netlib/SRC/zhbgvx.f b/lapack-netlib/SRC/zhbgvx.f index 79a3811b21..3832ed4141 100644 --- a/lapack-netlib/SRC/zhbgvx.f +++ b/lapack-netlib/SRC/zhbgvx.f @@ -327,7 +327,7 @@ SUBROUTINE ZHBGVX( JOBZ, RANGE, UPLO, N, KA, KB, AB, LDAB, BB, * .. Local Scalars .. LOGICAL ALLEIG, INDEIG, TEST, UPPER, VALEIG, WANTZ CHARACTER ORDER, VECT - INTEGER I, IINFO, INDD, INDE, INDEE, INDIBL, INDISP, + INTEGER I, IINFO, INDD, INDE, INDEE, INDISP, $ INDIWK, INDRWK, INDWRK, ITMP1, J, JJ, NSPLIT DOUBLE PRECISION TMP1 * .. 
@@ -470,17 +470,16 @@ SUBROUTINE ZHBGVX( JOBZ, RANGE, UPLO, N, KA, KB, AB, LDAB, BB, ELSE ORDER = 'E' END IF - INDIBL = 1 - INDISP = INDIBL + N + INDISP = 1 + N INDIWK = INDISP + N CALL DSTEBZ( RANGE, ORDER, N, VL, VU, IL, IU, ABSTOL, $ RWORK( INDD ), RWORK( INDE ), M, NSPLIT, W, - $ IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ), + $ IWORK( 1 ), IWORK( INDISP ), RWORK( INDRWK ), $ IWORK( INDIWK ), INFO ) * IF( WANTZ ) THEN CALL ZSTEIN( N, RWORK( INDD ), RWORK( INDE ), M, W, - $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ, + $ IWORK( 1 ), IWORK( INDISP ), Z, LDZ, $ RWORK( INDRWK ), IWORK( INDIWK ), IFAIL, INFO ) * * Apply unitary matrix used in reduction to tridiagonal @@ -510,11 +509,11 @@ SUBROUTINE ZHBGVX( JOBZ, RANGE, UPLO, N, KA, KB, AB, LDAB, BB, 40 CONTINUE * IF( I.NE.0 ) THEN - ITMP1 = IWORK( INDIBL+I-1 ) + ITMP1 = IWORK( 1 + I-1 ) W( I ) = W( J ) - IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 ) + IWORK( 1 + I-1 ) = IWORK( 1 + J-1 ) W( J ) = TMP1 - IWORK( INDIBL+J-1 ) = ITMP1 + IWORK( 1 + J-1 ) = ITMP1 CALL ZSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) IF( INFO.NE.0 ) THEN ITMP1 = IFAIL( I ) diff --git a/lapack-netlib/SRC/zhbtrd.c b/lapack-netlib/SRC/zhbtrd.c index 5e0b17ee20..641320b872 100644 --- a/lapack-netlib/SRC/zhbtrd.c +++ b/lapack-netlib/SRC/zhbtrd.c @@ -677,7 +677,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zhbtrd_(char *vect, char *uplo, integer *n, integer *kd, +/* Subroutine */ void zhbtrd_(char *vect, char *uplo, integer *n, integer *kd, doublecomplex *ab, integer *ldab, doublereal *d__, doublereal *e, doublecomplex *q, integer *ldq, doublecomplex *work, integer *info) { @@ -692,24 +692,25 @@ f"> */ doublereal abst; integer incx, last; doublecomplex temp; - extern /* Subroutine */ int zrot_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zrot_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *); 
integer j1end, j1inc, i__, j, k, l; doublecomplex t; integer iqend; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); logical initq, wantq, upper; integer i2, j1, j2; - extern /* Subroutine */ int zlar2v_(integer *, doublecomplex *, + extern /* Subroutine */ void zlar2v_(integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *); integer nq, nr, iqaend; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlacgv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlacgv_( integer *, doublecomplex *, integer *); integer kd1; - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *), zlartg_(doublecomplex *, doublecomplex *, doublereal *, doublecomplex *, doublecomplex *), zlargv_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, @@ -767,13 +768,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHBTRD", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Initialize Q to the unit matrix, if needed */ @@ -1371,7 +1372,7 @@ f"> */ } } - return 0; + return; /* End of ZHBTRD */ diff --git a/lapack-netlib/SRC/zhecon.c b/lapack-netlib/SRC/zhecon.c index 9989d91ac5..e1ec4b856f 100644 --- a/lapack-netlib/SRC/zhecon.c +++ b/lapack-netlib/SRC/zhecon.c @@ -637,7 +637,7 @@ f"> */ /* > \ingroup complex16HEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zhecon_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zhecon_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, doublereal *anorm, doublereal *rcond, 
doublecomplex *work, integer *info) { @@ -649,11 +649,11 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, - doublecomplex *, doublereal *, integer *, integer *), xerbla_( - char *, integer *, ftnlen); + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, + doublecomplex *, doublereal *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; - extern /* Subroutine */ int zhetrs_(char *, integer *, integer *, + extern /* Subroutine */ void zhetrs_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); @@ -691,7 +691,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHECON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -699,9 +699,9 @@ f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm <= 0.) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -713,7 +713,7 @@ f"> */ for (i__ = *n; i__ >= 1; --i__) { i__1 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__1].r == 0. && a[i__1].i == 0.)) { - return 0; + return; } /* L10: */ } @@ -725,7 +725,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__2].r == 0. && a[i__2].i == 0.)) { - return 0; + return; } /* L20: */ } @@ -751,7 +751,7 @@ f"> */ *rcond = 1. 
/ ainvnm / *anorm; } - return 0; + return; /* End of ZHECON */ diff --git a/lapack-netlib/SRC/zhecon_3.c b/lapack-netlib/SRC/zhecon_3.c index 2e18f8a604..97713d102e 100644 --- a/lapack-netlib/SRC/zhecon_3.c +++ b/lapack-netlib/SRC/zhecon_3.c @@ -678,7 +678,7 @@ static integer c__1 = 1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zhecon_3_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zhecon_3_(char *uplo, integer *n, doublecomplex *a, integer *lda, doublecomplex *e, integer *ipiv, doublereal *anorm, doublereal *rcond, doublecomplex *work, integer *info) { @@ -686,16 +686,16 @@ static integer c__1 = 1; integer a_dim1, a_offset, i__1, i__2; /* Local variables */ - extern /* Subroutine */ int zhetrs_3_(char *, integer *, integer *, + extern /* Subroutine */ void zhetrs_3_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); integer kase, i__; extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, - doublecomplex *, doublereal *, integer *, integer *), xerbla_( - char *, integer *, ftnlen); + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, + doublecomplex *, doublereal *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; @@ -733,7 +733,7 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("ZHECON_3", &i__1, (ftnlen)8); - return 0; + return; } /* Quick return if possible */ @@ -741,9 +741,9 @@ static integer c__1 = 1; *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm <= 0.) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -755,7 +755,7 @@ static integer c__1 = 1; for (i__ = *n; i__ >= 1; --i__) { i__1 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__1].r == 0. 
&& a[i__1].i == 0.)) { - return 0; + return; } } } else { @@ -766,7 +766,7 @@ static integer c__1 = 1; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__2].r == 0. && a[i__2].i == 0.)) { - return 0; + return; } } } @@ -791,7 +791,7 @@ static integer c__1 = 1; *rcond = 1. / ainvnm / *anorm; } - return 0; + return; /* End of ZHECON_3 */ diff --git a/lapack-netlib/SRC/zhecon_rook.c b/lapack-netlib/SRC/zhecon_rook.c index debdac09be..82665a210b 100644 --- a/lapack-netlib/SRC/zhecon_rook.c +++ b/lapack-netlib/SRC/zhecon_rook.c @@ -652,7 +652,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zhecon_rook_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zhecon_rook_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, doublereal *anorm, doublereal *rcond, doublecomplex *work, integer *info) { @@ -660,16 +660,16 @@ rook.f"> */ integer a_dim1, a_offset, i__1, i__2; /* Local variables */ - extern /* Subroutine */ int zhetrs_rook_(char *, integer *, integer *, + extern /* Subroutine */ void zhetrs_rook_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); integer kase, i__; extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, - doublecomplex *, doublereal *, integer *, integer *), xerbla_( - char *, integer *, ftnlen); + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, + doublecomplex *, doublereal *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; @@ -706,7 +706,7 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHECON_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Quick return if possible */ @@ -714,9 +714,9 @@ rook.f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm 
<= 0.) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -728,7 +728,7 @@ rook.f"> */ for (i__ = *n; i__ >= 1; --i__) { i__1 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__1].r == 0. && a[i__1].i == 0.)) { - return 0; + return; } /* L10: */ } @@ -740,7 +740,7 @@ rook.f"> */ for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__2].r == 0. && a[i__2].i == 0.)) { - return 0; + return; } /* L20: */ } @@ -766,7 +766,7 @@ rook.f"> */ *rcond = 1. / ainvnm / *anorm; } - return 0; + return; /* End of ZHECON_ROOK */ diff --git a/lapack-netlib/SRC/zheequb.c b/lapack-netlib/SRC/zheequb.c index 7f6659ad5e..918272b8bd 100644 --- a/lapack-netlib/SRC/zheequb.c +++ b/lapack-netlib/SRC/zheequb.c @@ -645,7 +645,7 @@ static integer c__1 = 1; /* > Tech report version: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.3.1679 */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zheequb_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zheequb_(char *uplo, integer *n, doublecomplex *a, integer *lda, doublereal *s, doublereal *scond, doublereal *amax, doublecomplex *work, integer *info) { @@ -667,7 +667,7 @@ static integer c__1 = 1; logical up; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum, smlnum; - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); doublereal avg, std, tol; @@ -702,7 +702,7 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("ZHEEQUB", &i__1, (ftnlen)7); - return 0; + return; } up = lsame_(uplo, "U"); *amax = 0.; @@ -711,7 +711,7 @@ static integer c__1 = 1; if (*n == 0) { *scond = 1.; - return 0; + return; } i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { @@ -906,7 +906,7 @@ static integer c__1 = 1; d__ = c1 * c1 - c0 * 4 * c2; if (d__ <= 0.) 
{ *info = -1; - return 0; + return; } si = c0 * -2 / (c1 + sqrt(d__)); d__ = si - s[i__]; @@ -993,6 +993,6 @@ static integer c__1 = 1; } *scond = f2cmax(smin,smlnum) / f2cmin(smax,bignum); - return 0; + return; } /* zheequb_ */ diff --git a/lapack-netlib/SRC/zheev.c b/lapack-netlib/SRC/zheev.c index 2b979e6849..fed8677b70 100644 --- a/lapack-netlib/SRC/zheev.c +++ b/lapack-netlib/SRC/zheev.c @@ -656,7 +656,7 @@ ices */ /* > \ingroup complex16HEeigen */ /* ===================================================================== */ -/* Subroutine */ int zheev_(char *jobz, char *uplo, integer *n, doublecomplex +/* Subroutine */ void zheev_(char *jobz, char *uplo, integer *n, doublecomplex *a, integer *lda, doublereal *w, doublecomplex *work, integer *lwork, doublereal *rwork, integer *info) { @@ -669,7 +669,7 @@ ices */ doublereal anrm; integer imax; doublereal rmin, rmax; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); @@ -686,19 +686,19 @@ ices */ extern doublereal zlanhe_(char *, char *, integer *, doublecomplex *, integer *, doublereal *); integer indtau; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *); integer indwrk; - extern /* Subroutine */ int zhetrd_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhetrd_(char *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, doublecomplex *, integer *, integer *); integer llwork; doublereal smlnum; integer lwkopt; logical lquery; - extern /* Subroutine */ int zsteqr_(char *, integer *, doublereal *, + extern /* Subroutine */ void zsteqr_(char *, integer *, 
doublereal *, doublereal *, doublecomplex *, integer *, doublereal *, integer *), zungtr_(char *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); doublereal eps; @@ -757,15 +757,15 @@ ices */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHEEV ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -776,7 +776,7 @@ ices */ i__1 = a_dim1 + 1; a[i__1].r = 1., a[i__1].i = 0.; } - return 0; + return; } /* Get machine constants. */ @@ -842,7 +842,7 @@ ices */ work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZHEEV */ diff --git a/lapack-netlib/SRC/zheev_2stage.c b/lapack-netlib/SRC/zheev_2stage.c index e9719f9764..6fd0b2b587 100644 --- a/lapack-netlib/SRC/zheev_2stage.c +++ b/lapack-netlib/SRC/zheev_2stage.c @@ -708,7 +708,7 @@ stage.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zheev_2stage_(char *jobz, char *uplo, integer *n, +/* Subroutine */ void zheev_2stage_(char *jobz, char *uplo, integer *n, doublecomplex *a, integer *lda, doublereal *w, doublecomplex *work, integer *lwork, doublereal *rwork, integer *info) { @@ -723,11 +723,11 @@ stage.f"> */ doublereal anrm; integer imax; doublereal rmin, rmax; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zhetrd_2stage_(char *, char *, integer *, + extern /* Subroutine */ void zhetrd_2stage_(char *, char *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -744,14 +744,14 @@ stage.f"> */ extern doublereal zlanhe_(char *, char *, integer *, 
doublecomplex *, integer *, doublereal *); integer indtau; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *); integer indwrk, llwork; doublereal smlnum; logical lquery; - extern /* Subroutine */ int zsteqr_(char *, integer *, doublereal *, + extern /* Subroutine */ void zsteqr_(char *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublereal *, integer *), zungtr_(char *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); doublereal eps; @@ -813,15 +813,15 @@ stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHEEV_2STAGE ", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -832,7 +832,7 @@ stage.f"> */ i__1 = a_dim1 + 1; a[i__1].r = 1., a[i__1].i = 0.; } - return 0; + return; } /* Get machine constants. 
*/ @@ -901,7 +901,7 @@ stage.f"> */ work[1].r = (doublereal) lwmin, work[1].i = 0.; - return 0; + return; /* End of ZHEEV_2STAGE */ diff --git a/lapack-netlib/SRC/zheevd.c b/lapack-netlib/SRC/zheevd.c index 7150e7838d..f4e9ef33cf 100644 --- a/lapack-netlib/SRC/zheevd.c +++ b/lapack-netlib/SRC/zheevd.c @@ -721,7 +721,7 @@ f"> */ /* > at Berkeley, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zheevd_(char *jobz, char *uplo, integer *n, +/* Subroutine */ void zheevd_(char *jobz, char *uplo, integer *n, doublecomplex *a, integer *lda, doublereal *w, doublecomplex *work, integer *lwork, doublereal *rwork, integer *lrwork, integer *iwork, integer *liwork, integer *info) @@ -736,7 +736,7 @@ f"> */ integer imax; doublereal rmin, rmax; integer lopt; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); @@ -755,7 +755,7 @@ f"> */ extern doublereal zlanhe_(char *, char *, integer *, doublecomplex *, integer *, doublereal *); integer indtau; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *), zstedc_(char *, integer *, doublereal *, @@ -763,7 +763,7 @@ f"> */ integer *, doublereal *, integer *, integer *, integer *, integer *); integer indrwk, indwrk, liwmin; - extern /* Subroutine */ int zhetrd_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhetrd_(char *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, doublecomplex *, integer *, integer *), zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, @@ -771,7 +771,7 @@ f"> */ 
integer lrwmin, llwork; doublereal smlnum; logical lquery; - extern /* Subroutine */ int zunmtr_(char *, char *, char *, integer *, + extern /* Subroutine */ void zunmtr_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); doublereal eps; @@ -856,15 +856,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHEEVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -874,7 +874,7 @@ f"> */ i__1 = a_dim1 + 1; a[i__1].r = 1., a[i__1].i = 0.; } - return 0; + return; } /* Get machine constants. */ @@ -947,7 +947,7 @@ f"> */ rwork[1] = (doublereal) lropt; iwork[1] = liopt; - return 0; + return; /* End of ZHEEVD */ diff --git a/lapack-netlib/SRC/zheevd.f b/lapack-netlib/SRC/zheevd.f index a6484eb032..7f58c7f726 100644 --- a/lapack-netlib/SRC/zheevd.f +++ b/lapack-netlib/SRC/zheevd.f @@ -284,7 +284,7 @@ SUBROUTINE ZHEEVD( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, RWORK, LIWMIN = 1 END IF LOPT = MAX( LWMIN, N + - $ ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 ) ) + $ N*ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 ) ) LROPT = LRWMIN LIOPT = LIWMIN END IF diff --git a/lapack-netlib/SRC/zheevd_2stage.c b/lapack-netlib/SRC/zheevd_2stage.c index 6943c1d68a..a3abcfacad 100644 --- a/lapack-netlib/SRC/zheevd_2stage.c +++ b/lapack-netlib/SRC/zheevd_2stage.c @@ -772,7 +772,7 @@ static doublereal c_b28 = 1.; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zheevd_2stage_(char *jobz, char *uplo, integer *n, +/* Subroutine */ void zheevd_2stage_(char *jobz, char *uplo, integer *n, doublecomplex *a, integer *lda, doublereal *w, doublecomplex *work, integer *lwork, doublereal *rwork, integer *lrwork, integer *iwork, integer *liwork, integer *info) @@ -788,11 +788,11 @@ static doublereal c_b28 = 1.; 
doublereal anrm; integer imax; doublereal rmin, rmax; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zhetrd_2stage_(char *, char *, integer *, + extern /* Subroutine */ void zhetrd_2stage_(char *, char *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -809,7 +809,7 @@ static doublereal c_b28 = 1.; extern doublereal zlanhe_(char *, char *, integer *, doublecomplex *, integer *, doublereal *); integer indtau; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex *, integer *, integer *), zstedc_(char *, integer *, doublereal *, @@ -817,12 +817,12 @@ static doublereal c_b28 = 1.; integer *, doublereal *, integer *, integer *, integer *, integer *); integer indrwk, indwrk, liwmin; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer lrwmin, llwork; doublereal smlnum; logical lquery; - extern /* Subroutine */ int zunmtr_(char *, char *, char *, integer *, + extern /* Subroutine */ void zunmtr_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); doublereal eps; @@ -908,15 +908,15 @@ static doublereal c_b28 = 1.; if (*info != 0) { i__1 = -(*info); xerbla_("ZHEEVD_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { 
- return 0; + return; } if (*n == 1) { @@ -926,7 +926,7 @@ static doublereal c_b28 = 1.; i__1 = a_dim1 + 1; a[i__1].r = 1., a[i__1].i = 0.; } - return 0; + return; } /* Get machine constants. */ @@ -1002,7 +1002,7 @@ static doublereal c_b28 = 1.; rwork[1] = (doublereal) lrwmin; iwork[1] = liwmin; - return 0; + return; /* End of ZHEEVD_2STAGE */ diff --git a/lapack-netlib/SRC/zheevr.c b/lapack-netlib/SRC/zheevr.c index f5dfaf9a22..16f10593ec 100644 --- a/lapack-netlib/SRC/zheevr.c +++ b/lapack-netlib/SRC/zheevr.c @@ -874,7 +874,7 @@ f"> */ /* > California at Berkeley, USA \n */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zheevr_(char *jobz, char *range, char *uplo, integer *n, +/* Subroutine */ void zheevr_(char *jobz, char *range, char *uplo, integer *n, doublecomplex *a, integer *lda, doublereal *vl, doublereal *vu, integer *il, integer *iu, doublereal *abstol, integer *m, doublereal * w, doublecomplex *z__, integer *ldz, integer *isuppz, doublecomplex * @@ -891,7 +891,7 @@ f"> */ doublereal rmin, rmax; logical test; integer itmp1, i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); integer indrd, indre; doublereal sigma; @@ -899,11 +899,11 @@ f"> */ integer iinfo; char order[1]; integer indwk; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer lwmin; logical lower, wantz; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer nb, jj; extern doublereal dlamch_(char *); @@ -913,32 +913,33 @@ f"> */ doublereal safmin; extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); 
- extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); doublereal abstll, bignum; integer indtau, indisp; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *); integer indiwo, indwkn; - extern /* Subroutine */ int dstebz_(char *, char *, integer *, doublereal + extern /* Subroutine */ void dstebz_(char *, char *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); integer indrwk, liwmin; - extern /* Subroutine */ int zhetrd_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhetrd_(char *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, doublecomplex *, integer *, integer *); logical tryrac; integer lrwmin, llwrkn, llwork, nsplit; doublereal smlnum; - extern /* Subroutine */ int zstein_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void zstein_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, doublecomplex *, integer *, doublereal *, integer *, integer *, integer *); logical lquery; integer lwkopt; extern doublereal zlansy_(char *, char *, integer *, doublecomplex *, integer *, doublereal *); - extern /* Subroutine */ int zstemr_(char *, char *, integer *, doublereal + extern /* Subroutine */ void zstemr_(char *, char *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, integer *, doublereal *, doublecomplex *, integer *, integer *, integer *, logical *, doublereal *, integer *, integer *, integer @@ -1053,9 +1054,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHEEVR", &i__1, (ftnlen)6); - return 
0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -1063,7 +1064,7 @@ f"> */ *m = 0; if (*n == 0) { work[1].r = 1., work[1].i = 0.; - return 0; + return; } if (*n == 1) { @@ -1087,7 +1088,7 @@ f"> */ isuppz[1] = 1; isuppz[2] = 1; } - return 0; + return; } /* Get machine constants. */ @@ -1307,7 +1308,7 @@ f"> */ rwork[1] = (doublereal) lrwmin; iwork[1] = liwmin; - return 0; + return; /* End of ZHEEVR */ diff --git a/lapack-netlib/SRC/zheevr_2stage.c b/lapack-netlib/SRC/zheevr_2stage.c index 7042343cf1..8f524c77bb 100644 --- a/lapack-netlib/SRC/zheevr_2stage.c +++ b/lapack-netlib/SRC/zheevr_2stage.c @@ -922,7 +922,7 @@ static integer c_n1 = -1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zheevr_2stage_(char *jobz, char *range, char *uplo, +/* Subroutine */ void zheevr_2stage_(char *jobz, char *range, char *uplo, integer *n, doublecomplex *a, integer *lda, doublereal *vl, doublereal *vu, integer *il, integer *iu, doublereal *abstol, integer *m, doublereal *w, doublecomplex *z__, integer *ldz, integer *isuppz, @@ -941,25 +941,25 @@ static integer c_n1 = -1; doublereal rmin, rmax; logical test; integer itmp1, i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); integer indrd, indre; doublereal sigma; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int zhetrd_2stage_(char *, char *, integer *, + extern /* Subroutine */ void zhetrd_2stage_(char *, char *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); char order[1]; integer indwk, lhtrd; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer 
*); integer lwmin; logical lower; integer lwtrd; logical wantz; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer ib, kd, jj; extern doublereal dlamch_(char *); @@ -969,14 +969,15 @@ static integer c_n1 = -1; doublereal safmin; extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); doublereal abstll, bignum; integer indtau, indisp; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *); integer indiwo, indwkn; - extern /* Subroutine */ int dstebz_(char *, char *, integer *, doublereal + extern /* Subroutine */ void dstebz_(char *, char *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); @@ -984,13 +985,13 @@ static integer c_n1 = -1; logical tryrac; integer lrwmin, llwrkn, llwork, nsplit; doublereal smlnum; - extern /* Subroutine */ int zstein_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void zstein_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, doublecomplex *, integer *, doublereal *, integer *, integer *, integer *); logical lquery; extern doublereal zlansy_(char *, char *, integer *, doublecomplex *, integer *, doublereal *); - extern /* Subroutine */ int zstemr_(char *, char *, integer *, doublereal + extern /* Subroutine */ void zstemr_(char *, char *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer 
*, integer *, doublereal *, doublecomplex *, integer *, integer *, integer *, logical *, doublereal *, integer *, integer *, integer @@ -1099,9 +1100,9 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("ZHEEVR_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -1109,7 +1110,7 @@ static integer c_n1 = -1; *m = 0; if (*n == 0) { work[1].r = 1., work[1].i = 0.; - return 0; + return; } if (*n == 1) { @@ -1133,7 +1134,7 @@ static integer c_n1 = -1; isuppz[1] = 1; isuppz[2] = 1; } - return 0; + return; } /* Get machine constants. */ @@ -1355,7 +1356,7 @@ static integer c_n1 = -1; rwork[1] = (doublereal) lrwmin; iwork[1] = liwmin; - return 0; + return; /* End of ZHEEVR_2STAGE */ diff --git a/lapack-netlib/SRC/zheevx.c b/lapack-netlib/SRC/zheevx.c index 9648c2d092..c8e4d0b9a0 100644 --- a/lapack-netlib/SRC/zheevx.c +++ b/lapack-netlib/SRC/zheevx.c @@ -772,7 +772,7 @@ f"> */ /* > \ingroup complex16HEeigen */ /* ===================================================================== */ -/* Subroutine */ int zheevx_(char *jobz, char *range, char *uplo, integer *n, +/* Subroutine */ void zheevx_(char *jobz, char *range, char *uplo, integer *n, doublecomplex *a, integer *lda, doublereal *vl, doublereal *vu, integer *il, integer *iu, doublereal *abstol, integer *m, doublereal * w, doublecomplex *z__, integer *ldz, doublecomplex *work, integer * @@ -790,16 +790,16 @@ f"> */ doublereal rmin, rmax; logical test; integer itmp1, i__, j, indee; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); integer iinfo; char order[1]; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); logical lower, wantz; - 
extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer nb, jj; extern doublereal dlamch_(char *); @@ -809,32 +809,33 @@ f"> */ doublereal safmin; extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); doublereal abstll, bignum; extern doublereal zlanhe_(char *, char *, integer *, doublecomplex *, integer *, doublereal *); integer indiwk, indisp, indtau; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *), dstebz_(char *, char *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); integer indrwk, indwrk; - extern /* Subroutine */ int zhetrd_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhetrd_(char *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, doublecomplex *, integer *, integer *); integer lwkmin; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer llwork, nsplit; doublereal smlnum; - extern /* Subroutine */ int zstein_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void zstein_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, doublecomplex *, integer *, doublereal *, integer *, integer *, integer *); integer lwkopt; logical lquery; - extern /* 
Subroutine */ int zsteqr_(char *, integer *, doublereal *, + extern /* Subroutine */ void zsteqr_(char *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublereal *, integer *), zungtr_(char *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), zunmtr_(char *, char *, char *, integer *, integer *, @@ -931,16 +932,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHEEVX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -961,7 +962,7 @@ f"> */ i__1 = z_dim1 + 1; z__[i__1].r = 1., z__[i__1].i = 0.; } - return 0; + return; } /* Get machine constants. */ @@ -1144,7 +1145,7 @@ f"> */ work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZHEEVX */ diff --git a/lapack-netlib/SRC/zheevx_2stage.c b/lapack-netlib/SRC/zheevx_2stage.c index 851b4adf33..cc2503067d 100644 --- a/lapack-netlib/SRC/zheevx_2stage.c +++ b/lapack-netlib/SRC/zheevx_2stage.c @@ -822,7 +822,7 @@ static integer c__4 = 4; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zheevx_2stage_(char *jobz, char *range, char *uplo, +/* Subroutine */ void zheevx_2stage_(char *jobz, char *range, char *uplo, integer *n, doublecomplex *a, integer *lda, doublereal *vl, doublereal *vu, integer *il, integer *iu, doublereal *abstol, integer *m, doublereal *w, doublecomplex *z__, integer *ldz, doublecomplex * @@ -842,24 +842,24 @@ static integer c__4 = 4; doublereal rmin, rmax; logical test; integer itmp1, i__, j, indee; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zhetrd_2stage_(char *, char *, integer *, + extern /* 
Subroutine */ void zhetrd_2stage_(char *, char *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); integer iinfo; char order[1]; integer lhtrd; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer lwmin; logical lower; integer lwtrd; logical wantz; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer ib, kd, jj; extern doublereal dlamch_(char *); @@ -867,27 +867,28 @@ static integer c__4 = 4; integer iscale, indibl; logical valeig; doublereal safmin; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); doublereal abstll, bignum; extern doublereal zlanhe_(char *, char *, integer *, doublecomplex *, integer *, doublereal *); integer indiwk, indisp, indtau; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *), dstebz_(char *, char *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); integer indrwk, indwrk; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer nsplit, llwork; doublereal smlnum; - extern /* Subroutine */ int zstein_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void zstein_(integer *, doublereal *, doublereal 
*, integer *, doublereal *, integer *, integer *, doublecomplex *, integer *, doublereal *, integer *, integer *, integer *); logical lquery; - extern /* Subroutine */ int zsteqr_(char *, integer *, doublereal *, + extern /* Subroutine */ void zsteqr_(char *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublereal *, integer *), zungtr_(char *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), zunmtr_(char *, char *, char *, integer *, integer *, @@ -986,16 +987,16 @@ static integer c__4 = 4; if (*info != 0) { i__1 = -(*info); xerbla_("ZHEEVX_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -1016,7 +1017,7 @@ static integer c__4 = 4; i__1 = z_dim1 + 1; z__[i__1].r = 1., z__[i__1].i = 0.; } - return 0; + return; } /* Get machine constants. */ @@ -1202,7 +1203,7 @@ static integer c__4 = 4; work[1].r = (doublereal) lwmin, work[1].i = 0.; - return 0; + return; /* End of ZHEEVX_2STAGE */ diff --git a/lapack-netlib/SRC/zhegs2.c b/lapack-netlib/SRC/zhegs2.c index ee1d674b31..ea7aaf9238 100644 --- a/lapack-netlib/SRC/zhegs2.c +++ b/lapack-netlib/SRC/zhegs2.c @@ -643,7 +643,7 @@ f"> */ /* > \ingroup complex16HEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zhegs2_(integer *itype, char *uplo, integer *n, +/* Subroutine */ void zhegs2_(integer *itype, char *uplo, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, integer *info) { @@ -653,20 +653,21 @@ f"> */ doublecomplex z__1; /* Local variables */ - extern /* Subroutine */ int zher2_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zher2_(char *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer k; extern logical lsame_(char *, char *); 
logical upper; - extern /* Subroutine */ int zaxpy_(integer *, doublecomplex *, + extern /* Subroutine */ void zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), ztrmv_( char *, char *, char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), ztrsv_(char * , char *, char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex ct; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *), zlacgv_( integer *, doublecomplex *, integer *); doublereal akk, bkk; @@ -708,7 +709,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHEGS2", &i__1, (ftnlen)6); - return 0; + return; } if (*itype == 1) { @@ -886,7 +887,7 @@ f"> */ } } } - return 0; + return; /* End of ZHEGS2 */ diff --git a/lapack-netlib/SRC/zhegst.c b/lapack-netlib/SRC/zhegst.c index 5d70105677..d7517e27a0 100644 --- a/lapack-netlib/SRC/zhegst.c +++ b/lapack-netlib/SRC/zhegst.c @@ -645,7 +645,7 @@ f"> */ /* > \ingroup complex16HEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zhegst_(integer *itype, char *uplo, integer *n, +/* Subroutine */ void zhegst_(integer *itype, char *uplo, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, integer *info) { @@ -656,11 +656,11 @@ f"> */ /* Local variables */ integer k; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zhemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zhemm_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrmm_(char *, char *, char *, char *, integer *, 
integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), ztrsm_(char *, char *, char *, char *, integer *, integer *, @@ -713,13 +713,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHEGST", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine the block size for this environment. */ @@ -903,7 +903,7 @@ f"> */ } } } - return 0; + return; /* End of ZHEGST */ diff --git a/lapack-netlib/SRC/zhegv.c b/lapack-netlib/SRC/zhegv.c index 5e5c46ba5a..77d2ddea4e 100644 --- a/lapack-netlib/SRC/zhegv.c +++ b/lapack-netlib/SRC/zhegv.c @@ -695,7 +695,7 @@ static integer c_n1 = -1; /* > \ingroup complex16HEeigen */ /* ===================================================================== */ -/* Subroutine */ int zhegv_(integer *itype, char *jobz, char *uplo, integer * +/* Subroutine */ void zhegv_(integer *itype, char *jobz, char *uplo, integer * n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublereal *w, doublecomplex *work, integer *lwork, doublereal *rwork, integer *info) @@ -706,12 +706,12 @@ static integer c_n1 = -1; /* Local variables */ integer neig; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zheev_(char *, char *, integer *, + extern /* Subroutine */ void zheev_(char *, char *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *, doublereal *, integer *); char trans[1]; logical upper, wantz; - extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrmm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), ztrsm_(char *, char *, char *, char *, integer *, integer *, @@ -721,7 +721,7 @@ static integer c_n1 = -1; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, 
ftnlen); - extern /* Subroutine */ int zhegst_(integer *, char *, integer *, + extern /* Subroutine */ void zhegst_(integer *, char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); integer lwkopt; logical lquery; @@ -789,15 +789,15 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("ZHEGV ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. */ @@ -805,7 +805,7 @@ static integer c_n1 = -1; zpotrf_(uplo, n, &b[b_offset], ldb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. */ @@ -854,7 +854,7 @@ static integer c_n1 = -1; work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZHEGV */ diff --git a/lapack-netlib/SRC/zhegv_2stage.c b/lapack-netlib/SRC/zhegv_2stage.c index 80606d6347..9ae891db19 100644 --- a/lapack-netlib/SRC/zhegv_2stage.c +++ b/lapack-netlib/SRC/zhegv_2stage.c @@ -749,7 +749,7 @@ stage.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zhegv_2stage_(integer *itype, char *jobz, char *uplo, +/* Subroutine */ void zhegv_2stage_(integer *itype, char *jobz, char *uplo, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublereal *w, doublecomplex *work, integer *lwork, doublereal * rwork, integer *info) @@ -761,7 +761,7 @@ stage.f"> */ integer neig; extern integer ilaenv2stage_(integer *, char *, char *, integer *, integer *, integer *, integer *); - extern /* Subroutine */ int zheev_2stage_(char *, char *, integer *, + extern /* Subroutine */ void zheev_2stage_(char *, char *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *, doublereal *, integer *); extern logical lsame_(char *, char *); @@ -770,14 +770,15 @@ stage.f"> */ 
logical upper; integer lwtrd; logical wantz; - extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrmm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); integer ib, kd; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zhegst_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zhegst_( integer *, char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); logical lquery; @@ -847,15 +848,15 @@ stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHEGV_2STAGE ", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. */ @@ -863,7 +864,7 @@ stage.f"> */ zpotrf_(uplo, n, &b[b_offset], ldb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. */ @@ -912,7 +913,7 @@ stage.f"> */ work[1].r = (doublereal) lwmin, work[1].i = 0.; - return 0; + return; /* End of ZHEGV_2STAGE */ diff --git a/lapack-netlib/SRC/zhegvd.c b/lapack-netlib/SRC/zhegvd.c index 6c0ca98910..6ce5b95523 100644 --- a/lapack-netlib/SRC/zhegvd.c +++ b/lapack-netlib/SRC/zhegvd.c @@ -761,7 +761,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. 
of Kentucky, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zhegvd_(integer *itype, char *jobz, char *uplo, integer * +/* Subroutine */ void zhegvd_(integer *itype, char *jobz, char *uplo, integer * n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublereal *w, doublecomplex *work, integer *lwork, doublereal *rwork, integer *lrwork, integer *iwork, integer *liwork, integer *info) @@ -779,18 +779,19 @@ f"> */ logical upper; integer lropt; logical wantz; - extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrmm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *), xerbla_(char *, - integer *, ftnlen), zheevd_(char *, char *, integer *, + integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zheevd_(char *, char *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *, doublereal *, integer *, integer *, integer *, integer *); integer liwmin; - extern /* Subroutine */ int zhegst_(integer *, char *, integer *, + extern /* Subroutine */ void zhegst_(integer *, char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); integer lrwmin; logical lquery; @@ -874,15 +875,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHEGVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. */ @@ -890,7 +891,7 @@ f"> */ zpotrf_(uplo, n, &b[b_offset], ldb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. 
*/ @@ -946,7 +947,7 @@ f"> */ rwork[1] = (doublereal) lropt; iwork[1] = liopt; - return 0; + return; /* End of ZHEGVD */ diff --git a/lapack-netlib/SRC/zhegvd.f b/lapack-netlib/SRC/zhegvd.f index 2e92255df8..eeda656ad1 100644 --- a/lapack-netlib/SRC/zhegvd.f +++ b/lapack-netlib/SRC/zhegvd.f @@ -360,9 +360,9 @@ SUBROUTINE ZHEGVD( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W, WORK, CALL ZHEGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) CALL ZHEEVD( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, RWORK, LRWORK, $ IWORK, LIWORK, INFO ) - LOPT = MAX( DBLE( LOPT ), DBLE( WORK( 1 ) ) ) - LROPT = MAX( DBLE( LROPT ), DBLE( RWORK( 1 ) ) ) - LIOPT = MAX( DBLE( LIOPT ), DBLE( IWORK( 1 ) ) ) + LOPT = INT( MAX( DBLE( LOPT ), DBLE( WORK( 1 ) ) ) ) + LROPT = INT( MAX( DBLE( LROPT ), DBLE( RWORK( 1 ) ) ) ) + LIOPT = INT( MAX( DBLE( LIOPT ), DBLE( IWORK( 1 ) ) ) ) * IF( WANTZ .AND. INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/zhegvx.c b/lapack-netlib/SRC/zhegvx.c index 28f628af06..77e5ddd74d 100644 --- a/lapack-netlib/SRC/zhegvx.c +++ b/lapack-netlib/SRC/zhegvx.c @@ -820,7 +820,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. 
of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int zhegvx_(integer *itype, char *jobz, char *range, char * +/* Subroutine */ void zhegvx_(integer *itype, char *jobz, char *range, char * uplo, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublereal *vl, doublereal *vu, integer *il, integer * iu, doublereal *abstol, integer *m, doublereal *w, doublecomplex *z__, @@ -834,7 +834,7 @@ f"> */ extern logical lsame_(char *, char *); char trans[1]; logical upper, wantz; - extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrmm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), ztrsm_(char *, char *, char *, char *, integer *, integer *, @@ -845,7 +845,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zhegst_(integer *, char *, integer *, + extern /* Subroutine */ void zhegst_(integer *, char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), zheevx_(char *, char *, char *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, doublecomplex * @@ -944,16 +944,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHEGVX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. */ @@ -961,7 +961,7 @@ f"> */ zpotrf_(uplo, n, &b[b_offset], ldb, info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. 
*/ @@ -1012,7 +1012,7 @@ f"> */ work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZHEGVX */ diff --git a/lapack-netlib/SRC/zherfs.c b/lapack-netlib/SRC/zherfs.c index 9dbcd7c6f2..47496c5198 100644 --- a/lapack-netlib/SRC/zherfs.c +++ b/lapack-netlib/SRC/zherfs.c @@ -705,7 +705,7 @@ f"> */ /* > \ingroup complex16HEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zherfs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zherfs_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *af, integer *ldaf, integer *ipiv, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *ferr, doublereal *berr, doublecomplex *work, @@ -724,11 +724,11 @@ f"> */ doublereal s; extern logical lsame_(char *, char *); integer isave[3], count; - extern /* Subroutine */ int zhemv_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhemv_(char *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), zlacn2_( integer *, doublecomplex *, doublecomplex *, doublereal *, @@ -739,7 +739,7 @@ f"> */ doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal lstres; - extern /* Subroutine */ int zhetrs_(char *, integer *, integer *, + extern /* Subroutine */ void zhetrs_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); doublereal eps; @@ -796,7 +796,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHERFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if 
possible */ @@ -808,7 +808,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -1042,7 +1042,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of ZHERFS */ diff --git a/lapack-netlib/SRC/zhesv.c b/lapack-netlib/SRC/zhesv.c index d7bc12465a..64c94960f7 100644 --- a/lapack-netlib/SRC/zhesv.c +++ b/lapack-netlib/SRC/zhesv.c @@ -684,7 +684,7 @@ static integer c_n1 = -1; /* > \ingroup complex16HEsolve */ /* ===================================================================== */ -/* Subroutine */ int zhesv_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zhesv_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *b, integer *ldb, doublecomplex *work, integer *lwork, integer *info) { @@ -697,12 +697,12 @@ static integer c_n1 = -1; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zhetrf_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhetrf_(char *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *), zhetrs_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int zhetrs2_(char *, integer *, integer *, + extern /* Subroutine */ void zhetrs2_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); @@ -759,9 +759,9 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("ZHESV ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = U*D*U**H or A = L*D*L**H. 
*/ @@ -791,7 +791,7 @@ static integer c_n1 = -1; work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZHESV */ diff --git a/lapack-netlib/SRC/zhesv_aa.c b/lapack-netlib/SRC/zhesv_aa.c index 1384fdee24..5878e34707 100644 --- a/lapack-netlib/SRC/zhesv_aa.c +++ b/lapack-netlib/SRC/zhesv_aa.c @@ -674,7 +674,7 @@ a.f"> */ /* > \ingroup complex16HEsolve */ /* ===================================================================== */ -/* Subroutine */ int zhesv_aa_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zhesv_aa_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *b, integer *ldb, doublecomplex *work, integer *lwork, integer *info) { @@ -684,10 +684,11 @@ a.f"> */ /* Local variables */ integer lwkopt_hetrf__, lwkopt_hetrs__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zhetrf_aa_(char *, integer *, doublecomplex * + extern /* Subroutine */ void zhetrf_aa_(char *, integer *, doublecomplex * , integer *, integer *, doublecomplex *, integer *, integer *), zhetrs_aa_(char *, integer *, integer *, doublecomplex * , integer *, integer *, doublecomplex *, integer *, doublecomplex - *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer lwkopt; logical lquery; @@ -748,9 +749,9 @@ a.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHESV_AA ", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = U**H*T*U or A = L*T*L**H. 
*/ @@ -767,7 +768,7 @@ a.f"> */ work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZHESV_AA */ diff --git a/lapack-netlib/SRC/zhesv_aa_2stage.c b/lapack-netlib/SRC/zhesv_aa_2stage.c index f435609782..e2665440d9 100644 --- a/lapack-netlib/SRC/zhesv_aa_2stage.c +++ b/lapack-netlib/SRC/zhesv_aa_2stage.c @@ -699,7 +699,7 @@ a_2stage.f"> */ /* > \ingroup complex16HEsolve */ /* ===================================================================== */ -/* Subroutine */ int zhesv_aa_2stage_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zhesv_aa_2stage_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *tb, integer *ltb, integer *ipiv, integer *ipiv2, doublecomplex *b, integer *ldb, doublecomplex *work, integer *lwork, integer *info) @@ -708,7 +708,7 @@ a_2stage.f"> */ integer a_dim1, a_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int zhetrf_aa_2stage_(char *, integer *, + extern /* Subroutine */ void zhetrf_aa_2stage_(char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, doublecomplex *, integer *, integer *), zhetrs_aa_2stage_(char *, integer *, integer *, doublecomplex *, @@ -775,9 +775,9 @@ a_2stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHESV_AA_2STAGE", &i__1, (ftnlen)15); - return 0; + return; } else if (wquery || tquery) { - return 0; + return; } /* Compute the factorization A = U**H*T*U or A = L*T*L**H. 
*/ @@ -795,7 +795,7 @@ a_2stage.f"> */ work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZHESV_AA_2STAGE */ diff --git a/lapack-netlib/SRC/zhesv_rk.c b/lapack-netlib/SRC/zhesv_rk.c index b95143bcb3..36079d8cfd 100644 --- a/lapack-netlib/SRC/zhesv_rk.c +++ b/lapack-netlib/SRC/zhesv_rk.c @@ -740,7 +740,7 @@ k.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zhesv_rk_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zhesv_rk_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *e, integer *ipiv, doublecomplex *b, integer *ldb, doublecomplex *work, integer *lwork, integer *info) @@ -749,13 +749,14 @@ k.f"> */ integer a_dim1, a_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int zhetrs_3_(char *, integer *, integer *, + extern /* Subroutine */ void zhetrs_3_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int zhetrf_rk_(char *, integer *, doublecomplex * + extern /* Subroutine */ void zhetrf_rk_(char *, integer *, doublecomplex * , integer *, doublecomplex *, integer *, doublecomplex *, integer - *, integer *), xerbla_(char *, integer *, ftnlen); + *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer lwkopt; logical lquery; @@ -813,9 +814,9 @@ k.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHESV_RK ", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = P*U*D*(U**H)*(P**T) or */ @@ -835,7 +836,7 @@ k.f"> */ work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZHESV_RK */ diff --git a/lapack-netlib/SRC/zhesv_rk.f b/lapack-netlib/SRC/zhesv_rk.f index 1ec75cc04b..6333e9f363 100644 --- a/lapack-netlib/SRC/zhesv_rk.f +++ 
b/lapack-netlib/SRC/zhesv_rk.f @@ -280,7 +280,7 @@ SUBROUTINE ZHESV_RK( UPLO, N, NRHS, A, LDA, E, IPIV, B, LDB, WORK, LWKOPT = 1 ELSE CALL ZHETRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = DBLE( WORK(1) ) + LWKOPT = INT( DBLE( WORK( 1 ) ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/zhesv_rook.c b/lapack-netlib/SRC/zhesv_rook.c index 05541e18f1..4c166c1233 100644 --- a/lapack-netlib/SRC/zhesv_rook.c +++ b/lapack-netlib/SRC/zhesv_rook.c @@ -719,7 +719,7 @@ ook.f"> */ /* ===================================================================== */ -/* Subroutine */ int zhesv_rook_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zhesv_rook_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *b, integer *ldb, doublecomplex *work, integer *lwork, integer *info) { @@ -727,7 +727,7 @@ ook.f"> */ integer a_dim1, a_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int zhetrs_rook_(char *, integer *, integer *, + extern /* Subroutine */ void zhetrs_rook_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); extern logical lsame_(char *, char *); @@ -737,7 +737,7 @@ ook.f"> */ integer *, integer *, ftnlen, ftnlen); integer lwkopt; logical lquery; - extern /* Subroutine */ int zhetrf_rook_(char *, integer *, + extern /* Subroutine */ void zhetrf_rook_(char *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); @@ -794,9 +794,9 @@ ook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHESV_ROOK ", &i__1, (ftnlen)11); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = U*D*U**H or A = L*D*L**H. 
*/ @@ -815,7 +815,7 @@ ook.f"> */ work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZHESV_ROOK */ diff --git a/lapack-netlib/SRC/zhesvx.c b/lapack-netlib/SRC/zhesvx.c index 4de9033d05..a71a0745c3 100644 --- a/lapack-netlib/SRC/zhesvx.c +++ b/lapack-netlib/SRC/zhesvx.c @@ -797,7 +797,7 @@ f"> */ /* > \ingroup complex16HEsolve */ /* ===================================================================== */ -/* Subroutine */ int zhesvx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void zhesvx_(char *fact, char *uplo, integer *n, integer * nrhs, doublecomplex *a, integer *lda, doublecomplex *af, integer * ldaf, integer *ipiv, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *rcond, doublereal *ferr, doublereal *berr, @@ -818,7 +818,7 @@ f"> */ integer *, integer *, ftnlen, ftnlen); extern doublereal zlanhe_(char *, char *, integer *, doublecomplex *, integer *, doublereal *); - extern /* Subroutine */ int zhecon_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhecon_(char *, integer *, doublecomplex *, integer *, integer *, doublereal *, doublereal *, doublecomplex *, integer *), zherfs_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, @@ -909,9 +909,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHESVX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } if (nofact) { @@ -926,7 +926,7 @@ f"> */ if (*info > 0) { *rcond = 0.; - return 0; + return; } } @@ -960,7 +960,7 @@ f"> */ work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZHESVX */ diff --git a/lapack-netlib/SRC/zhesvxx.c b/lapack-netlib/SRC/zhesvxx.c index 696c9f306b..54b52afc2c 100644 --- a/lapack-netlib/SRC/zhesvxx.c +++ b/lapack-netlib/SRC/zhesvxx.c @@ -1012,7 +1012,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16HEsolve */ /* 
===================================================================== */ -/* Subroutine */ int zhesvxx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void zhesvxx_(char *fact, char *uplo, integer *n, integer * nrhs, doublecomplex *a, integer *lda, doublecomplex *af, integer * ldaf, integer *ipiv, char *equed, doublereal *s, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *rcond, @@ -1040,14 +1040,14 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ logical nofact; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; - extern /* Subroutine */ int zlaqhe_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zlaqhe_(char *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublereal *, char *); integer infequ; - extern /* Subroutine */ int zhetrf_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhetrf_(char *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *), zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal smlnum; - extern /* Subroutine */ int zhetrs_(char *, integer *, integer *, + extern /* Subroutine */ void zhetrs_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *), zlascl2_(integer *, integer *, doublereal *, doublecomplex *, integer *), zheequb_(char *, integer *, @@ -1167,7 +1167,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZHESVXX", &i__1, (ftnlen)7); - return 0; + return; } if (equil) { @@ -1212,7 +1212,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rpvgrw = zla_herpvgrw_(uplo, n, info, &a[a_offset], lda, & af[af_offset], ldaf, &ipiv[1], &rwork[1]); } - return 0; + return; } } @@ -1244,7 +1244,7 @@ static inline void 
zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ zlascl2_(n, nrhs, &s[1], &x[x_offset], ldx); } - return 0; + return; /* End of ZHESVXX */ diff --git a/lapack-netlib/SRC/zheswapr.c b/lapack-netlib/SRC/zheswapr.c index d8ec5b1b06..999b3f43bc 100644 --- a/lapack-netlib/SRC/zheswapr.c +++ b/lapack-netlib/SRC/zheswapr.c @@ -616,7 +616,7 @@ r.f"> */ /* > \ingroup complex16HEauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zheswapr_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zheswapr_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *i1, integer *i2) { /* System generated locals */ @@ -627,7 +627,7 @@ r.f"> */ integer i__; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex tmp; @@ -749,6 +749,6 @@ r.f"> */ } } - return 0; + return; } /* zheswapr_ */ diff --git a/lapack-netlib/SRC/zhetd2.c b/lapack-netlib/SRC/zhetd2.c index 6e6caad797..af8d2532e9 100644 --- a/lapack-netlib/SRC/zhetd2.c +++ b/lapack-netlib/SRC/zhetd2.c @@ -690,7 +690,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zhetd2_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zhetd2_(char *uplo, integer *n, doublecomplex *a, integer *lda, doublereal *d__, doublereal *e, doublecomplex *tau, integer *info) { @@ -701,7 +701,7 @@ f"> */ /* Local variables */ doublecomplex taui; - extern /* Subroutine */ int zher2_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zher2_(char *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer i__; @@ -709,13 +709,14 @@ f"> */ extern logical lsame_(char *, char *); extern /* Double 
Complex */ VOID zdotc_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zhemv_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhemv_(char *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int zaxpy_(integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *), xerbla_( - char *, integer *, ftnlen), zlarfg_(integer *, doublecomplex *, + extern /* Subroutine */ void zaxpy_(integer *, doublecomplex *, + doublecomplex *, integer *, doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); @@ -751,13 +752,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHETD2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } if (upper) { @@ -912,7 +913,7 @@ f"> */ d__[i__1] = a[i__2].r; } - return 0; + return; /* End of ZHETD2 */ diff --git a/lapack-netlib/SRC/zhetf2.c b/lapack-netlib/SRC/zhetf2.c index 6afddd6126..0ded5c3a7d 100644 --- a/lapack-netlib/SRC/zhetf2.c +++ b/lapack-netlib/SRC/zhetf2.c @@ -706,7 +706,7 @@ f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zhetf2_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zhetf2_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -716,7 +716,7 @@ f"> */ /* Local variables */ integer imax, jmax; - extern /* Subroutine */ int zher_(char *, integer *, doublereal *, + extern /* Subroutine */ void zher_(char *, integer *, doublereal *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal d__; integer i__, j, k; @@ -726,7 +726,7 @@ f"> */ integer 
kstep; logical upper; doublereal r1; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern doublereal dlapy2_(doublereal *, doublereal *); doublereal d11; @@ -738,7 +738,8 @@ f"> */ doublecomplex wk; doublereal tt; extern logical disnan_(doublereal *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); doublereal colmax; extern integer izamax_(integer *, doublecomplex *, integer *); @@ -776,7 +777,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHETF2", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. */ @@ -1379,7 +1380,7 @@ f"> */ } L90: - return 0; + return; /* End of ZHETF2 */ diff --git a/lapack-netlib/SRC/zhetf2_rk.c b/lapack-netlib/SRC/zhetf2_rk.c index df630973f6..95131cfca7 100644 --- a/lapack-netlib/SRC/zhetf2_rk.c +++ b/lapack-netlib/SRC/zhetf2_rk.c @@ -755,7 +755,7 @@ rk.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zhetf2_rk_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zhetf2_rk_(char *uplo, integer *n, doublecomplex *a, integer *lda, doublecomplex *e, integer *ipiv, integer *info) { /* System generated locals */ @@ -766,7 +766,7 @@ rk.f"> */ /* Local variables */ logical done; integer imax, jmax; - extern /* Subroutine */ int zher_(char *, integer *, doublereal *, + extern /* Subroutine */ void zher_(char *, integer *, doublereal *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal d__; integer i__, j, k, p; @@ -777,7 +777,7 @@ rk.f"> */ integer itemp, kstep; logical upper; doublereal r1; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* 
Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern doublereal dlapy2_(doublereal *, doublereal *); doublereal d11; @@ -790,7 +790,8 @@ rk.f"> */ doublereal absakk; doublecomplex wk; doublereal tt; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); doublereal colmax; extern integer izamax_(integer *, doublecomplex *, integer *); @@ -830,7 +831,7 @@ rk.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHETF2_RK", &i__1, (ftnlen)9); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. */ @@ -1839,7 +1840,7 @@ rk.f"> */ ; } - return 0; + return; /* End of ZHETF2_RK */ diff --git a/lapack-netlib/SRC/zhetf2_rook.c b/lapack-netlib/SRC/zhetf2_rook.c index 2b5e16ee64..40ebc9661c 100644 --- a/lapack-netlib/SRC/zhetf2_rook.c +++ b/lapack-netlib/SRC/zhetf2_rook.c @@ -708,7 +708,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zhetf2_rook_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zhetf2_rook_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -719,7 +719,7 @@ rook.f"> */ /* Local variables */ logical done; integer imax, jmax; - extern /* Subroutine */ int zher_(char *, integer *, doublereal *, + extern /* Subroutine */ void zher_(char *, integer *, doublereal *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal d__; integer i__, j, k, p; @@ -730,7 +730,7 @@ rook.f"> */ integer itemp, kstep; logical upper; doublereal r1; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern doublereal dlapy2_(doublereal *, 
doublereal *); doublereal d11; @@ -743,7 +743,8 @@ rook.f"> */ doublereal absakk; doublecomplex wk; doublereal tt; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); doublereal colmax; extern integer izamax_(integer *, doublecomplex *, integer *); @@ -782,7 +783,7 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHETF2_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. */ @@ -1685,7 +1686,7 @@ rook.f"> */ L70: - return 0; + return; /* End of ZHETF2_ROOK */ diff --git a/lapack-netlib/SRC/zhetrd.c b/lapack-netlib/SRC/zhetrd.c index 6cf300d02c..a08e38a9cf 100644 --- a/lapack-netlib/SRC/zhetrd.c +++ b/lapack-netlib/SRC/zhetrd.c @@ -709,7 +709,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zhetrd_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zhetrd_(char *uplo, integer *n, doublecomplex *a, integer *lda, doublereal *d__, doublereal *e, doublecomplex *tau, doublecomplex *work, integer *lwork, integer *info) { @@ -722,7 +722,7 @@ f"> */ extern logical lsame_(char *, char *); integer nbmin, iinfo; logical upper; - extern /* Subroutine */ int zhetd2_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhetd2_(char *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, integer *), zher2k_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *); @@ -730,7 +730,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zlatrd_(char *, 
integer *, integer *, + extern /* Subroutine */ void zlatrd_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *, doublecomplex *, integer *); integer ldwork, lwkopt; @@ -785,16 +785,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { work[1].r = 1., work[1].i = 0.; - return 0; + return; } nx = *n; @@ -931,7 +931,7 @@ f"> */ } work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZHETRD */ diff --git a/lapack-netlib/SRC/zhetrd_2stage.c b/lapack-netlib/SRC/zhetrd_2stage.c index a8946c38e8..1d52bdaea1 100644 --- a/lapack-netlib/SRC/zhetrd_2stage.c +++ b/lapack-netlib/SRC/zhetrd_2stage.c @@ -740,7 +740,7 @@ static integer c__4 = 4; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zhetrd_2stage_(char *vect, char *uplo, integer *n, +/* Subroutine */ void zhetrd_2stage_(char *vect, char *uplo, integer *n, doublecomplex *a, integer *lda, doublereal *d__, doublereal *e, doublecomplex *tau, doublecomplex *hous2, integer *lhous2, doublecomplex *work, integer *lwork, integer *info) @@ -750,12 +750,12 @@ static integer c__4 = 4; /* Local variables */ integer ldab; - extern /* Subroutine */ int zhetrd_he2hb_(char *, integer *, integer *, + extern /* Subroutine */ void zhetrd_he2hb_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); extern integer ilaenv2stage_(integer *, char *, char *, integer *, integer *, integer *, integer *); - extern /* Subroutine */ int zhetrd_hb2st_(char *, char *, char *, + extern /* Subroutine */ void zhetrd_hb2st_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublecomplex *, integer *, integer 
*); @@ -826,16 +826,16 @@ static integer c__4 = 4; if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRD_2STAGE", &i__1, (ftnlen)13); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { work[1].r = 1., work[1].i = 0.; - return 0; + return; } /* Determine pointer position */ @@ -849,20 +849,20 @@ static integer c__4 = 4; if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRD_HE2HB", &i__1, (ftnlen)12); - return 0; + return; } zhetrd_hb2st_("Y", vect, uplo, n, &kd, &work[abpos], &ldab, &d__[1], &e[ 1], &hous2[1], lhous2, &work[wpos], &lwrk, info); if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRD_HB2ST", &i__1, (ftnlen)12); - return 0; + return; } hous2[1].r = (doublereal) lhmin, hous2[1].i = 0.; work[1].r = (doublereal) lwmin, work[1].i = 0.; - return 0; + return; /* End of ZHETRD_2STAGE */ diff --git a/lapack-netlib/SRC/zhetrd_hb2st.c b/lapack-netlib/SRC/zhetrd_hb2st.c index 07bd66764c..cac1d995ef 100644 --- a/lapack-netlib/SRC/zhetrd_hb2st.c +++ b/lapack-netlib/SRC/zhetrd_hb2st.c @@ -746,7 +746,7 @@ hb2st.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zhetrd_hb2st_(char *stage1, char *vect, char *uplo, +/* Subroutine */ void zhetrd_hb2st_(char *stage1, char *vect, char *uplo, integer *n, integer *kd, doublecomplex *ab, integer *ldab, doublereal *d__, doublereal *e, doublecomplex *hous, integer *lhous, doublecomplex *work, integer *lwork, integer *info) @@ -770,14 +770,14 @@ hb2st.f"> */ integer thgrid, thgrnb, indtau; doublereal abstmp; integer ofdpos; - extern /* Subroutine */ int zhb2st_kernels_(char *, logical *, integer *, + extern /* Subroutine */ void zhb2st_kernels_(char *, logical *, integer *, integer *, integer *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *), zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, 
doublecomplex *, integer *), zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); integer blklastind; - extern /* Subroutine */ int mecago_(); + extern /* Subroutine */ void mecago_(); logical lquery, afters1; integer lda, tid, ldv; doublecomplex tmp; @@ -848,9 +848,9 @@ hb2st.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRD_HB2ST", &i__1, (ftnlen)12); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -858,7 +858,7 @@ hb2st.f"> */ if (*n == 0) { hous[1].r = 1., hous[1].i = 0.; work[1].r = 1., work[1].i = 0.; - return 0; + return; } /* Determine pointer position */ @@ -912,7 +912,7 @@ hb2st.f"> */ hous[1].r = 1., hous[1].i = 0.; work[1].r = 1., work[1].i = 0.; - return 0; + return; } /* Case KD=1: */ @@ -993,7 +993,7 @@ hb2st.f"> */ hous[1].r = 1., hous[1].i = 0.; work[1].r = 1., work[1].i = 0.; - return 0; + return; } /* Main code start here. */ @@ -1115,7 +1115,7 @@ hb2st.f"> */ hous[1].r = (doublereal) lhmin, hous[1].i = 0.; work[1].r = (doublereal) lwmin, work[1].i = 0.; - return 0; + return; /* End of ZHETRD_HB2ST */ diff --git a/lapack-netlib/SRC/zhetrd_he2hb.c b/lapack-netlib/SRC/zhetrd_he2hb.c index 235c493f82..669ca5c59a 100644 --- a/lapack-netlib/SRC/zhetrd_he2hb.c +++ b/lapack-netlib/SRC/zhetrd_he2hb.c @@ -760,7 +760,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zhetrd_he2hb_(char *uplo, integer *n, integer *kd, +/* Subroutine */ void zhetrd_he2hb_(char *uplo, integer *n, integer *kd, doublecomplex *a, integer *lda, doublecomplex *ab, integer *ldab, doublecomplex *tau, doublecomplex *work, integer *lwork, integer * info) @@ -776,7 +776,7 @@ f"> */ integer tpos, wpos, s1pos, s2pos, i__, j; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char 
*, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zhemm_(char *, char *, integer *, @@ -785,13 +785,14 @@ f"> */ integer *); integer lwmin; logical upper; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zher2k_(char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *); integer lk, pk, pn, lt, lw; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zgelqf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zgelqf_( integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), zgeqrf_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, @@ -854,10 +855,10 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRD_HE2HB", &i__1, (ftnlen)12); - return 0; + return; } else if (lquery) { work[1].r = (doublereal) lwmin, work[1].i = 0.; - return 0; + return; } /* Quick return if possible */ @@ -886,7 +887,7 @@ f"> */ } } work[1].r = 1., work[1].i = 0.; - return 0; + return; } /* Determine the pointer position for the workspace */ @@ -1082,7 +1083,7 @@ f"> */ } work[1].r = (doublereal) lwmin, work[1].i = 0.; - return 0; + return; /* End of ZHETRD_HE2HB */ diff --git a/lapack-netlib/SRC/zhetrf.c b/lapack-netlib/SRC/zhetrf.c index acd6c9f4b7..a52c02a65a 100644 --- a/lapack-netlib/SRC/zhetrf.c +++ b/lapack-netlib/SRC/zhetrf.c @@ -693,7 +693,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zhetrf_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zhetrf_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex 
*work, integer *lwork, integer *info) { @@ -705,12 +705,13 @@ f"> */ extern logical lsame_(char *, char *); integer nbmin, iinfo; logical upper; - extern /* Subroutine */ int zhetf2_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhetf2_(char *, integer *, doublecomplex *, integer *, integer *, integer *); integer kb, nb; - extern /* Subroutine */ int zlahef_(char *, integer *, integer *, integer + extern /* Subroutine */ void zlahef_(char *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, - integer *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; @@ -763,9 +764,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } nbmin = 2; @@ -892,7 +893,7 @@ f"> */ L40: work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZHETRF */ diff --git a/lapack-netlib/SRC/zhetrf_aa.c b/lapack-netlib/SRC/zhetrf_aa.c index 0bfbfd017b..b3c663bb5b 100644 --- a/lapack-netlib/SRC/zhetrf_aa.c +++ b/lapack-netlib/SRC/zhetrf_aa.c @@ -647,7 +647,7 @@ aa.f"> */ /* > \ingroup complex16HEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zhetrf_aa_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zhetrf_aa_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *work, integer *lwork, integer *info) { @@ -659,18 +659,18 @@ aa.f"> */ /* Local variables */ integer j; doublecomplex alpha; - extern /* Subroutine */ int zlahef_aa_(char *, integer *, integer *, + extern /* Subroutine */ void zlahef_aa_(char *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, 
doublecomplex *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical upper; integer k1, k2, j1, j2, j3; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer jb, nb, mj, nj; @@ -731,15 +731,15 @@ aa.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRF_AA", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return */ if (*n == 0) { - return 0; + return; } ipiv[1] = 1; if (*n == 1) { @@ -747,7 +747,7 @@ aa.f"> */ i__2 = a_dim1 + 1; d__1 = a[i__2].r; a[i__1].r = d__1, a[i__1].i = 0.; - return 0; + return; } /* Adjust block size based on the workspace size */ @@ -1048,7 +1048,7 @@ aa.f"> */ } L20: - return 0; + return; /* End of ZHETRF_AA */ diff --git a/lapack-netlib/SRC/zhetrf_aa_2stage.c b/lapack-netlib/SRC/zhetrf_aa_2stage.c index b88596a5df..a21c6c472b 100644 --- a/lapack-netlib/SRC/zhetrf_aa_2stage.c +++ b/lapack-netlib/SRC/zhetrf_aa_2stage.c @@ -676,7 +676,7 @@ aa_2stage.f"> */ /* > \ingroup complex16SYcomputational */ /* ===================================================================== */ -/* Subroutine */ int zhetrf_aa_2stage_(char *uplo, integer *n, doublecomplex +/* Subroutine */ void zhetrf_aa_2stage_(char *uplo, integer *n, doublecomplex *a, integer *lda, doublecomplex *tb, integer *ltb, integer *ipiv, integer *ipiv2, doublecomplex *work, integer *lwork, integer *info) { @@ -689,14 +689,14 @@ aa_2stage.f"> */ integer ldtb, i__, j, k; extern logical lsame_(char *, char *); 
integer iinfo; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer i1; logical upper; integer i2; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, @@ -705,11 +705,13 @@ aa_2stage.f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zlacgv_(integer *, doublecomplex *, integer *) + extern /* Subroutine */ void zlacgv_(integer *, doublecomplex *, integer *) , zgbtrf_(integer *, integer *, integer *, integer *, - doublecomplex *, integer *, integer *, integer *), zgetrf_( + doublecomplex *, integer *, integer *, integer *); + extern int zgetrf_( integer *, integer *, doublecomplex *, integer *, integer *, - integer *), zlacpy_(char *, integer *, integer *, doublecomplex *, + integer *); + extern void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *), zhegst_(integer *, char *, @@ -760,7 +762,7 @@ aa_2stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRF_AA_2STAGE", &i__1, (ftnlen)16); - return 0; + return; } /* Answer the query */ @@ -778,13 +780,13 @@ aa_2stage.f"> */ } } if (tquery || wquery) { - return 0; + return; } /* Quick return */ if (*n == 0) { - return 0; + return; } /* Determine the number of the block size */ @@ -1279,7 +1281,7 @@ 
aa_2stage.f"> */ /* Factor the band matrix */ zgbtrf_(n, n, &nb, &nb, &tb[1], &ldtb, &ipiv2[1], info); - return 0; + return; /* End of ZHETRF_AA_2STAGE */ diff --git a/lapack-netlib/SRC/zhetrf_rk.c b/lapack-netlib/SRC/zhetrf_rk.c index 5e2148a7da..b924045792 100644 --- a/lapack-netlib/SRC/zhetrf_rk.c +++ b/lapack-netlib/SRC/zhetrf_rk.c @@ -774,7 +774,7 @@ rk.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zhetrf_rk_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zhetrf_rk_(char *uplo, integer *n, doublecomplex *a, integer *lda, doublecomplex *e, integer *ipiv, doublecomplex *work, integer *lwork, integer *info) { @@ -783,15 +783,15 @@ rk.f"> */ /* Local variables */ integer i__, k; - extern /* Subroutine */ int zhetf2_rk_(char *, integer *, doublecomplex * + extern /* Subroutine */ void zhetf2_rk_(char *, integer *, doublecomplex * , integer *, doublecomplex *, integer *, integer *); extern logical lsame_(char *, char *); integer nbmin, iinfo; - extern /* Subroutine */ int zlahef_rk_(char *, integer *, integer *, + extern /* Subroutine */ void zlahef_rk_(char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); logical upper; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer kb, nb, ip; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -848,9 +848,9 @@ rk.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRF_RK", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } nbmin = 2; @@ -1036,7 +1036,7 @@ rk.f"> */ } work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZHETRF_RK */ diff --git a/lapack-netlib/SRC/zhetrf_rook.c b/lapack-netlib/SRC/zhetrf_rook.c index 
9c2593c5c1..f4a2b2def6 100644 --- a/lapack-netlib/SRC/zhetrf_rook.c +++ b/lapack-netlib/SRC/zhetrf_rook.c @@ -728,7 +728,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zhetrf_rook_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zhetrf_rook_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *work, integer *lwork, integer *info) { @@ -746,10 +746,10 @@ rook.f"> */ integer *, integer *, ftnlen, ftnlen); integer ldwork, lwkopt; logical lquery; - extern /* Subroutine */ int zhetf2_rook_(char *, integer *, + extern /* Subroutine */ void zhetf2_rook_(char *, integer *, doublecomplex *, integer *, integer *, integer *); integer iws; - extern /* Subroutine */ int zlahef_rook_(char *, integer *, integer *, + extern /* Subroutine */ void zlahef_rook_(char *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); @@ -801,9 +801,9 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRF_ROOK", &i__1, (ftnlen)11); - return 0; + return; } else if (lquery) { - return 0; + return; } nbmin = 2; @@ -933,7 +933,7 @@ rook.f"> */ L40: work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZHETRF_ROOK */ diff --git a/lapack-netlib/SRC/zhetri.c b/lapack-netlib/SRC/zhetri.c index 264bccfdd2..9b7511b550 100644 --- a/lapack-netlib/SRC/zhetri.c +++ b/lapack-netlib/SRC/zhetri.c @@ -629,7 +629,7 @@ f"> */ /* > \ingroup complex16HEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zhetri_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zhetri_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *work, integer *info) { /* System generated locals */ @@ -646,11 +646,11 @@ f"> */ extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *, 
doublecomplex *, integer *, doublecomplex *, integer *); integer kstep; - extern /* Subroutine */ int zhemv_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhemv_(char *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal ak; @@ -690,13 +690,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -708,7 +708,7 @@ f"> */ for (*info = *n; *info >= 1; --(*info)) { i__1 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__1].r == 0. && a[i__1].i == 0.)) { - return 0; + return; } /* L10: */ } @@ -720,7 +720,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__2].r == 0. 
&& a[i__2].i == 0.)) { - return 0; + return; } /* L20: */ } @@ -1053,7 +1053,7 @@ f"> */ ; } - return 0; + return; /* End of ZHETRI */ diff --git a/lapack-netlib/SRC/zhetri2.c b/lapack-netlib/SRC/zhetri2.c index 0ccaf9fcf7..9dc0f8b540 100644 --- a/lapack-netlib/SRC/zhetri2.c +++ b/lapack-netlib/SRC/zhetri2.c @@ -642,7 +642,7 @@ static integer c_n1 = -1; /* > \ingroup complex16HEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zhetri2_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zhetri2_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *work, integer *lwork, integer *info) { @@ -650,7 +650,7 @@ static integer c_n1 = -1; integer a_dim1, a_offset, i__1; /* Local variables */ - extern /* Subroutine */ int zhetri2x_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhetri2x_(char *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); extern logical lsame_(char *, char *); integer nbmax; @@ -658,7 +658,7 @@ static integer c_n1 = -1; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zhetri_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhetri_(char *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *); logical lquery; integer minsize; @@ -711,13 +711,13 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRI2", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { work[1].r = (doublereal) minsize, work[1].i = 0.; - return 0; + return; } if (*n == 0) { - return 0; + return; } if (nbmax >= *n) { zhetri_(uplo, n, &a[a_offset], lda, &ipiv[1], &work[1], info); @@ -725,7 +725,7 @@ static integer c_n1 = -1; zhetri2x_(uplo, n, &a[a_offset], lda, &ipiv[1], &work[1], &nbmax, 
info); } - return 0; + return; /* End of ZHETRI2 */ diff --git a/lapack-netlib/SRC/zhetri2x.c b/lapack-netlib/SRC/zhetri2x.c index 2fcf271734..fa41c2a5d2 100644 --- a/lapack-netlib/SRC/zhetri2x.c +++ b/lapack-netlib/SRC/zhetri2x.c @@ -635,7 +635,7 @@ x.f"> */ /* > \ingroup complex16HEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zhetri2x_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zhetri2x_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *work, integer *nb, integer *info) { @@ -648,31 +648,32 @@ x.f"> */ /* Local variables */ integer invd; doublecomplex akkp1; - extern /* Subroutine */ int zheswapr_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zheswapr_(char *, integer *, doublecomplex *, integer *, integer *, integer *); doublecomplex d__; integer i__, j, k; doublecomplex t; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer count; logical upper; - extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrmm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex ak, u01_i_j__; integer u11; doublecomplex u11_i_j__; integer ip; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), ztrtri_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int ztrtri_( char *, char *, integer *, doublecomplex *, integer *, integer *); integer nnb, cut; doublecomplex akp1, u01_ip1_j__, u11_ip1_j__; - extern /* Subroutine */ int zsyconv_(char *, char *, integer *, + extern /* Subroutine 
*/ void zsyconv_(char *, char *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *); @@ -713,10 +714,10 @@ x.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRI2X", &i__1, (ftnlen)8); - return 0; + return; } if (*n == 0) { - return 0; + return; } /* Convert A */ @@ -734,7 +735,7 @@ x.f"> */ for (*info = *n; *info >= 1; --(*info)) { i__1 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__1].r == 0. && a[i__1].i == 0.)) { - return 0; + return; } } } else { @@ -745,7 +746,7 @@ x.f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__2].r == 0. && a[i__2].i == 0.)) { - return 0; + return; } } } @@ -1388,7 +1389,7 @@ x.f"> */ } } - return 0; + return; /* End of ZHETRI2X */ diff --git a/lapack-netlib/SRC/zhetri_3.c b/lapack-netlib/SRC/zhetri_3.c index 64d52d94cf..47d2c63311 100644 --- a/lapack-netlib/SRC/zhetri_3.c +++ b/lapack-netlib/SRC/zhetri_3.c @@ -683,7 +683,7 @@ static integer c_n1 = -1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zhetri_3_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zhetri_3_(char *uplo, integer *n, doublecomplex *a, integer *lda, doublecomplex *e, integer *ipiv, doublecomplex *work, integer *lwork, integer *info) { @@ -691,7 +691,7 @@ static integer c_n1 = -1; integer a_dim1, a_offset, i__1, i__2; /* Local variables */ - extern /* Subroutine */ int zhetri_3x_(char *, integer *, doublecomplex * + extern /* Subroutine */ void zhetri_3x_(char *, integer *, doublecomplex * , integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); extern logical lsame_(char *, char *); @@ -749,16 +749,16 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRI_3", &i__1, (ftnlen)8); - return 0; + return; } else if (lquery) { work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; } /* Quick return if possible */ if (*n == 
0) { - return 0; + return; } zhetri_3x_(uplo, n, &a[a_offset], lda, &e[1], &ipiv[1], &work[1], &nb, @@ -766,7 +766,7 @@ static integer c_n1 = -1; work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZHETRI_3 */ diff --git a/lapack-netlib/SRC/zhetri_3x.c b/lapack-netlib/SRC/zhetri_3x.c index 74e143e198..114ae02d5e 100644 --- a/lapack-netlib/SRC/zhetri_3x.c +++ b/lapack-netlib/SRC/zhetri_3x.c @@ -674,7 +674,7 @@ static doublecomplex c_b2 = {0.,0.}; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zhetri_3x_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zhetri_3x_(char *uplo, integer *n, doublecomplex *a, integer *lda, doublecomplex *e, integer *ipiv, doublecomplex *work, integer *nb, integer *info) { @@ -687,18 +687,18 @@ static doublecomplex c_b2 = {0.,0.}; /* Local variables */ integer invd; doublecomplex akkp1; - extern /* Subroutine */ int zheswapr_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zheswapr_(char *, integer *, doublecomplex *, integer *, integer *, integer *); doublecomplex d__; integer i__, j, k; doublereal t; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrmm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal ak; @@ -752,10 +752,10 @@ static doublecomplex c_b2 = {0.,0.}; if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRI_3X", &i__1, (ftnlen)9); - return 0; + return; } if (*n == 0) { - return 0; + return; } /* Workspace got Non-diag elements of 
D */ @@ -776,7 +776,7 @@ static doublecomplex c_b2 = {0.,0.}; for (*info = *n; *info >= 1; --(*info)) { i__1 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__1].r == 0. && a[i__1].i == 0.)) { - return 0; + return; } } } else { @@ -787,7 +787,7 @@ static doublecomplex c_b2 = {0.,0.}; for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__2].r == 0. && a[i__2].i == 0.)) { - return 0; + return; } } } @@ -1429,7 +1429,7 @@ static doublecomplex c_b2 = {0.,0.}; } - return 0; + return; /* End of ZHETRI_3X */ diff --git a/lapack-netlib/SRC/zhetri_rook.c b/lapack-netlib/SRC/zhetri_rook.c index 8ebcb1ba1f..1897f04b31 100644 --- a/lapack-netlib/SRC/zhetri_rook.c +++ b/lapack-netlib/SRC/zhetri_rook.c @@ -643,7 +643,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zhetri_rook_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zhetri_rook_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *work, integer *info) { /* System generated locals */ @@ -660,11 +660,11 @@ rook.f"> */ extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer kstep; - extern /* Subroutine */ int zhemv_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhemv_(char *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal ak; @@ -704,13 +704,13 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRI_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Quick return if 
possible */ if (*n == 0) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -722,7 +722,7 @@ rook.f"> */ for (*info = *n; *info >= 1; --(*info)) { i__1 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__1].r == 0. && a[i__1].i == 0.)) { - return 0; + return; } /* L10: */ } @@ -734,7 +734,7 @@ rook.f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__2].r == 0. && a[i__2].i == 0.)) { - return 0; + return; } /* L20: */ } @@ -1233,7 +1233,7 @@ rook.f"> */ ; } - return 0; + return; /* End of ZHETRI_ROOK */ diff --git a/lapack-netlib/SRC/zhetrs.c b/lapack-netlib/SRC/zhetrs.c index 0340dd1496..a732bb7f95 100644 --- a/lapack-netlib/SRC/zhetrs.c +++ b/lapack-netlib/SRC/zhetrs.c @@ -635,7 +635,7 @@ f"> */ /* > \ingroup complex16HEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zhetrs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zhetrs_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *b, integer *ldb, integer *info) { @@ -649,17 +649,18 @@ f"> */ doublereal s; extern logical lsame_(char *, char *); doublecomplex denom; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int zgeru_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeru_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex ak, bk; integer kp; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, 
integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *), zlacgv_( integer *, doublecomplex *, integer *); doublecomplex akm1, bkm1; @@ -700,13 +701,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -1079,7 +1080,7 @@ f"> */ ; } - return 0; + return; /* End of ZHETRS */ diff --git a/lapack-netlib/SRC/zhetrs2.c b/lapack-netlib/SRC/zhetrs2.c index e196ba95ab..fe49e271c3 100644 --- a/lapack-netlib/SRC/zhetrs2.c +++ b/lapack-netlib/SRC/zhetrs2.c @@ -640,7 +640,7 @@ static doublecomplex c_b1 = {1.,0.}; /* > \ingroup complex16HEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zhetrs2_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zhetrs2_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *b, integer *ldb, doublecomplex *work, integer *info) { @@ -656,16 +656,17 @@ static doublecomplex c_b1 = {1.,0.}; doublecomplex denom; integer iinfo; logical upper; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), ztrsm_(char *, char *, char *, char * , integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex ak, bk; integer kp; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); doublecomplex akm1, bkm1; - extern /* Subroutine */ int zsyconv_(char *, char *, integer *, + extern /* Subroutine */ void zsyconv_(char *, char *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *); @@ -705,13 
+706,13 @@ static doublecomplex c_b1 = {1.,0.}; if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRS2", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* Convert A */ @@ -938,7 +939,7 @@ static doublecomplex c_b1 = {1.,0.}; zsyconv_(uplo, "R", n, &a[a_offset], lda, &ipiv[1], &work[1], &iinfo); - return 0; + return; /* End of ZHETRS2 */ diff --git a/lapack-netlib/SRC/zhetrs_3.c b/lapack-netlib/SRC/zhetrs_3.c index 4db247a9c4..d8160301e0 100644 --- a/lapack-netlib/SRC/zhetrs_3.c +++ b/lapack-netlib/SRC/zhetrs_3.c @@ -678,7 +678,7 @@ static doublecomplex c_b1 = {1.,0.}; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zhetrs_3_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zhetrs_3_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *e, integer *ipiv, doublecomplex *b, integer *ldb, integer *info) { @@ -693,13 +693,14 @@ static doublecomplex c_b1 = {1.,0.}; extern logical lsame_(char *, char *); doublecomplex denom; logical upper; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), ztrsm_(char *, char *, char *, char * , integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex ak, bk; integer kp; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); doublecomplex akm1, bkm1; @@ -740,13 +741,13 @@ static doublecomplex c_b1 = {1.,0.}; if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRS_3", &i__1, (ftnlen)8); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -940,7 
+941,7 @@ static doublecomplex c_b1 = {1.,0.}; } - return 0; + return; /* End of ZHETRS_3 */ diff --git a/lapack-netlib/SRC/zhetrs_aa.c b/lapack-netlib/SRC/zhetrs_aa.c index 63b4e07b9d..bdebbd6b3e 100644 --- a/lapack-netlib/SRC/zhetrs_aa.c +++ b/lapack-netlib/SRC/zhetrs_aa.c @@ -645,7 +645,7 @@ aa.f"> */ /* > \ingroup complex16HEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zhetrs_aa_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zhetrs_aa_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *b, integer *ldb, doublecomplex *work, integer *lwork, integer *info) { @@ -656,14 +656,15 @@ aa.f"> */ integer k; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zgtsv_(integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex * , integer *, integer *), ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); integer kp; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlacgv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlacgv_( integer *, doublecomplex *, integer *), zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer * ); @@ -715,17 +716,17 @@ aa.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRS_AA", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { lwkopt = *n * 3 - 2; work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -861,7 +862,7 @@ aa.f"> */ } - return 0; + return; /* End of ZHETRS_AA */ diff --git 
a/lapack-netlib/SRC/zhetrs_aa_2stage.c b/lapack-netlib/SRC/zhetrs_aa_2stage.c index 75879e0ccf..efcebead3e 100644 --- a/lapack-netlib/SRC/zhetrs_aa_2stage.c +++ b/lapack-netlib/SRC/zhetrs_aa_2stage.c @@ -656,7 +656,7 @@ aa_2stage.f"> */ /* > \ingroup complex16SYcomputational */ /* ===================================================================== */ -/* Subroutine */ int zhetrs_aa_2stage_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zhetrs_aa_2stage_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *tb, integer *ltb, integer *ipiv, integer *ipiv2, doublecomplex *b, integer *ldb, integer *info) @@ -668,13 +668,15 @@ aa_2stage.f"> */ integer ldtb; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); integer nb; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zgbtrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zgbtrs_( char *, integer *, integer *, integer *, integer *, doublecomplex - *, integer *, integer *, doublecomplex *, integer *, integer *), zlaswp_(integer *, doublecomplex *, integer *, integer *, + *, integer *, integer *, doublecomplex *, integer *, integer *); + extern int zlaswp_(integer *, doublecomplex *, integer *, integer *, integer *, integer *, integer *); @@ -718,13 +720,13 @@ aa_2stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRS_AA_2STAGE", &i__1, (ftnlen)16); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* Read NB and compute LDTB */ @@ -809,7 +811,7 @@ aa_2stage.f"> */ } } - return 0; + return; /* End of ZHETRS_AA_2STAGE */ diff --git a/lapack-netlib/SRC/zhetrs_rook.c b/lapack-netlib/SRC/zhetrs_rook.c index 9ecb2b6644..f6f814ea9c 
100644 --- a/lapack-netlib/SRC/zhetrs_rook.c +++ b/lapack-netlib/SRC/zhetrs_rook.c @@ -651,7 +651,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zhetrs_rook_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zhetrs_rook_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *b, integer *ldb, integer *info) { @@ -665,17 +665,18 @@ rook.f"> */ doublereal s; extern logical lsame_(char *, char *); doublecomplex denom; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int zgeru_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeru_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex ak, bk; integer kp; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *), zlacgv_( integer *, doublecomplex *, integer *); doublecomplex akm1, bkm1; @@ -716,13 +717,13 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHETRS_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -1117,7 +1118,7 @@ rook.f"> */ ; } - return 0; + return; /* End of ZHETRS_ROOK */ diff --git a/lapack-netlib/SRC/zhfrk.c b/lapack-netlib/SRC/zhfrk.c index 2decb3c4bb..efc567c3ae 100644 --- a/lapack-netlib/SRC/zhfrk.c +++ b/lapack-netlib/SRC/zhfrk.c @@ -676,7 +676,7 @@ static inline void 
zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zhfrk_(char *transr, char *uplo, char *trans, integer *n, +/* Subroutine */ void zhfrk_(char *transr, char *uplo, char *trans, integer *n, integer *k, doublereal *alpha, doublecomplex *a, integer *lda, doublereal *beta, doublecomplex *c__) { @@ -689,7 +689,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ doublecomplex cbeta; logical normaltransr; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zherk_(char *, char *, integer *, @@ -750,7 +750,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (info != 0) { i__1 = -info; xerbla_("ZHFRK ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ @@ -759,7 +759,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* done (it is in ZHERK for example) and left in the general case. */ if (*n == 0 || (*alpha == 0. || *k == 0) && *beta == 1.) { - return 0; + return; } if (*alpha == 0. && *beta == 0.) 
{ @@ -768,7 +768,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ i__2 = j; c__[i__2].r = 0., c__[i__2].i = 0.; } - return 0; + return; } z__1.r = *alpha, z__1.i = 0.; @@ -1081,7 +1081,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of ZHFRK */ diff --git a/lapack-netlib/SRC/zhgeqz.c b/lapack-netlib/SRC/zhgeqz.c index 4cb70385ba..a34d504262 100644 --- a/lapack-netlib/SRC/zhgeqz.c +++ b/lapack-netlib/SRC/zhgeqz.c @@ -799,7 +799,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zhgeqz_(char *job, char *compq, char *compz, integer *n, +/* Subroutine */ void zhgeqz_(char *job, char *compq, char *compz, integer *n, integer *ilo, integer *ihi, doublecomplex *h__, integer *ldh, doublecomplex *t, integer *ldt, doublecomplex *alpha, doublecomplex * beta, doublecomplex *q, integer *ldq, doublecomplex *z__, integer * @@ -814,7 +814,7 @@ f"> */ /* Local variables */ doublereal absb, atol, btol, temp; - extern /* Subroutine */ int zrot_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zrot_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *); doublereal temp2, c__; integer j; @@ -825,7 +825,7 @@ f"> */ doublereal anorm, bnorm; integer maxit; doublecomplex shift; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); doublereal tempr; doublecomplex ctemp2, ctemp3; @@ -848,10 +848,10 @@ f"> */ doublereal *); logical ilazro; integer icompz, ifirst; - extern /* Subroutine */ int zlartg_(doublecomplex *, doublecomplex *, + extern /* Subroutine */ void zlartg_(doublecomplex *, doublecomplex *, doublereal *, doublecomplex *, doublecomplex *); integer ifrstm; - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, 
+ extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); integer istart; logical lquery; @@ -963,9 +963,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHGEQZ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -973,7 +973,7 @@ f"> */ /* WORK( 1 ) = CMPLX( 1 ) */ if (*n <= 0) { work[1].r = 1., work[1].i = 0.; - return 0; + return; } /* Initialize Q and Z */ @@ -1751,7 +1751,7 @@ f"> */ L210: z__1.r = (doublereal) (*n), z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; /* End of ZHGEQZ */ diff --git a/lapack-netlib/SRC/zhgeqz.f b/lapack-netlib/SRC/zhgeqz.f index 302b69f347..c15e7aace4 100644 --- a/lapack-netlib/SRC/zhgeqz.f +++ b/lapack-netlib/SRC/zhgeqz.f @@ -524,9 +524,7 @@ SUBROUTINE ZHGEQZ( JOB, COMPQ, COMPZ, N, ILO, IHI, H, LDH, T, LDT, END IF END IF * - IF( ABS( T( ILAST, ILAST ) ).LE.MAX( SAFMIN, ULP*( - $ ABS( T( ILAST - 1, ILAST ) ) + ABS( T( ILAST-1, ILAST-1 ) - $ ) ) ) ) THEN + IF( ABS( T( ILAST, ILAST ) ).LE.BTOL ) THEN T( ILAST, ILAST ) = CZERO GO TO 50 END IF @@ -552,10 +550,7 @@ SUBROUTINE ZHGEQZ( JOB, COMPQ, COMPZ, N, ILO, IHI, H, LDH, T, LDT, * * Test 2: for T(j,j)=0 * - TEMP = ABS ( T( J, J + 1 ) ) - IF ( J .GT. 
ILO ) - $ TEMP = TEMP + ABS ( T( J - 1, J ) ) - IF( ABS( T( J, J ) ).LT.MAX( SAFMIN,ULP*TEMP ) ) THEN + IF( ABS( T( J, J ) ).LT.BTOL ) THEN T( J, J ) = CZERO * * Test 1a: Check for 2 consecutive small subdiagonals in A diff --git a/lapack-netlib/SRC/zhpcon.c b/lapack-netlib/SRC/zhpcon.c index 80ed469afa..aef8edf974 100644 --- a/lapack-netlib/SRC/zhpcon.c +++ b/lapack-netlib/SRC/zhpcon.c @@ -631,7 +631,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zhpcon_(char *uplo, integer *n, doublecomplex *ap, +/* Subroutine */ void zhpcon_(char *uplo, integer *n, doublecomplex *ap, integer *ipiv, doublereal *anorm, doublereal *rcond, doublecomplex * work, integer *info) { @@ -643,12 +643,12 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, doublecomplex *, doublereal *, integer *, integer *); integer ip; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; - extern /* Subroutine */ int zhptrs_(char *, integer *, integer *, + extern /* Subroutine */ void zhptrs_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -681,7 +681,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHPCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -689,9 +689,9 @@ f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm <= 0.) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -704,7 +704,7 @@ f"> */ for (i__ = *n; i__ >= 1; --i__) { i__1 = ip; if (ipiv[i__] > 0 && (ap[i__1].r == 0. 
&& ap[i__1].i == 0.)) { - return 0; + return; } ip -= i__; /* L10: */ @@ -718,7 +718,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { i__2 = ip; if (ipiv[i__] > 0 && (ap[i__2].r == 0. && ap[i__2].i == 0.)) { - return 0; + return; } ip = ip + *n - i__ + 1; /* L20: */ @@ -744,7 +744,7 @@ f"> */ *rcond = 1. / ainvnm / *anorm; } - return 0; + return; /* End of ZHPCON */ diff --git a/lapack-netlib/SRC/zhpev.c b/lapack-netlib/SRC/zhpev.c index a228657ac9..b3f42a19fc 100644 --- a/lapack-netlib/SRC/zhpev.c +++ b/lapack-netlib/SRC/zhpev.c @@ -651,7 +651,7 @@ atrices */ /* > \ingroup complex16OTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int zhpev_(char *jobz, char *uplo, integer *n, doublecomplex +/* Subroutine */ void zhpev_(char *jobz, char *uplo, integer *n, doublecomplex *ap, doublereal *w, doublecomplex *z__, integer *ldz, doublecomplex * work, doublereal *rwork, integer *info) { @@ -664,7 +664,7 @@ atrices */ doublereal anrm; integer imax; doublereal rmin, rmax; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); @@ -673,17 +673,18 @@ atrices */ extern doublereal dlamch_(char *); integer iscale; doublereal safmin; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); doublereal bignum; integer indtau; - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *); extern doublereal zlanhp_(char *, char *, integer *, doublecomplex *, doublereal *); integer indrwk, indwrk; doublereal smlnum; - extern /* Subroutine */ int zhptrd_(char *, integer *, doublecomplex *, + extern /* 
Subroutine */ void zhptrd_(char *, integer *, doublecomplex *, doublereal *, doublereal *, doublecomplex *, integer *), zsteqr_(char *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublereal *, integer *), @@ -730,13 +731,13 @@ atrices */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHPEV ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -746,7 +747,7 @@ atrices */ i__1 = z_dim1 + 1; z__[i__1].r = 1., z__[i__1].i = 0.; } - return 0; + return; } /* Get machine constants. */ @@ -806,7 +807,7 @@ atrices */ dscal_(&imax, &d__1, &w[1], &c__1); } - return 0; + return; /* End of ZHPEV */ diff --git a/lapack-netlib/SRC/zhpevd.c b/lapack-netlib/SRC/zhpevd.c index bfcf5a8a08..651d252670 100644 --- a/lapack-netlib/SRC/zhpevd.c +++ b/lapack-netlib/SRC/zhpevd.c @@ -713,7 +713,7 @@ f"> */ /* > \ingroup complex16OTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int zhpevd_(char *jobz, char *uplo, integer *n, +/* Subroutine */ void zhpevd_(char *jobz, char *uplo, integer *n, doublecomplex *ap, doublereal *w, doublecomplex *z__, integer *ldz, doublecomplex *work, integer *lwork, doublereal *rwork, integer * lrwork, integer *iwork, integer *liwork, integer *info) @@ -727,7 +727,7 @@ f"> */ doublereal anrm; integer imax; doublereal rmin, rmax; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); @@ -736,24 +736,25 @@ f"> */ extern doublereal dlamch_(char *); integer iscale; doublereal safmin; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); doublereal bignum; integer indtau; - extern /* 
Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *); extern doublereal zlanhp_(char *, char *, integer *, doublecomplex *, doublereal *); - extern /* Subroutine */ int zstedc_(char *, integer *, doublereal *, + extern /* Subroutine */ void zstedc_(char *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, integer *, integer *, integer *, integer *); integer indrwk, indwrk, liwmin, lrwmin; doublereal smlnum; - extern /* Subroutine */ int zhptrd_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhptrd_(char *, integer *, doublecomplex *, doublereal *, doublereal *, doublecomplex *, integer *); logical lquery; - extern /* Subroutine */ int zupmtr_(char *, char *, char *, integer *, + extern /* Subroutine */ void zupmtr_(char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal eps; @@ -830,15 +831,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHPEVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -847,7 +848,7 @@ f"> */ i__1 = z_dim1 + 1; z__[i__1].r = 1., z__[i__1].i = 0.; } - return 0; + return; } /* Get machine constants. 
*/ @@ -913,7 +914,7 @@ f"> */ work[1].r = (doublereal) lwmin, work[1].i = 0.; rwork[1] = (doublereal) lrwmin; iwork[1] = liwmin; - return 0; + return; /* End of ZHPEVD */ diff --git a/lapack-netlib/SRC/zhpevx.c b/lapack-netlib/SRC/zhpevx.c index f2ac4d49fe..289d788909 100644 --- a/lapack-netlib/SRC/zhpevx.c +++ b/lapack-netlib/SRC/zhpevx.c @@ -752,7 +752,7 @@ f"> */ /* > \ingroup complex16OTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int zhpevx_(char *jobz, char *range, char *uplo, integer *n, +/* Subroutine */ void zhpevx_(char *jobz, char *range, char *uplo, integer *n, doublecomplex *ap, doublereal *vl, doublereal *vu, integer *il, integer *iu, doublereal *abstol, integer *m, doublereal *w, doublecomplex *z__, integer *ldz, doublecomplex *work, doublereal * @@ -769,16 +769,16 @@ f"> */ doublereal rmin, rmax; logical test; integer itmp1, i__, j, indee; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal sigma; extern logical lsame_(char *, char *); integer iinfo; char order[1]; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); logical wantz; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer jj; extern doublereal dlamch_(char *); @@ -786,11 +786,12 @@ f"> */ integer iscale, indibl; logical valeig; doublereal safmin; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); doublereal abstll, bignum; integer indiwk, indisp, indtau; - extern /* Subroutine */ int 
dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *), dstebz_(char *, char *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, @@ -799,7 +800,7 @@ f"> */ doublereal *); integer indrwk, indwrk, nsplit; doublereal smlnum; - extern /* Subroutine */ int zhptrd_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhptrd_(char *, integer *, doublecomplex *, doublereal *, doublereal *, doublecomplex *, integer *), zstein_(integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, doublecomplex *, integer *, @@ -873,14 +874,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHPEVX", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -897,7 +898,7 @@ f"> */ i__1 = z_dim1 + 1; z__[i__1].r = 1., z__[i__1].i = 0.; } - return 0; + return; } /* Get machine constants. */ @@ -1066,7 +1067,7 @@ f"> */ } } - return 0; + return; /* End of ZHPEVX */ diff --git a/lapack-netlib/SRC/zhpevx.f b/lapack-netlib/SRC/zhpevx.f index f22e84bd70..0fc80c51a4 100644 --- a/lapack-netlib/SRC/zhpevx.f +++ b/lapack-netlib/SRC/zhpevx.f @@ -264,7 +264,7 @@ SUBROUTINE ZHPEVX( JOBZ, RANGE, UPLO, N, AP, VL, VU, IL, IU, * .. Local Scalars .. 
LOGICAL ALLEIG, INDEIG, TEST, VALEIG, WANTZ CHARACTER ORDER - INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL, + INTEGER I, IINFO, IMAX, INDD, INDE, INDEE, $ INDISP, INDIWK, INDRWK, INDTAU, INDWRK, ISCALE, $ ITMP1, J, JJ, NSPLIT DOUBLE PRECISION ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN, @@ -434,17 +434,16 @@ SUBROUTINE ZHPEVX( JOBZ, RANGE, UPLO, N, AP, VL, VU, IL, IU, ELSE ORDER = 'E' END IF - INDIBL = 1 - INDISP = INDIBL + N + INDISP = 1 + N INDIWK = INDISP + N CALL DSTEBZ( RANGE, ORDER, N, VLL, VUU, IL, IU, ABSTLL, $ RWORK( INDD ), RWORK( INDE ), M, NSPLIT, W, - $ IWORK( INDIBL ), IWORK( INDISP ), RWORK( INDRWK ), + $ IWORK( 1 ), IWORK( INDISP ), RWORK( INDRWK ), $ IWORK( INDIWK ), INFO ) * IF( WANTZ ) THEN CALL ZSTEIN( N, RWORK( INDD ), RWORK( INDE ), M, W, - $ IWORK( INDIBL ), IWORK( INDISP ), Z, LDZ, + $ IWORK( 1 ), IWORK( INDISP ), Z, LDZ, $ RWORK( INDRWK ), IWORK( INDIWK ), IFAIL, INFO ) * * Apply unitary matrix used in reduction to tridiagonal @@ -482,11 +481,11 @@ SUBROUTINE ZHPEVX( JOBZ, RANGE, UPLO, N, AP, VL, VU, IL, IU, 30 CONTINUE * IF( I.NE.0 ) THEN - ITMP1 = IWORK( INDIBL+I-1 ) + ITMP1 = IWORK( 1 + I-1 ) W( I ) = W( J ) - IWORK( INDIBL+I-1 ) = IWORK( INDIBL+J-1 ) + IWORK( 1 + I-1 ) = IWORK( 1 + J-1 ) W( J ) = TMP1 - IWORK( INDIBL+J-1 ) = ITMP1 + IWORK( 1 + J-1 ) = ITMP1 CALL ZSWAP( N, Z( 1, I ), 1, Z( 1, J ), 1 ) IF( INFO.NE.0 ) THEN ITMP1 = IFAIL( I ) diff --git a/lapack-netlib/SRC/zhpgst.c b/lapack-netlib/SRC/zhpgst.c index 83ccf19041..55bd84a5e8 100644 --- a/lapack-netlib/SRC/zhpgst.c +++ b/lapack-netlib/SRC/zhpgst.c @@ -627,7 +627,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zhpgst_(integer *itype, char *uplo, integer *n, +/* Subroutine */ void zhpgst_(integer *itype, char *uplo, integer *n, doublecomplex *ap, doublecomplex *bp, integer *info) { /* System generated locals */ @@ -636,7 +636,7 @@ f"> */ doublecomplex z__1, z__2, 
z__3; /* Local variables */ - extern /* Subroutine */ int zhpr2_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhpr2_(char *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *); integer j, k; @@ -645,7 +645,7 @@ f"> */ doublecomplex *, integer *, doublecomplex *, integer *); logical upper; integer j1, k1; - extern /* Subroutine */ int zhpmv_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhpmv_(char *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, @@ -654,7 +654,8 @@ f"> */ , doublecomplex *, integer *); integer jj, kk; doublecomplex ct; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); doublereal ajj; integer j1j1; @@ -691,7 +692,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHPGST", &i__1, (ftnlen)6); - return 0; + return; } if (*itype == 1) { @@ -855,7 +856,7 @@ f"> */ } } } - return 0; + return; /* End of ZHPGST */ diff --git a/lapack-netlib/SRC/zhpgv.c b/lapack-netlib/SRC/zhpgv.c index 84b3c86a74..915cdc5263 100644 --- a/lapack-netlib/SRC/zhpgv.c +++ b/lapack-netlib/SRC/zhpgv.c @@ -677,7 +677,7 @@ static integer c__1 = 1; /* > \ingroup complex16OTHEReigen */ /* ===================================================================== */ -/* Subroutine */ int zhpgv_(integer *itype, char *jobz, char *uplo, integer * +/* Subroutine */ void zhpgv_(integer *itype, char *jobz, char *uplo, integer * n, doublecomplex *ap, doublecomplex *bp, doublereal *w, doublecomplex *z__, integer *ldz, doublecomplex *work, doublereal *rwork, integer * info) @@ -690,14 +690,15 @@ static integer c__1 = 1; extern logical lsame_(char *, char *); char trans[1]; logical upper; 
- extern /* Subroutine */ int zhpev_(char *, char *, integer *, + extern /* Subroutine */ void zhpev_(char *, char *, integer *, doublecomplex *, doublereal *, doublecomplex *, integer *, doublecomplex *, doublereal *, integer *); logical wantz; - extern /* Subroutine */ int ztpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ztpmv_(char *, char *, char *, integer *, doublecomplex *, doublecomplex *, integer *), ztpsv_(char *, char *, char *, integer *, doublecomplex * - , doublecomplex *, integer *), xerbla_( - char *, integer *, ftnlen), zhpgst_(integer *, char *, integer *, + , doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zhpgst_(integer *, char *, integer *, doublecomplex *, doublecomplex *, integer *), zpptrf_( char *, integer *, doublecomplex *, integer *); @@ -742,13 +743,13 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("ZHPGV ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. */ @@ -756,7 +757,7 @@ static integer c__1 = 1; zpptrf_(uplo, n, &bp[1], info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. */ @@ -810,7 +811,7 @@ static integer c__1 = 1; } } } - return 0; + return; /* End of ZHPGV */ diff --git a/lapack-netlib/SRC/zhpgvd.c b/lapack-netlib/SRC/zhpgvd.c index fe28d388c6..41edf776e5 100644 --- a/lapack-netlib/SRC/zhpgvd.c +++ b/lapack-netlib/SRC/zhpgvd.c @@ -743,7 +743,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. 
of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int zhpgvd_(integer *itype, char *jobz, char *uplo, integer * +/* Subroutine */ void zhpgvd_(integer *itype, char *jobz, char *uplo, integer * n, doublecomplex *ap, doublecomplex *bp, doublereal *w, doublecomplex *z__, integer *ldz, doublecomplex *work, integer *lwork, doublereal * rwork, integer *lrwork, integer *iwork, integer *liwork, integer * @@ -759,20 +759,20 @@ f"> */ integer lwmin; char trans[1]; logical upper, wantz; - extern /* Subroutine */ int ztpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ztpmv_(char *, char *, char *, integer *, doublecomplex *, doublecomplex *, integer *), ztpsv_(char *, char *, char *, integer *, doublecomplex * - , doublecomplex *, integer *), xerbla_( - char *, integer *, ftnlen); + , doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer liwmin; - extern /* Subroutine */ int zhpevd_(char *, char *, integer *, + extern /* Subroutine */ void zhpevd_(char *, char *, integer *, doublecomplex *, doublereal *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, integer *, integer *, integer *, integer *); integer lrwmin; - extern /* Subroutine */ int zhpgst_(integer *, char *, integer *, + extern /* Subroutine */ void zhpgst_(integer *, char *, integer *, doublecomplex *, doublecomplex *, integer *); logical lquery; - extern /* Subroutine */ int zpptrf_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zpptrf_(char *, integer *, doublecomplex *, integer *); @@ -850,15 +850,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHPGVD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. 
*/ @@ -866,7 +866,7 @@ f"> */ zpptrf_(uplo, n, &bp[1], info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. */ @@ -933,7 +933,7 @@ f"> */ work[1].r = (doublereal) lwmin, work[1].i = 0.; rwork[1] = (doublereal) lrwmin; iwork[1] = liwmin; - return 0; + return; /* End of ZHPGVD */ diff --git a/lapack-netlib/SRC/zhpgvd.f b/lapack-netlib/SRC/zhpgvd.f index d27cdc761d..e96e397384 100644 --- a/lapack-netlib/SRC/zhpgvd.f +++ b/lapack-netlib/SRC/zhpgvd.f @@ -335,9 +335,9 @@ SUBROUTINE ZHPGVD( ITYPE, JOBZ, UPLO, N, AP, BP, W, Z, LDZ, WORK, CALL ZHPGST( ITYPE, UPLO, N, AP, BP, INFO ) CALL ZHPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK, RWORK, $ LRWORK, IWORK, LIWORK, INFO ) - LWMIN = MAX( DBLE( LWMIN ), DBLE( WORK( 1 ) ) ) - LRWMIN = MAX( DBLE( LRWMIN ), DBLE( RWORK( 1 ) ) ) - LIWMIN = MAX( DBLE( LIWMIN ), DBLE( IWORK( 1 ) ) ) + LWMIN = INT( MAX( DBLE( LWMIN ), DBLE( WORK( 1 ) ) ) ) + LRWMIN = INT( MAX( DBLE( LRWMIN ), DBLE( RWORK( 1 ) ) ) ) + LIWMIN = INT( MAX( DBLE( LIWMIN ), DBLE( IWORK( 1 ) ) ) ) * IF( WANTZ ) THEN * diff --git a/lapack-netlib/SRC/zhpgvx.c b/lapack-netlib/SRC/zhpgvx.c index bb344085ed..a5bc14dfb1 100644 --- a/lapack-netlib/SRC/zhpgvx.c +++ b/lapack-netlib/SRC/zhpgvx.c @@ -788,7 +788,7 @@ f"> */ /* > Mark Fahey, Department of Mathematics, Univ. 
of Kentucky, USA */ /* ===================================================================== */ -/* Subroutine */ int zhpgvx_(integer *itype, char *jobz, char *range, char * +/* Subroutine */ void zhpgvx_(integer *itype, char *jobz, char *range, char * uplo, integer *n, doublecomplex *ap, doublecomplex *bp, doublereal * vl, doublereal *vu, integer *il, integer *iu, doublereal *abstol, integer *m, doublereal *w, doublecomplex *z__, integer *ldz, @@ -803,11 +803,12 @@ f"> */ extern logical lsame_(char *, char *); char trans[1]; logical upper, wantz; - extern /* Subroutine */ int ztpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ztpmv_(char *, char *, char *, integer *, doublecomplex *, doublecomplex *, integer *), ztpsv_(char *, char *, char *, integer *, doublecomplex * , doublecomplex *, integer *); logical alleig, indeig, valeig; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zhpgst_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zhpgst_( integer *, char *, integer *, doublecomplex *, doublecomplex *, integer *), zhpevx_(char *, char *, char *, integer *, doublecomplex *, doublereal *, doublereal *, integer *, integer *, @@ -879,13 +880,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHPGVX", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Form a Cholesky factorization of B. */ @@ -893,7 +894,7 @@ f"> */ zpptrf_(uplo, n, &bp[1], info); if (*info != 0) { *info = *n + *info; - return 0; + return; } /* Transform problem to standard eigenvalue problem and solve. 
*/ @@ -948,7 +949,7 @@ f"> */ } } - return 0; + return; /* End of ZHPGVX */ diff --git a/lapack-netlib/SRC/zhprfs.c b/lapack-netlib/SRC/zhprfs.c index 86cb5472c6..e4dcbafc3b 100644 --- a/lapack-netlib/SRC/zhprfs.c +++ b/lapack-netlib/SRC/zhprfs.c @@ -693,7 +693,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zhprfs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zhprfs_(char *uplo, integer *n, integer *nrhs, doublecomplex *ap, doublecomplex *afp, integer *ipiv, doublecomplex * b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *ferr, doublereal *berr, doublecomplex *work, doublereal *rwork, integer * @@ -712,7 +712,7 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3], count; logical upper; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zhpmv_(char *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zaxpy_( @@ -726,7 +726,7 @@ f"> */ doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal lstres; - extern /* Subroutine */ int zhptrs_(char *, integer *, integer *, + extern /* Subroutine */ void zhptrs_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); doublereal eps; @@ -774,7 +774,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHPRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -786,7 +786,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -1024,7 +1024,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of ZHPRFS */ diff --git a/lapack-netlib/SRC/zhpsv.c b/lapack-netlib/SRC/zhpsv.c index 
f59b08e9b5..dd075d9d72 100644 --- a/lapack-netlib/SRC/zhpsv.c +++ b/lapack-netlib/SRC/zhpsv.c @@ -671,7 +671,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zhpsv_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zhpsv_(char *uplo, integer *n, integer *nrhs, doublecomplex *ap, integer *ipiv, doublecomplex *b, integer *ldb, integer *info) { @@ -680,7 +680,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zhptrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zhptrf_( char *, integer *, doublecomplex *, integer *, integer *), zhptrs_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -718,7 +719,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZHPSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the factorization A = U*D*U**H or A = L*D*L**H. 
*/ @@ -731,7 +732,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ zhptrs_(uplo, n, nrhs, &ap[1], &ipiv[1], &b[b_offset], ldb, info); } - return 0; + return; /* End of ZHPSV */ diff --git a/lapack-netlib/SRC/zhpsvx.c b/lapack-netlib/SRC/zhpsvx.c index f845948e44..93af38a19c 100644 --- a/lapack-netlib/SRC/zhpsvx.c +++ b/lapack-netlib/SRC/zhpsvx.c @@ -787,7 +787,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zhpsvx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void zhpsvx_(char *fact, char *uplo, integer *n, integer * nrhs, doublecomplex *ap, doublecomplex *afp, integer *ipiv, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *rcond, doublereal *ferr, doublereal *berr, doublecomplex * @@ -799,14 +799,14 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); doublereal anorm; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern doublereal dlamch_(char *); logical nofact; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern doublereal zlanhp_(char *, char *, integer *, doublecomplex *, doublereal *); - extern /* Subroutine */ int zhpcon_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhpcon_(char *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, integer *), zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zhprfs_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, @@ -864,7 +864,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHPSVX", &i__1, (ftnlen)6); - return 0; + return; } if (nofact) { @@ -879,7 +879,7 @@ f"> */ if (*info > 0) { *rcond = 0.; - return 0; + return; } } @@ -908,7 +908,7 @@ f"> */ 
*info = *n + 1; } - return 0; + return; /* End of ZHPSVX */ diff --git a/lapack-netlib/SRC/zhptrd.c b/lapack-netlib/SRC/zhptrd.c index 3aabfb3486..e14d305a13 100644 --- a/lapack-netlib/SRC/zhptrd.c +++ b/lapack-netlib/SRC/zhptrd.c @@ -665,7 +665,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zhptrd_(char *uplo, integer *n, doublecomplex *ap, +/* Subroutine */ void zhptrd_(char *uplo, integer *n, doublecomplex *ap, doublereal *d__, doublereal *e, doublecomplex *tau, integer *info) { /* System generated locals */ @@ -675,7 +675,7 @@ f"> */ /* Local variables */ doublecomplex taui; - extern /* Subroutine */ int zhpr2_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhpr2_(char *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *); integer i__; @@ -685,13 +685,14 @@ f"> */ doublecomplex *, integer *, doublecomplex *, integer *); integer i1; logical upper; - extern /* Subroutine */ int zhpmv_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhpmv_(char *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); integer ii; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlarfg_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlarfg_( integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); integer i1i1; @@ -725,13 +726,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHPTRD", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } if (upper) { @@ -871,7 +872,7 @@ f"> */ d__[i__1] = ap[i__2].r; } - return 0; + return; /* End of ZHPTRD */ diff --git a/lapack-netlib/SRC/zhptrf.c b/lapack-netlib/SRC/zhptrf.c index 
7b259c2a98..31bfd9bc02 100644 --- a/lapack-netlib/SRC/zhptrf.c +++ b/lapack-netlib/SRC/zhptrf.c @@ -672,7 +672,7 @@ f"> */ /* > J. Lewis, Boeing Computer Services Company */ /* ===================================================================== */ -/* Subroutine */ int zhptrf_(char *uplo, integer *n, doublecomplex *ap, +/* Subroutine */ void zhptrf_(char *uplo, integer *n, doublecomplex *ap, integer *ipiv, integer *info) { /* System generated locals */ @@ -682,7 +682,7 @@ f"> */ /* Local variables */ integer imax, jmax; - extern /* Subroutine */ int zhpr_(char *, integer *, doublereal *, + extern /* Subroutine */ void zhpr_(char *, integer *, doublereal *, doublecomplex *, integer *, doublecomplex *); doublereal d__; integer i__, j, k; @@ -692,7 +692,7 @@ f"> */ integer kstep; logical upper; doublereal r1; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern doublereal dlapy2_(doublereal *, doublereal *); doublereal d11; @@ -704,7 +704,8 @@ f"> */ doublecomplex wk; integer kx; doublereal tt; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); doublereal colmax; extern integer izamax_(integer *, doublecomplex *, integer *); @@ -739,7 +740,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHPTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. 
*/ @@ -1357,7 +1358,7 @@ f"> */ } L110: - return 0; + return; /* End of ZHPTRF */ diff --git a/lapack-netlib/SRC/zhptri.c b/lapack-netlib/SRC/zhptri.c index 8cfb00456d..67266cb9b8 100644 --- a/lapack-netlib/SRC/zhptri.c +++ b/lapack-netlib/SRC/zhptri.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zhptri_(char *uplo, integer *n, doublecomplex *ap, +/* Subroutine */ void zhptri_(char *uplo, integer *n, doublecomplex *ap, integer *ipiv, doublecomplex *work, integer *info) { /* System generated locals */ @@ -641,7 +641,7 @@ f"> */ doublecomplex *, integer *, doublecomplex *, integer *); integer kstep; logical upper; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zhpmv_(char *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zswap_( @@ -681,13 +681,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHPTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -700,7 +700,7 @@ f"> */ for (*info = *n; *info >= 1; --(*info)) { i__1 = kp; if (ipiv[*info] > 0 && (ap[i__1].r == 0. && ap[i__1].i == 0.)) { - return 0; + return; } kp -= *info; /* L10: */ @@ -714,7 +714,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = kp; if (ipiv[*info] > 0 && (ap[i__2].r == 0. 
&& ap[i__2].i == 0.)) { - return 0; + return; } kp = kp + *n - *info + 1; /* L20: */ @@ -1052,7 +1052,7 @@ f"> */ ; } - return 0; + return; /* End of ZHPTRI */ diff --git a/lapack-netlib/SRC/zhptrs.c b/lapack-netlib/SRC/zhptrs.c index 4c3da08813..e000b267c5 100644 --- a/lapack-netlib/SRC/zhptrs.c +++ b/lapack-netlib/SRC/zhptrs.c @@ -630,7 +630,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zhptrs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zhptrs_(char *uplo, integer *n, integer *nrhs, doublecomplex *ap, integer *ipiv, doublecomplex *b, integer *ldb, integer *info) { @@ -644,17 +644,18 @@ f"> */ doublereal s; extern logical lsame_(char *, char *); doublecomplex denom; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int zgeru_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeru_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex ak, bk; integer kc, kp; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *), zlacgv_( integer *, doublecomplex *, integer *); doublecomplex akm1, bkm1; @@ -691,13 +692,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHPTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -1079,7 +1080,7 @@ f"> */ ; } - return 0; + return; /* 
End of ZHPTRS */ diff --git a/lapack-netlib/SRC/zhsein.c b/lapack-netlib/SRC/zhsein.c index 16bd334061..a7bc3e15f1 100644 --- a/lapack-netlib/SRC/zhsein.c +++ b/lapack-netlib/SRC/zhsein.c @@ -757,7 +757,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zhsein_(char *side, char *eigsrc, char *initv, logical * +/* Subroutine */ void zhsein_(char *side, char *eigsrc, char *initv, logical * select, integer *n, doublecomplex *h__, integer *ldh, doublecomplex * w, doublecomplex *vl, integer *ldvl, doublecomplex *vr, integer *ldvr, integer *mm, integer *m, doublecomplex *work, doublereal *rwork, @@ -781,7 +781,8 @@ f"> */ integer kr, ks; doublecomplex wk; extern logical disnan_(doublereal *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlaein_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlaein_( logical *, logical *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, doublereal *, doublereal *, doublereal *, integer *); @@ -865,13 +866,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZHSEIN", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ if (*n == 0) { - return 0; + return; } /* Set machine-dependent constants. */ @@ -945,7 +946,7 @@ f"> */ rwork[1]); if (disnan_(&hnorm)) { *info = -6; - return 0; + return; } else if (hnorm > 0.) { eps3 = hnorm * ulp; } else { @@ -1022,7 +1023,7 @@ f"> */ /* L100: */ } - return 0; + return; /* End of ZHSEIN */ diff --git a/lapack-netlib/SRC/zhseqr.c b/lapack-netlib/SRC/zhseqr.c index 29ff10ba47..bdf100a32a 100644 --- a/lapack-netlib/SRC/zhseqr.c +++ b/lapack-netlib/SRC/zhseqr.c @@ -816,7 +816,7 @@ f"> */ /* > of Matrix Analysis, volume 23, pages 948--973, 2002. 
*/ /* ===================================================================== */ -/* Subroutine */ int zhseqr_(char *job, char *compz, integer *n, integer *ilo, +/* Subroutine */ void zhseqr_(char *job, char *compz, integer *n, integer *ilo, integer *ihi, doublecomplex *h__, integer *ldh, doublecomplex *w, doublecomplex *z__, integer *ldz, doublecomplex *work, integer *lwork, integer *info) @@ -834,7 +834,7 @@ f"> */ logical initz; doublecomplex workl[49]; logical wantt, wantz; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlaqr0_(logical *, logical *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, @@ -843,7 +843,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zlahqr_(logical *, logical *, integer *, + extern /* Subroutine */ void zlahqr_(logical *, logical *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *), zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, @@ -919,13 +919,13 @@ f"> */ i__1 = -(*info); xerbla_("ZHSEQR", &i__1, (ftnlen)6); - return 0; + return; } else if (*n == 0) { /* ==== Quick return in case N = 0; nothing to do. 
==== */ - return 0; + return; } else if (lquery) { @@ -940,7 +940,7 @@ f"> */ d__1 = f2cmax(d__2,d__3); z__1.r = d__1, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; } else { @@ -970,7 +970,7 @@ f"> */ i__1 = *ilo; i__2 = *ilo + *ilo * h_dim1; w[i__1].r = h__[i__2].r, w[i__1].i = h__[i__2].i; - return 0; + return; } /* ==== ZLAHQR/ZLAQR0 crossover point ==== */ @@ -1054,6 +1054,6 @@ f"> */ /* ==== End of ZHSEQR ==== */ - return 0; + return; } /* zhseqr_ */ diff --git a/lapack-netlib/SRC/zla_gbamv.c b/lapack-netlib/SRC/zla_gbamv.c index 772ed37dcb..6bab7632d3 100644 --- a/lapack-netlib/SRC/zla_gbamv.c +++ b/lapack-netlib/SRC/zla_gbamv.c @@ -694,7 +694,7 @@ mv.f"> */ /* > \ingroup complex16GBcomputational */ /* ===================================================================== */ -/* Subroutine */ int zla_gbamv_(integer *trans, integer *m, integer *n, +/* Subroutine */ void zla_gbamv_(integer *trans, integer *m, integer *n, integer *kl, integer *ku, doublereal *alpha, doublecomplex *ab, integer *ldab, doublecomplex *x, integer *incx, doublereal *beta, doublereal *y, integer *incy) @@ -756,13 +756,13 @@ mv.f"> */ } if (info != 0) { xerbla_("ZLA_GBAMV ", &info, (ftnlen)10); - return 0; + return; } /* Quick return if possible. */ if (*m == 0 || *n == 0 || *alpha == 0. && *beta == 1.) 
{ - return 0; + return; } /* Set LENX and LENY, the lengths of the vectors x and y, and set */ @@ -955,7 +955,7 @@ mv.f"> */ } } - return 0; + return; /* End of ZLA_GBAMV */ diff --git a/lapack-netlib/SRC/zla_gbrcond_c.c b/lapack-netlib/SRC/zla_gbrcond_c.c index 9d4ec9afa2..5996953417 100644 --- a/lapack-netlib/SRC/zla_gbrcond_c.c +++ b/lapack-netlib/SRC/zla_gbrcond_c.c @@ -690,12 +690,12 @@ doublereal zla_gbrcond_c_(char *trans, integer *n, integer *kl, integer *ku, extern logical lsame_(char *, char *); integer isave[3]; doublereal anorm; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, doublecomplex *, doublereal *, integer *, integer *); integer kd, ke; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; - extern /* Subroutine */ int zgbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void zgbtrs_(char *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); doublereal tmp; diff --git a/lapack-netlib/SRC/zla_gbrcond_x.c b/lapack-netlib/SRC/zla_gbrcond_x.c index 5aeed95f29..b1ffadc645 100644 --- a/lapack-netlib/SRC/zla_gbrcond_x.c +++ b/lapack-netlib/SRC/zla_gbrcond_x.c @@ -683,12 +683,12 @@ doublereal zla_gbrcond_x_(char *trans, integer *n, integer *kl, integer *ku, extern logical lsame_(char *, char *); integer isave[3]; doublereal anorm; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, doublecomplex *, doublereal *, integer *, integer *); integer kd, ke; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; - extern /* Subroutine */ int zgbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void zgbtrs_(char *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); 
doublereal tmp; diff --git a/lapack-netlib/SRC/zla_gbrfsx_extended.c b/lapack-netlib/SRC/zla_gbrfsx_extended.c index 5853ee7fb5..a662d43442 100644 --- a/lapack-netlib/SRC/zla_gbrfsx_extended.c +++ b/lapack-netlib/SRC/zla_gbrfsx_extended.c @@ -922,7 +922,7 @@ fsx_extended.f"> */ /* > \ingroup complex16GBcomputational */ /* ===================================================================== */ -/* Subroutine */ int zla_gbrfsx_extended_(integer *prec_type__, integer * +/* Subroutine */ void zla_gbrfsx_extended_(integer *prec_type__, integer * trans_type__, integer *n, integer *kl, integer *ku, integer *nrhs, doublecomplex *ab, integer *ldab, doublecomplex *afb, integer *ldafb, integer *ipiv, logical *colequ, doublereal *c__, doublecomplex *b, @@ -942,15 +942,15 @@ fsx_extended.f"> */ /* Local variables */ doublereal dx_x__, dz_z__, ymin; - extern /* Subroutine */ int zla_lin_berr_(integer *, integer *, integer * + extern /* Subroutine */ void zla_lin_berr_(integer *, integer *, integer * , doublecomplex *, doublereal *, doublereal *); doublereal dxratmax, dzratmax; - extern /* Subroutine */ int blas_zgbmv_x_(integer *, integer *, integer * + extern /* Subroutine */ void blas_zgbmv_x_(integer *, integer *, integer * , integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); integer y_prec_state__, i__, j, m; - extern /* Subroutine */ int blas_zgbmv2_x_(integer *, integer *, integer + extern /* Subroutine */ void blas_zgbmv2_x_(integer *, integer *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), @@ -960,19 +960,19 @@ fsx_extended.f"> */ doublereal dxrat; logical incr_prec__; doublereal dzrat; - extern /* Subroutine */ int zgbmv_(char *, integer *, integer *, integer * + extern /* Subroutine */ void zgbmv_(char *, integer *, integer *, integer * 
, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); char trans[1]; doublereal normx, normy; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal myhugeval, prev_dz_z__; extern doublereal dlamch_(char *); doublereal yk, final_dx_x__, final_dz_z__, normdx; - extern /* Subroutine */ int zgbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void zgbtrs_(char *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *), zla_wwaddw_( integer *, doublecomplex *, doublecomplex *, doublecomplex *); @@ -1022,7 +1022,7 @@ fsx_extended.f"> */ /* Function Body */ if (*info != 0) { - return 0; + return; } chla_transtype_(ch__1, trans_type__); *(unsigned char *)trans = *(unsigned char *)&ch__1[0]; @@ -1268,6 +1268,6 @@ fsx_extended.f"> */ } - return 0; + return; } /* zla_gbrfsx_extended__ */ diff --git a/lapack-netlib/SRC/zla_geamv.c b/lapack-netlib/SRC/zla_geamv.c index bc89727122..5ad3c22075 100644 --- a/lapack-netlib/SRC/zla_geamv.c +++ b/lapack-netlib/SRC/zla_geamv.c @@ -683,7 +683,7 @@ mv.f"> */ /* > \ingroup complex16GEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zla_geamv_(integer *trans, integer *m, integer *n, +/* Subroutine */ void zla_geamv_(integer *trans, integer *m, integer *n, doublereal *alpha, doublecomplex *a, integer *lda, doublecomplex *x, integer *incx, doublereal *beta, doublereal *y, integer *incy) { @@ -739,13 +739,13 @@ mv.f"> */ } if (info != 0) { xerbla_("ZLA_GEAMV ", &info, (ftnlen)10); - return 0; + return; } /* Quick return if possible. */ if (*m == 0 || *n == 0 || *alpha == 0. && *beta == 1.) 
{ - return 0; + return; } /* Set LENX and LENY, the lengths of the vectors x and y, and set */ @@ -916,7 +916,7 @@ mv.f"> */ } } - return 0; + return; /* End of ZLA_GEAMV */ diff --git a/lapack-netlib/SRC/zla_gercond_c.c b/lapack-netlib/SRC/zla_gercond_c.c index cf253da970..b1a63cd923 100644 --- a/lapack-netlib/SRC/zla_gercond_c.c +++ b/lapack-netlib/SRC/zla_gercond_c.c @@ -670,11 +670,11 @@ doublereal zla_gercond_c_(char *trans, integer *n, doublecomplex *a, integer extern logical lsame_(char *, char *); integer isave[3]; doublereal anorm; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, - doublecomplex *, doublereal *, integer *, integer *), xerbla_( - char *, integer *, ftnlen); + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, + doublecomplex *, doublereal *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; - extern /* Subroutine */ int zgetrs_(char *, integer *, integer *, + extern /* Subroutine */ void zgetrs_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); doublereal tmp; diff --git a/lapack-netlib/SRC/zla_gercond_x.c b/lapack-netlib/SRC/zla_gercond_x.c index a468677066..2b862092ff 100644 --- a/lapack-netlib/SRC/zla_gercond_x.c +++ b/lapack-netlib/SRC/zla_gercond_x.c @@ -662,11 +662,11 @@ doublereal zla_gercond_x_(char *trans, integer *n, doublecomplex *a, integer extern logical lsame_(char *, char *); integer isave[3]; doublereal anorm; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, - doublecomplex *, doublereal *, integer *, integer *), xerbla_( - char *, integer *, ftnlen); + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, + doublecomplex *, doublereal *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; - extern /* Subroutine */ int zgetrs_(char *, integer *, integer *, + extern /* Subroutine */ void zgetrs_(char *, integer *, integer *, 
doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); doublereal tmp; diff --git a/lapack-netlib/SRC/zla_gerfsx_extended.c b/lapack-netlib/SRC/zla_gerfsx_extended.c index 879f79f1cc..34703a163e 100644 --- a/lapack-netlib/SRC/zla_gerfsx_extended.c +++ b/lapack-netlib/SRC/zla_gerfsx_extended.c @@ -907,7 +907,7 @@ fsx_extended.f"> */ /* > \ingroup complex16GEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zla_gerfsx_extended_(integer *prec_type__, integer * +/* Subroutine */ void zla_gerfsx_extended_(integer *prec_type__, integer * trans_type__, integer *n, integer *nrhs, doublecomplex *a, integer * lda, doublecomplex *af, integer *ldaf, integer *ipiv, logical *colequ, doublereal *c__, doublecomplex *b, integer *ldb, doublecomplex *y, @@ -926,36 +926,36 @@ fsx_extended.f"> */ /* Local variables */ doublereal dx_x__, dz_z__, ymin; - extern /* Subroutine */ int zla_lin_berr_(integer *, integer *, integer * + extern /* Subroutine */ void zla_lin_berr_(integer *, integer *, integer * , doublecomplex *, doublereal *, doublereal *); doublereal dxratmax, dzratmax; - extern /* Subroutine */ int blas_zgemv_x_(integer *, integer *, integer * + extern /* Subroutine */ void blas_zgemv_x_(integer *, integer *, integer * , doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *) ; integer y_prec_state__, i__, j; - extern /* Subroutine */ int blas_zgemv2_x_(integer *, integer *, integer + extern /* Subroutine */ void blas_zgemv2_x_(integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); doublereal dxrat; logical incr_prec__; doublereal dzrat; - extern /* Subroutine */ int zla_geamv_(integer *, integer *, integer *, + extern /* Subroutine */ void zla_geamv_(integer *, integer *, integer *, doublereal 
*, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, integer *); char trans[1]; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); doublereal normx, normy; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal myhugeval, prev_dz_z__; extern doublereal dlamch_(char *); doublereal yk, final_dx_x__, final_dz_z__, normdx; - extern /* Subroutine */ int zgetrs_(char *, integer *, integer *, + extern /* Subroutine */ void zgetrs_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *), zla_wwaddw_(integer *, doublecomplex *, doublecomplex *, doublecomplex *); @@ -1005,7 +1005,7 @@ fsx_extended.f"> */ /* Function Body */ if (*info != 0) { - return 0; + return; } chla_transtype_(ch__1, trans_type__); *(unsigned char *)trans = *(unsigned char *)&ch__1[0]; @@ -1250,6 +1250,6 @@ fsx_extended.f"> */ } - return 0; + return; } /* zla_gerfsx_extended__ */ diff --git a/lapack-netlib/SRC/zla_heamv.c b/lapack-netlib/SRC/zla_heamv.c index d68cf5b1c7..d76bb993cb 100644 --- a/lapack-netlib/SRC/zla_heamv.c +++ b/lapack-netlib/SRC/zla_heamv.c @@ -687,7 +687,7 @@ mv.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zla_heamv_(integer *uplo, integer *n, doublereal *alpha, +/* Subroutine */ void zla_heamv_(integer *uplo, integer *n, doublereal *alpha, doublecomplex *a, integer *lda, doublecomplex *x, integer *incx, doublereal *beta, doublereal *y, integer *incy) { @@ -740,13 +740,13 @@ mv.f"> */ } 
if (info != 0) { xerbla_("ZHEMV ", &info, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ if (*n == 0 || *alpha == 0. && *beta == 1.) { - return 0; + return; } /* Set up the start points in X and Y. */ @@ -959,7 +959,7 @@ mv.f"> */ } } - return 0; + return; /* End of ZLA_HEAMV */ diff --git a/lapack-netlib/SRC/zla_hercond_c.c b/lapack-netlib/SRC/zla_hercond_c.c index a986dd2626..bb5e2fe4d5 100644 --- a/lapack-netlib/SRC/zla_hercond_c.c +++ b/lapack-netlib/SRC/zla_hercond_c.c @@ -668,12 +668,12 @@ doublereal zla_hercond_c_(char *uplo, integer *n, doublecomplex *a, integer * integer isave[3]; doublereal anorm; logical upper; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, doublecomplex *, doublereal *, integer *, integer *); logical up; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; - extern /* Subroutine */ int zhetrs_(char *, integer *, integer *, + extern /* Subroutine */ void zhetrs_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); doublereal tmp; diff --git a/lapack-netlib/SRC/zla_hercond_x.c b/lapack-netlib/SRC/zla_hercond_x.c index 35396ef964..879164bd3e 100644 --- a/lapack-netlib/SRC/zla_hercond_x.c +++ b/lapack-netlib/SRC/zla_hercond_x.c @@ -660,12 +660,12 @@ doublereal zla_hercond_x_(char *uplo, integer *n, doublecomplex *a, integer * integer isave[3]; doublereal anorm; logical upper; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, doublecomplex *, doublereal *, integer *, integer *); logical up; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; - extern /* Subroutine */ int zhetrs_(char *, integer *, integer *, + extern /* Subroutine */ void zhetrs_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer 
*, integer *); doublereal tmp; diff --git a/lapack-netlib/SRC/zla_herfsx_extended.c b/lapack-netlib/SRC/zla_herfsx_extended.c index c425141909..2b4145e8ea 100644 --- a/lapack-netlib/SRC/zla_herfsx_extended.c +++ b/lapack-netlib/SRC/zla_herfsx_extended.c @@ -907,7 +907,7 @@ fsx_extended.f"> */ /* > \ingroup complex16HEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zla_herfsx_extended_(integer *prec_type__, char *uplo, +/* Subroutine */ void zla_herfsx_extended_(integer *prec_type__, char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *af, integer *ldaf, integer *ipiv, logical *colequ, doublereal *c__, doublecomplex *b, integer *ldb, doublecomplex *y, @@ -925,31 +925,31 @@ fsx_extended.f"> */ /* Local variables */ doublereal dx_x__, dz_z__, ymin; - extern /* Subroutine */ int zla_lin_berr_(integer *, integer *, integer * + extern /* Subroutine */ void zla_lin_berr_(integer *, integer *, integer * , doublecomplex *, doublereal *, doublereal *); doublereal dxratmax, dzratmax; integer y_prec_state__; - extern /* Subroutine */ int blas_zhemv_x_(integer *, integer *, + extern /* Subroutine */ void blas_zhemv_x_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *) ; integer uplo2, i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int blas_zhemv2_x_(integer *, integer *, + extern /* Subroutine */ void blas_zhemv2_x_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); doublereal dxrat; logical incr_prec__; doublereal dzrat; - extern /* Subroutine */ int zla_heamv_(integer *, integer *, doublereal * + extern /* Subroutine */ void zla_heamv_(integer *, integer *, doublereal * , doublecomplex *, integer *, doublecomplex *, integer *, doublereal 
*, doublereal *, integer *), zhemv_(char *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical upper; doublereal normx, normy; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal myhugeval, prev_dz_z__; @@ -957,7 +957,7 @@ fsx_extended.f"> */ doublereal yk; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal final_dx_x__, final_dz_z__, normdx; - extern /* Subroutine */ int zhetrs_(char *, integer *, integer *, + extern /* Subroutine */ void zhetrs_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *), zla_wwaddw_(integer *, doublecomplex *, doublecomplex *, doublecomplex *); @@ -1026,7 +1026,7 @@ fsx_extended.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZLA_HERFSX_EXTENDED", &i__1, (ftnlen)19); - return 0; + return; } eps = dlamch_("Epsilon"); myhugeval = dlamch_("Overflow"); @@ -1260,6 +1260,6 @@ fsx_extended.f"> */ } - return 0; + return; } /* zla_herfsx_extended__ */ diff --git a/lapack-netlib/SRC/zla_lin_berr.c b/lapack-netlib/SRC/zla_lin_berr.c index c44ab42edb..3a64125e86 100644 --- a/lapack-netlib/SRC/zla_lin_berr.c +++ b/lapack-netlib/SRC/zla_lin_berr.c @@ -610,7 +610,7 @@ _berr.f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zla_lin_berr_(integer *n, integer *nz, integer *nrhs, +/* Subroutine */ void zla_lin_berr_(integer *n, integer *nz, integer *nrhs, doublecomplex *res, doublereal *ayb, doublereal *berr) { /* System generated locals */ @@ -675,6 +675,6 @@ _berr.f"> */ } } - return 0; + return; } /* zla_lin_berr__ */ diff --git 
a/lapack-netlib/SRC/zla_porcond_c.c b/lapack-netlib/SRC/zla_porcond_c.c index 3b64bbf0ea..d5c914b400 100644 --- a/lapack-netlib/SRC/zla_porcond_c.c +++ b/lapack-netlib/SRC/zla_porcond_c.c @@ -659,12 +659,12 @@ doublereal zla_porcond_c_(char *uplo, integer *n, doublecomplex *a, integer * integer isave[3]; doublereal anorm; logical upper; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, doublecomplex *, doublereal *, integer *, integer *); logical up; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; - extern /* Subroutine */ int zpotrs_(char *, integer *, integer *, + extern /* Subroutine */ void zpotrs_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); doublereal tmp; diff --git a/lapack-netlib/SRC/zla_porcond_x.c b/lapack-netlib/SRC/zla_porcond_x.c index 98f95157f0..50fe978060 100644 --- a/lapack-netlib/SRC/zla_porcond_x.c +++ b/lapack-netlib/SRC/zla_porcond_x.c @@ -652,12 +652,12 @@ doublereal zla_porcond_x_(char *uplo, integer *n, doublecomplex *a, integer * integer isave[3]; doublereal anorm; logical upper; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, doublecomplex *, doublereal *, integer *, integer *); logical up; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; - extern /* Subroutine */ int zpotrs_(char *, integer *, integer *, + extern /* Subroutine */ void zpotrs_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); doublereal tmp; diff --git a/lapack-netlib/SRC/zla_porfsx_extended.c b/lapack-netlib/SRC/zla_porfsx_extended.c index 2aed207268..47982e6581 100644 --- a/lapack-netlib/SRC/zla_porfsx_extended.c +++ b/lapack-netlib/SRC/zla_porfsx_extended.c @@ -900,7 +900,7 @@ fsx_extended.f"> */ /* > \ingroup complex16POcomputational */ 
/* ===================================================================== */ -/* Subroutine */ int zla_porfsx_extended_(integer *prec_type__, char *uplo, +/* Subroutine */ void zla_porfsx_extended_(integer *prec_type__, char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *af, integer *ldaf, logical *colequ, doublereal *c__, doublecomplex *b, integer *ldb, doublecomplex *y, integer *ldy, @@ -918,36 +918,36 @@ fsx_extended.f"> */ /* Local variables */ doublereal dx_x__, dz_z__, ymin; - extern /* Subroutine */ int zla_lin_berr_(integer *, integer *, integer * + extern /* Subroutine */ void zla_lin_berr_(integer *, integer *, integer * , doublecomplex *, doublereal *, doublereal *); doublereal dxratmax, dzratmax; integer y_prec_state__; - extern /* Subroutine */ int blas_zhemv_x_(integer *, integer *, + extern /* Subroutine */ void blas_zhemv_x_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *) ; integer uplo2, i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int blas_zhemv2_x_(integer *, integer *, + extern /* Subroutine */ void blas_zhemv2_x_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); doublereal dxrat; logical incr_prec__; doublereal dzrat; - extern /* Subroutine */ int zla_heamv_(integer *, integer *, doublereal * + extern /* Subroutine */ void zla_heamv_(integer *, integer *, doublereal * , doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, integer *), zhemv_(char *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); doublereal normx, normy; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex 
*, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal myhugeval, prev_dz_z__; extern doublereal dlamch_(char *); doublereal yk, final_dx_x__, final_dz_z__, normdx; - extern /* Subroutine */ int zla_wwaddw_(integer *, doublecomplex *, + extern /* Subroutine */ void zla_wwaddw_(integer *, doublecomplex *, doublecomplex *, doublecomplex *), zpotrs_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -996,7 +996,7 @@ fsx_extended.f"> */ /* Function Body */ if (*info != 0) { - return 0; + return; } eps = dlamch_("Epsilon"); myhugeval = dlamch_("Overflow"); @@ -1229,6 +1229,6 @@ fsx_extended.f"> */ } - return 0; + return; } /* zla_porfsx_extended__ */ diff --git a/lapack-netlib/SRC/zla_syamv.c b/lapack-netlib/SRC/zla_syamv.c index 17ca5cfd3d..f6a0c77840 100644 --- a/lapack-netlib/SRC/zla_syamv.c +++ b/lapack-netlib/SRC/zla_syamv.c @@ -688,7 +688,7 @@ mv.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zla_syamv_(integer *uplo, integer *n, doublereal *alpha, +/* Subroutine */ void zla_syamv_(integer *uplo, integer *n, doublereal *alpha, doublecomplex *a, integer *lda, doublecomplex *x, integer *incx, doublereal *beta, doublereal *y, integer *incy) { @@ -741,13 +741,13 @@ mv.f"> */ } if (info != 0) { xerbla_("ZLA_SYAMV", &info, (ftnlen)9); - return 0; + return; } /* Quick return if possible. */ if (*n == 0 || *alpha == 0. && *beta == 1.) { - return 0; + return; } /* Set up the start points in X and Y. 
*/ @@ -960,7 +960,7 @@ mv.f"> */ } } - return 0; + return; /* End of ZLA_SYAMV */ diff --git a/lapack-netlib/SRC/zla_syrcond_c.c b/lapack-netlib/SRC/zla_syrcond_c.c index d3beb3a4b4..988a63713f 100644 --- a/lapack-netlib/SRC/zla_syrcond_c.c +++ b/lapack-netlib/SRC/zla_syrcond_c.c @@ -668,12 +668,12 @@ doublereal zla_syrcond_c_(char *uplo, integer *n, doublecomplex *a, integer * integer isave[3]; doublereal anorm; logical upper; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, doublecomplex *, doublereal *, integer *, integer *); logical up; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; - extern /* Subroutine */ int zsytrs_(char *, integer *, integer *, + extern /* Subroutine */ void zsytrs_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); doublereal tmp; diff --git a/lapack-netlib/SRC/zla_syrcond_x.c b/lapack-netlib/SRC/zla_syrcond_x.c index b4c25c2adf..7b771f1444 100644 --- a/lapack-netlib/SRC/zla_syrcond_x.c +++ b/lapack-netlib/SRC/zla_syrcond_x.c @@ -660,12 +660,12 @@ doublereal zla_syrcond_x_(char *uplo, integer *n, doublecomplex *a, integer * integer isave[3]; doublereal anorm; logical upper; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, doublecomplex *, doublereal *, integer *, integer *); logical up; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; - extern /* Subroutine */ int zsytrs_(char *, integer *, integer *, + extern /* Subroutine */ void zsytrs_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); doublereal tmp; diff --git a/lapack-netlib/SRC/zla_syrfsx_extended.c b/lapack-netlib/SRC/zla_syrfsx_extended.c index fc1dc54917..8e5b40297b 100644 --- a/lapack-netlib/SRC/zla_syrfsx_extended.c +++ 
b/lapack-netlib/SRC/zla_syrfsx_extended.c @@ -907,7 +907,7 @@ fsx_extended.f"> */ /* > \ingroup complex16SYcomputational */ /* ===================================================================== */ -/* Subroutine */ int zla_syrfsx_extended_(integer *prec_type__, char *uplo, +/* Subroutine */ void zla_syrfsx_extended_(integer *prec_type__, char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *af, integer *ldaf, integer *ipiv, logical *colequ, doublereal *c__, doublecomplex *b, integer *ldb, doublecomplex *y, @@ -925,11 +925,11 @@ fsx_extended.f"> */ /* Local variables */ doublereal dx_x__, dz_z__, ymin; - extern /* Subroutine */ int zla_lin_berr_(integer *, integer *, integer * + extern /* Subroutine */ void zla_lin_berr_(integer *, integer *, integer * , doublecomplex *, doublereal *, doublereal *); doublereal dxratmax, dzratmax; integer y_prec_state__, uplo2, i__, j; - extern /* Subroutine */ int blas_zsymv_x_(integer *, integer *, + extern /* Subroutine */ void blas_zsymv_x_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *) ; @@ -938,16 +938,16 @@ fsx_extended.f"> */ logical incr_prec__; doublereal dzrat; logical upper; - extern /* Subroutine */ int blas_zsymv2_x_(integer *, integer *, + extern /* Subroutine */ void blas_zsymv2_x_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); doublereal normx, normy; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal myhugeval, prev_dz_z__; - extern /* Subroutine */ int zla_syamv_(integer *, integer *, doublereal * + extern /* Subroutine */ void 
zla_syamv_(integer *, integer *, doublereal * , doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, integer *), zsymv_(char *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, @@ -956,7 +956,7 @@ fsx_extended.f"> */ doublereal yk; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal final_dx_x__, final_dz_z__, normdx; - extern /* Subroutine */ int zla_wwaddw_(integer *, doublecomplex *, + extern /* Subroutine */ void zla_wwaddw_(integer *, doublecomplex *, doublecomplex *, doublecomplex *), zsytrs_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); @@ -1025,7 +1025,7 @@ fsx_extended.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZLA_HERFSX_EXTENDED", &i__1, (ftnlen)19); - return 0; + return; } eps = dlamch_("Epsilon"); myhugeval = dlamch_("Overflow"); @@ -1259,6 +1259,6 @@ fsx_extended.f"> */ } - return 0; + return; } /* zla_syrfsx_extended__ */ diff --git a/lapack-netlib/SRC/zla_wwaddw.c b/lapack-netlib/SRC/zla_wwaddw.c index 9247e16cc7..63a8542b69 100644 --- a/lapack-netlib/SRC/zla_wwaddw.c +++ b/lapack-netlib/SRC/zla_wwaddw.c @@ -590,7 +590,7 @@ ddw.f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zla_wwaddw_(integer *n, doublecomplex *x, doublecomplex +/* Subroutine */ void zla_wwaddw_(integer *n, doublecomplex *x, doublecomplex *y, doublecomplex *w) { /* System generated locals */ @@ -637,6 +637,6 @@ ddw.f"> */ x[i__2].r = s.r, x[i__2].i = s.i; /* L10: */ } - return 0; + return; } /* zla_wwaddw__ */ diff --git a/lapack-netlib/SRC/zlabrd.c b/lapack-netlib/SRC/zlabrd.c index 4655c0e767..208655c58d 100644 --- a/lapack-netlib/SRC/zlabrd.c +++ b/lapack-netlib/SRC/zlabrd.c @@ -726,7 +726,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int 
zlabrd_(integer *m, integer *n, integer *nb, +/* Subroutine */ void zlabrd_(integer *m, integer *n, integer *nb, doublecomplex *a, integer *lda, doublereal *d__, doublereal *e, doublecomplex *tauq, doublecomplex *taup, doublecomplex *x, integer * ldx, doublecomplex *y, integer *ldy) @@ -739,7 +739,7 @@ f"> */ /* Local variables */ integer i__; doublecomplex alpha; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), @@ -775,7 +775,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0) { - return 0; + return; } if (*m >= *n) { @@ -1070,7 +1070,7 @@ f"> */ /* L20: */ } } - return 0; + return; /* End of ZLABRD */ diff --git a/lapack-netlib/SRC/zlacgv.c b/lapack-netlib/SRC/zlacgv.c index 0ea47da7de..13809ceb4b 100644 --- a/lapack-netlib/SRC/zlacgv.c +++ b/lapack-netlib/SRC/zlacgv.c @@ -583,7 +583,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlacgv_(integer *n, doublecomplex *x, integer *incx) +/* Subroutine */ void zlacgv_(integer *n, doublecomplex *x, integer *incx) { /* System generated locals */ integer i__1, i__2; @@ -628,7 +628,7 @@ f"> */ /* L20: */ } } - return 0; + return; /* End of ZLACGV */ diff --git a/lapack-netlib/SRC/zlacn2.c b/lapack-netlib/SRC/zlacn2.c index 0bbd32c897..8984c8bd15 100644 --- a/lapack-netlib/SRC/zlacn2.c +++ b/lapack-netlib/SRC/zlacn2.c @@ -647,7 +647,7 @@ f"> */ /* > ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988. 
*/ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlacn2_(integer *n, doublecomplex *v, doublecomplex *x, +/* Subroutine */ void zlacn2_(integer *n, doublecomplex *v, doublecomplex *x, doublereal *est, integer *kase, integer *isave) { /* System generated locals */ @@ -660,7 +660,7 @@ f"> */ integer i__; doublereal absxi; integer jlast; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern integer izmax1_(integer *, doublecomplex *, integer *); extern doublereal dzsum1_(integer *, doublecomplex *, integer *), dlamch_( @@ -695,7 +695,7 @@ f"> */ } *kase = 1; isave[1] = 1; - return 0; + return; } switch (isave[1]) { @@ -736,7 +736,7 @@ f"> */ } *kase = 2; isave[1] = 2; - return 0; + return; /* ................ ENTRY (ISAVE( 1 ) = 2) */ /* FIRST ITERATION. X HAS BEEN OVERWRITTEN BY CTRANS(A)*X. */ @@ -758,7 +758,7 @@ f"> */ x[i__1].r = 1., x[i__1].i = 0.; *kase = 1; isave[1] = 3; - return 0; + return; /* ................ ENTRY (ISAVE( 1 ) = 3) */ /* X HAS BEEN OVERWRITTEN BY A*X. */ @@ -791,7 +791,7 @@ f"> */ } *kase = 2; isave[1] = 4; - return 0; + return; /* ................ ENTRY (ISAVE( 1 ) = 4) */ /* X HAS BEEN OVERWRITTEN BY CTRANS(A)*X. */ @@ -819,7 +819,7 @@ f"> */ } *kase = 1; isave[1] = 5; - return 0; + return; /* ................ ENTRY (ISAVE( 1 ) = 5) */ /* X HAS BEEN OVERWRITTEN BY A*X. */ @@ -833,7 +833,7 @@ f"> */ L130: *kase = 0; - return 0; + return; /* End of ZLACN2 */ diff --git a/lapack-netlib/SRC/zlacon.c b/lapack-netlib/SRC/zlacon.c index ac0e9e151f..09aed340f1 100644 --- a/lapack-netlib/SRC/zlacon.c +++ b/lapack-netlib/SRC/zlacon.c @@ -628,7 +628,7 @@ f"> */ /* > ACM Trans. Math. Soft., vol. 14, no. 4, pp. 381-396, December 1988. 
*/ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlacon_(integer *n, doublecomplex *v, doublecomplex *x, +/* Subroutine */ void zlacon_(integer *n, doublecomplex *v, doublecomplex *x, doublereal *est, integer *kase) { /* System generated locals */ @@ -642,7 +642,7 @@ f"> */ static integer jump, i__, j; static doublereal absxi; static integer jlast; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern integer izmax1_(integer *, doublecomplex *, integer *); extern doublereal dzsum1_(integer *, doublecomplex *, integer *), dlamch_( @@ -676,7 +676,7 @@ f"> */ } *kase = 1; jump = 1; - return 0; + return; } switch (jump) { @@ -717,7 +717,7 @@ f"> */ } *kase = 2; jump = 2; - return 0; + return; /* ................ ENTRY (JUMP = 2) */ /* FIRST ITERATION. X HAS BEEN OVERWRITTEN BY CTRANS(A)*X. */ @@ -739,7 +739,7 @@ f"> */ x[i__1].r = 1., x[i__1].i = 0.; *kase = 1; jump = 3; - return 0; + return; /* ................ ENTRY (JUMP = 3) */ /* X HAS BEEN OVERWRITTEN BY A*X. */ @@ -772,7 +772,7 @@ f"> */ } *kase = 2; jump = 4; - return 0; + return; /* ................ ENTRY (JUMP = 4) */ /* X HAS BEEN OVERWRITTEN BY CTRANS(A)*X. */ @@ -800,7 +800,7 @@ f"> */ } *kase = 1; jump = 5; - return 0; + return; /* ................ ENTRY (JUMP = 5) */ /* X HAS BEEN OVERWRITTEN BY A*X. 
*/ @@ -814,7 +814,7 @@ f"> */ L130: *kase = 0; - return 0; + return; /* End of ZLACON */ diff --git a/lapack-netlib/SRC/zlacp2.c b/lapack-netlib/SRC/zlacp2.c index 5dfbf34583..86f6db0c2a 100644 --- a/lapack-netlib/SRC/zlacp2.c +++ b/lapack-netlib/SRC/zlacp2.c @@ -613,7 +613,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlacp2_(char *uplo, integer *m, integer *n, doublereal * +/* Subroutine */ void zlacp2_(char *uplo, integer *m, integer *n, doublereal * a, integer *lda, doublecomplex *b, integer *ldb) { /* System generated locals */ @@ -682,7 +682,7 @@ f"> */ } } - return 0; + return; /* End of ZLACP2 */ diff --git a/lapack-netlib/SRC/zlacpy.c b/lapack-netlib/SRC/zlacpy.c index ed67080162..3db74dbfa4 100644 --- a/lapack-netlib/SRC/zlacpy.c +++ b/lapack-netlib/SRC/zlacpy.c @@ -612,7 +612,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlacpy_(char *uplo, integer *m, integer *n, +/* Subroutine */ void zlacpy_(char *uplo, integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb) { /* System generated locals */ @@ -681,7 +681,7 @@ f"> */ } } - return 0; + return; /* End of ZLACPY */ diff --git a/lapack-netlib/SRC/zlacrm.c b/lapack-netlib/SRC/zlacrm.c index 4fa719d6db..842ad2bfd8 100644 --- a/lapack-netlib/SRC/zlacrm.c +++ b/lapack-netlib/SRC/zlacrm.c @@ -628,7 +628,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlacrm_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zlacrm_(integer *m, integer *n, doublecomplex *a, integer *lda, doublereal *b, integer *ldb, doublecomplex *c__, integer *ldc, doublereal *rwork) { @@ -640,7 +640,7 @@ f"> */ /* Local variables */ integer i__, j, l; - extern /* Subroutine */ int 
dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); @@ -670,7 +670,7 @@ f"> */ /* Function Body */ if (*m == 0 || *n == 0) { - return 0; + return; } i__1 = *n; @@ -725,7 +725,7 @@ f"> */ /* L80: */ } - return 0; + return; /* End of ZLACRM */ diff --git a/lapack-netlib/SRC/zlacrt.c b/lapack-netlib/SRC/zlacrt.c index 5389b45699..587630d433 100644 --- a/lapack-netlib/SRC/zlacrt.c +++ b/lapack-netlib/SRC/zlacrt.c @@ -614,7 +614,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlacrt_(integer *n, doublecomplex *cx, integer *incx, +/* Subroutine */ void zlacrt_(integer *n, doublecomplex *cx, integer *incx, doublecomplex *cy, integer *incy, doublecomplex *c__, doublecomplex * s) { @@ -643,7 +643,7 @@ f"> */ /* Function Body */ if (*n <= 0) { - return 0; + return; } if (*incx == 1 && *incy == 1) { goto L20; @@ -684,7 +684,7 @@ f"> */ iy += *incy; /* L10: */ } - return 0; + return; /* Code for both increments equal to 1 */ @@ -712,6 +712,6 @@ f"> */ cx[i__2].r = ctemp.r, cx[i__2].i = ctemp.i; /* L30: */ } - return 0; + return; } /* zlacrt_ */ diff --git a/lapack-netlib/SRC/zladiv.c b/lapack-netlib/SRC/zladiv.c index 3780f24fda..82c963534c 100644 --- a/lapack-netlib/SRC/zladiv.c +++ b/lapack-netlib/SRC/zladiv.c @@ -584,7 +584,7 @@ f"> */ /* Local variables */ doublereal zi; - extern /* Subroutine */ int dladiv_(doublereal *, doublereal *, + extern /* Subroutine */ void dladiv_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal zr; diff --git a/lapack-netlib/SRC/zlaed0.c b/lapack-netlib/SRC/zlaed0.c index 37bd12b011..a904235fc2 100644 --- a/lapack-netlib/SRC/zlaed0.c +++ b/lapack-netlib/SRC/zlaed0.c @@ -660,7 +660,7 @@ f"> */ /* > \ingroup 
complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zlaed0_(integer *qsiz, integer *n, doublereal *d__, +/* Subroutine */ void zlaed0_(integer *qsiz, integer *n, doublereal *d__, doublereal *e, doublecomplex *q, integer *ldq, doublecomplex *qstore, integer *ldqs, doublereal *rwork, integer *iwork, integer *info) { @@ -671,10 +671,10 @@ f"> */ /* Local variables */ doublereal temp; integer curr, i__, j, k, iperm; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer indxq, iwrem, iqptr, tlvls; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlaed7_(integer *, integer *, integer *, integer *, integer *, integer *, doublereal *, doublecomplex *, integer *, doublereal *, integer *, doublereal *, @@ -685,11 +685,11 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zlacrm_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zlacrm_(integer *, integer *, doublecomplex *, integer *, doublereal *, integer *, doublecomplex *, integer *, doublereal *); integer igivnm, submat, curprb, subpbs, igivpt; - extern /* Subroutine */ int dsteqr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsteqr_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *); integer curlvl, matsiz, iprmpt, smlsiz, lgn, msd2, smm1, spm1, spm2; @@ -738,13 +738,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZLAED0", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } 
smlsiz = ilaenv_(&c__9, "ZLAED0", " ", &c__0, &c__0, &c__0, &c__0, ( @@ -793,10 +793,10 @@ f"> */ temp = log((doublereal) (*n)) / log(2.); lgn = (integer) temp; - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } iprmpt = indxq + *n + 1; @@ -844,7 +844,7 @@ f"> */ ++curr; if (*info > 0) { *info = submat * (*n + 1) + submat + matsiz - 1; - return 0; + return; } k = 1; i__2 = iwork[i__ + 1]; @@ -894,7 +894,7 @@ f"> */ q_dim1 + 1], &rwork[iwrem], &iwork[subpbs + 1], info); if (*info > 0) { *info = submat * (*n + 1) + submat + matsiz - 1; - return 0; + return; } iwork[i__ / 2 + 1] = iwork[i__ + 2]; /* L90: */ @@ -919,7 +919,7 @@ f"> */ } dcopy_(n, &rwork[1], &c__1, &d__[1], &c__1); - return 0; + return; /* End of ZLAED0 */ diff --git a/lapack-netlib/SRC/zlaed7.c b/lapack-netlib/SRC/zlaed7.c index 093051917f..78c21aa2eb 100644 --- a/lapack-netlib/SRC/zlaed7.c +++ b/lapack-netlib/SRC/zlaed7.c @@ -761,7 +761,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zlaed7_(integer *n, integer *cutpnt, integer *qsiz, +/* Subroutine */ void zlaed7_(integer *n, integer *cutpnt, integer *qsiz, integer *tlvls, integer *curlvl, integer *curpbm, doublereal *d__, doublecomplex *q, integer *ldq, doublereal *rho, integer *indxq, doublereal *qstore, integer *qptr, integer *prmptr, integer *perm, @@ -773,7 +773,7 @@ f"> */ /* Local variables */ integer indx, curr, i__, k, indxc, indxp, n1, n2; - extern /* Subroutine */ int dlaed9_(integer *, integer *, integer *, + extern /* Subroutine */ void dlaed9_(integer *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *), zlaed8_(integer *, integer *, integer *, doublecomplex *, integer @@ -785,8 +785,10 @@ f"> */ doublereal *, doublereal *, integer *, 
doublereal *, doublereal *, integer *); integer idlmda, iq, iw, iz; - extern /* Subroutine */ int dlamrg_(integer *, integer *, doublereal *, - integer *, integer *, integer *), xerbla_(char *, integer *, ftnlen), zlacrm_(integer *, integer *, doublecomplex *, integer *, + extern /* Subroutine */ void dlamrg_(integer *, integer *, doublereal *, + integer *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zlacrm_(integer *, integer *, doublecomplex *, integer *, doublereal *, integer *, doublecomplex *, integer *, doublereal * ); integer coltyp, ptr; @@ -838,13 +840,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZLAED7", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* The following values are for bookkeeping purposes only. They are */ @@ -864,11 +866,11 @@ f"> */ /* Form the z-vector which consists of the last row of Q_1 and the */ /* first row of Q_2. */ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = *tlvls - i__; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L10: */ } curr = ptr + *curpbm; @@ -906,7 +908,7 @@ f"> */ i__1 = k; qptr[curr + 1] = qptr[curr] + i__1 * i__1; if (*info != 0) { - return 0; + return; } /* Prepare the INDXQ sorting premutation. 
*/ @@ -923,7 +925,7 @@ f"> */ } } - return 0; + return; /* End of ZLAED7 */ diff --git a/lapack-netlib/SRC/zlaed8.c b/lapack-netlib/SRC/zlaed8.c index 1838507f32..54edb64f9d 100644 --- a/lapack-netlib/SRC/zlaed8.c +++ b/lapack-netlib/SRC/zlaed8.c @@ -741,7 +741,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zlaed8_(integer *k, integer *n, integer *qsiz, +/* Subroutine */ void zlaed8_(integer *k, integer *n, integer *qsiz, doublecomplex *q, integer *ldq, doublereal *d__, doublereal *rho, integer *cutpnt, doublereal *z__, doublereal *dlamda, doublecomplex * q2, integer *ldq2, doublereal *w, integer *indxp, integer *indx, @@ -757,19 +757,21 @@ f"> */ doublereal c__; integer i__, j; doublereal s, t; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer k2, n1, n2; - extern /* Subroutine */ int zdrot_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zdrot_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *), zcopy_( integer *, doublecomplex *, integer *, doublecomplex *, integer *) ; extern doublereal dlapy2_(doublereal *, doublereal *), dlamch_(char *); integer jp; extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int dlamrg_(integer *, integer *, doublereal *, - integer *, integer *, integer *), xerbla_(char *, integer *, ftnlen), zlacpy_(char *, integer *, integer *, doublecomplex *, + extern /* Subroutine */ void dlamrg_(integer *, integer *, doublereal *, + integer *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer n1p1; doublereal eps, 
tau, tol; @@ -821,7 +823,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZLAED8", &i__1, (ftnlen)6); - return 0; + return; } /* Need to initialize GIVPTR to O here in case of quick exit */ @@ -834,7 +836,7 @@ f"> */ /* Quick return if possible */ if (*n == 0) { - return 0; + return; } n1 = *cutpnt; @@ -900,7 +902,7 @@ f"> */ /* L50: */ } zlacpy_("A", qsiz, n, &q2[q2_dim1 + 1], ldq2, &q[q_dim1 + 1], ldq); - return 0; + return; } /* If there are multiple eigenvalues then the problem deflates. Here */ @@ -1034,7 +1036,7 @@ f"> */ 1) * q_dim1 + 1], ldq); } - return 0; + return; /* End of ZLAED8 */ diff --git a/lapack-netlib/SRC/zlaein.c b/lapack-netlib/SRC/zlaein.c index 49e970e43e..eaa156ecd0 100644 --- a/lapack-netlib/SRC/zlaein.c +++ b/lapack-netlib/SRC/zlaein.c @@ -662,7 +662,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlaein_(logical *rightv, logical *noinit, integer *n, +/* Subroutine */ void zlaein_(logical *rightv, logical *noinit, integer *n, doublecomplex *h__, integer *ldh, doublecomplex *w, doublecomplex *v, doublecomplex *b, integer *ldb, doublereal *rwork, doublereal *eps3, doublereal *smlnum, integer *info) @@ -682,7 +682,7 @@ f"> */ doublereal rtemp, rootn, vnorm; extern doublereal dznrm2_(integer *, doublecomplex *, integer *); doublecomplex ei, ej; - extern /* Subroutine */ int zdscal_(integer *, doublereal *, + extern /* Subroutine */ void zdscal_(integer *, doublereal *, doublecomplex *, integer *); extern integer izamax_(integer *, doublecomplex *, integer *); extern /* Double Complex */ VOID zladiv_(doublecomplex *, doublecomplex *, @@ -690,7 +690,7 @@ f"> */ char normin[1]; extern doublereal dzasum_(integer *, doublecomplex *, integer *); doublereal nrmsml; - extern /* Subroutine */ int zlatrs_(char *, char *, char *, char *, + extern /* Subroutine */ void zlatrs_(char *, char *, char *, char *, integer *, doublecomplex *, integer *, 
doublecomplex *, doublereal *, doublereal *, integer *); doublereal growto; @@ -958,7 +958,7 @@ f"> */ d__2))); zdscal_(n, &d__3, &v[1], &c__1); - return 0; + return; /* End of ZLAEIN */ diff --git a/lapack-netlib/SRC/zlaesy.c b/lapack-netlib/SRC/zlaesy.c index 85151d9165..33cc230050 100644 --- a/lapack-netlib/SRC/zlaesy.c +++ b/lapack-netlib/SRC/zlaesy.c @@ -631,7 +631,7 @@ f"> */ /* > \ingroup complex16SYauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlaesy_(doublecomplex *a, doublecomplex *b, +/* Subroutine */ void zlaesy_(doublecomplex *a, doublecomplex *b, doublecomplex *c__, doublecomplex *rt1, doublecomplex *rt2, doublecomplex *evscal, doublecomplex *cs1, doublecomplex *sn1) { @@ -753,7 +753,7 @@ f"> */ evscal->r = 0., evscal->i = 0.; } } - return 0; + return; /* End of ZLAESY */ diff --git a/lapack-netlib/SRC/zlaev2.c b/lapack-netlib/SRC/zlaev2.c index 0bcd2fb900..a25e056331 100644 --- a/lapack-netlib/SRC/zlaev2.c +++ b/lapack-netlib/SRC/zlaev2.c @@ -632,7 +632,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlaev2_(doublecomplex *a, doublecomplex *b, +/* Subroutine */ void zlaev2_(doublecomplex *a, doublecomplex *b, doublecomplex *c__, doublereal *rt1, doublereal *rt2, doublereal *cs1, doublecomplex *sn1) { @@ -643,7 +643,7 @@ f"> */ /* Local variables */ doublereal t; doublecomplex w; - extern /* Subroutine */ int dlaev2_(doublereal *, doublereal *, + extern /* Subroutine */ void dlaev2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); @@ -671,7 +671,7 @@ f"> */ dlaev2_(&d__1, &d__2, &d__3, rt1, rt2, cs1, &t); z__1.r = t * w.r, z__1.i = t * w.i; sn1->r = z__1.r, sn1->i = z__1.i; - return 0; + return; /* End of ZLAEV2 */ diff --git a/lapack-netlib/SRC/zlag2c.c b/lapack-netlib/SRC/zlag2c.c index 82a463d396..60d24d21ce 100644 --- 
a/lapack-netlib/SRC/zlag2c.c +++ b/lapack-netlib/SRC/zlag2c.c @@ -616,7 +616,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlag2c_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zlag2c_(integer *m, integer *n, doublecomplex *a, integer *lda, complex *sa, integer *ldsa, integer *info) { /* System generated locals */ @@ -667,7 +667,7 @@ f"> */ } *info = 0; L30: - return 0; + return; /* End of ZLAG2C */ diff --git a/lapack-netlib/SRC/zlag2c.f b/lapack-netlib/SRC/zlag2c.f index ba141a98fe..434590bb9c 100644 --- a/lapack-netlib/SRC/zlag2c.f +++ b/lapack-netlib/SRC/zlag2c.f @@ -124,7 +124,7 @@ SUBROUTINE ZLAG2C( M, N, A, LDA, SA, LDSA, INFO ) DOUBLE PRECISION RMAX * .. * .. Intrinsic Functions .. - INTRINSIC DBLE, DIMAG + INTRINSIC DBLE, DIMAG, CMPLX * .. * .. External Functions .. REAL SLAMCH @@ -142,7 +142,7 @@ SUBROUTINE ZLAG2C( M, N, A, LDA, SA, LDSA, INFO ) INFO = 1 GO TO 30 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = CMPLX( A( I, J ) ) 10 CONTINUE 20 CONTINUE INFO = 0 diff --git a/lapack-netlib/SRC/zlags2.c b/lapack-netlib/SRC/zlags2.c index c8bc8e9013..9ddde74f6e 100644 --- a/lapack-netlib/SRC/zlags2.c +++ b/lapack-netlib/SRC/zlags2.c @@ -668,7 +668,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlags2_(logical *upper, doublereal *a1, doublecomplex * +/* Subroutine */ void zlags2_(logical *upper, doublereal *a1, doublecomplex * a2, doublereal *a3, doublereal *b1, doublecomplex *b2, doublereal *b3, doublereal *csu, doublecomplex *snu, doublereal *csv, doublecomplex * snv, doublereal *csq, doublecomplex *snq) @@ -684,11 +684,11 @@ f"> */ doublereal d__; doublecomplex r__, d1; doublereal s1, s2; - extern /* Subroutine */ int dlasv2_(doublereal *, doublereal *, + extern /* Subroutine */ void dlasv2_(doublereal *, doublereal *, doublereal *, 
doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal fb, fc; - extern /* Subroutine */ int zlartg_(doublecomplex *, doublecomplex *, + extern /* Subroutine */ void zlartg_(doublecomplex *, doublecomplex *, doublereal *, doublecomplex *, doublecomplex *); doublecomplex ua11, ua12, ua21, ua22, vb11, vb12, vb21, vb22; doublereal csl, csr, snl, snr; @@ -1032,7 +1032,7 @@ f"> */ } - return 0; + return; /* End of ZLAGS2 */ diff --git a/lapack-netlib/SRC/zlagtm.c b/lapack-netlib/SRC/zlagtm.c index 5607cacb49..c157909045 100644 --- a/lapack-netlib/SRC/zlagtm.c +++ b/lapack-netlib/SRC/zlagtm.c @@ -654,7 +654,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlagtm_(char *trans, integer *n, integer *nrhs, +/* Subroutine */ void zlagtm_(char *trans, integer *n, integer *nrhs, doublereal *alpha, doublecomplex *dl, doublecomplex *d__, doublecomplex *du, doublecomplex *x, integer *ldx, doublereal *beta, doublecomplex *b, integer *ldb) @@ -691,7 +691,7 @@ f"> */ /* Function Body */ if (*n == 0) { - return 0; + return; } /* Multiply B by BETA if BETA.NE.1. 
*/ @@ -1161,7 +1161,7 @@ f"> */ } } } - return 0; + return; /* End of ZLAGTM */ diff --git a/lapack-netlib/SRC/zlahef.c b/lapack-netlib/SRC/zlahef.c index 19ba4caa54..466b0ed732 100644 --- a/lapack-netlib/SRC/zlahef.c +++ b/lapack-netlib/SRC/zlahef.c @@ -692,7 +692,7 @@ f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zlahef_(char *uplo, integer *n, integer *nb, integer *kb, +/* Subroutine */ void zlahef_(char *uplo, integer *n, integer *nb, integer *kb, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *w, integer *ldw, integer *info) { @@ -705,26 +705,26 @@ f"> */ integer imax, jmax, j, k; doublereal t, alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer kstep; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); doublereal r1; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex d11, d21, d22; integer jb, jj, kk, jp, kp; doublereal absakk; integer kw; - extern /* Subroutine */ int zdscal_(integer *, doublereal *, + extern /* Subroutine */ void zdscal_(integer *, doublereal *, doublecomplex *, integer *); doublereal colmax; - extern /* Subroutine */ int zlacgv_(integer *, doublecomplex *, integer *) + extern /* Subroutine */ void zlacgv_(integer *, doublecomplex *, integer *) ; extern 
integer izamax_(integer *, doublecomplex *, integer *); doublereal rowmax; @@ -1733,7 +1733,7 @@ f"> */ *kb = k - 1; } - return 0; + return; /* End of ZLAHEF */ diff --git a/lapack-netlib/SRC/zlahef_aa.c b/lapack-netlib/SRC/zlahef_aa.c index cf2cd3bc44..10b77518a1 100644 --- a/lapack-netlib/SRC/zlahef_aa.c +++ b/lapack-netlib/SRC/zlahef_aa.c @@ -658,7 +658,7 @@ aa.f"> */ /* > \ingroup complex16HEcomputational */ /* ===================================================================== */ -/* Subroutine */ int zlahef_aa_(char *uplo, integer *j1, integer *m, integer +/* Subroutine */ void zlahef_aa_(char *uplo, integer *j1, integer *m, integer *nb, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex * h__, integer *ldh, doublecomplex *work) { @@ -671,21 +671,21 @@ aa.f"> */ integer j, k; doublecomplex alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer i1, k1, i2; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); integer mj; - extern /* Subroutine */ int zlacgv_(integer *, doublecomplex *, integer *) + extern /* Subroutine */ void zlacgv_(integer *, doublecomplex *, integer *) ; extern integer izamax_(integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); doublecomplex 
piv; @@ -1100,7 +1100,7 @@ aa.f"> */ L40: ; } - return 0; + return; /* End of ZLAHEF_AA */ diff --git a/lapack-netlib/SRC/zlahef_rk.c b/lapack-netlib/SRC/zlahef_rk.c index b1dc03b333..05f455b776 100644 --- a/lapack-netlib/SRC/zlahef_rk.c +++ b/lapack-netlib/SRC/zlahef_rk.c @@ -776,7 +776,7 @@ rk.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zlahef_rk_(char *uplo, integer *n, integer *nb, integer +/* Subroutine */ void zlahef_rk_(char *uplo, integer *n, integer *nb, integer *kb, doublecomplex *a, integer *lda, doublecomplex *e, integer *ipiv, doublecomplex *w, integer *ldw, integer *info) { @@ -792,16 +792,16 @@ rk.f"> */ extern logical lsame_(char *, char *); doublereal dtemp, sfmin; integer itemp; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer kstep; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); doublereal r1; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex d11, d21, d22; @@ -810,10 +810,10 @@ rk.f"> */ integer kp; doublereal absakk; integer kw; - extern /* Subroutine */ int zdscal_(integer *, doublereal *, + extern /* Subroutine */ void zdscal_(integer *, doublereal *, doublecomplex *, integer *); doublereal colmax; - extern /* Subroutine */ int zlacgv_(integer *, doublecomplex *, integer *) + extern /* Subroutine */ void 
zlacgv_(integer *, doublecomplex *, integer *) ; extern integer izamax_(integer *, doublecomplex *, integer *); doublereal rowmax; @@ -2030,7 +2030,7 @@ rk.f"> */ *kb = k - 1; } - return 0; + return; /* End of ZLAHEF_RK */ diff --git a/lapack-netlib/SRC/zlahef_rook.c b/lapack-netlib/SRC/zlahef_rook.c index 608b1d6626..6172edfd2d 100644 --- a/lapack-netlib/SRC/zlahef_rook.c +++ b/lapack-netlib/SRC/zlahef_rook.c @@ -698,7 +698,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zlahef_rook_(char *uplo, integer *n, integer *nb, +/* Subroutine */ void zlahef_rook_(char *uplo, integer *n, integer *nb, integer *kb, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *w, integer *ldw, integer *info) { @@ -714,16 +714,16 @@ rook.f"> */ extern logical lsame_(char *, char *); doublereal dtemp, sfmin; integer itemp; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer kstep; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); doublereal r1; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex d11, d21, d22; @@ -732,10 +732,10 @@ rook.f"> */ integer kp; doublereal absakk; integer kw; - extern /* Subroutine */ int zdscal_(integer *, doublereal *, + extern /* Subroutine */ void zdscal_(integer *, doublereal *, doublecomplex *, integer *); 
doublereal colmax; - extern /* Subroutine */ int zlacgv_(integer *, doublecomplex *, integer *) + extern /* Subroutine */ void zlacgv_(integer *, doublecomplex *, integer *) ; extern integer izamax_(integer *, doublecomplex *, integer *); integer jp1, jp2; @@ -1973,7 +1973,7 @@ rook.f"> */ *kb = k - 1; } - return 0; + return; /* End of ZLAHEF_ROOK */ diff --git a/lapack-netlib/SRC/zlahqr.c b/lapack-netlib/SRC/zlahqr.c index 22089322f9..8436315e0c 100644 --- a/lapack-netlib/SRC/zlahqr.c +++ b/lapack-netlib/SRC/zlahqr.c @@ -709,7 +709,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlahqr_(logical *wantt, logical *wantz, integer *n, +/* Subroutine */ void zlahqr_(logical *wantt, logical *wantz, integer *n, integer *ilo, integer *ihi, doublecomplex *h__, integer *ldh, doublecomplex *w, integer *iloz, integer *ihiz, doublecomplex *z__, integer *ldz, integer *info) @@ -724,18 +724,18 @@ f"> */ integer i__, j, k, l, m; doublereal s; doublecomplex t, u, v[2], x, y; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); integer itmax; doublereal rtemp; integer i1, i2; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex t1; doublereal t2; doublecomplex v2; doublereal aa, ab, ba, bb; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); doublereal h10; doublecomplex h11; doublereal h21; @@ -744,7 +744,7 @@ f"> */ extern doublereal dlamch_(char *); integer nz; doublereal sx, safmin, safmax; - extern /* Subroutine */ int zlarfg_(integer *, doublecomplex *, + extern /* Subroutine */ void zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex 
*); extern /* Double Complex */ VOID zladiv_(doublecomplex *, doublecomplex *, doublecomplex *); @@ -781,13 +781,13 @@ f"> */ /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*ilo == *ihi) { i__1 = *ilo; i__2 = *ilo + *ilo * h_dim1; w[i__1].r = h__[i__2].r, w[i__1].i = h__[i__2].i; - return 0; + return; } /* ==== clear out the trash ==== */ @@ -1302,7 +1302,7 @@ f"> */ /* Failure to converge in remaining number of iterations */ *info = i__; - return 0; + return; L140: @@ -1318,7 +1318,7 @@ f"> */ goto L30; L150: - return 0; + return; /* End of ZLAHQR */ diff --git a/lapack-netlib/SRC/zlahr2.c b/lapack-netlib/SRC/zlahr2.c index a52036244b..c04c53d91e 100644 --- a/lapack-netlib/SRC/zlahr2.c +++ b/lapack-netlib/SRC/zlahr2.c @@ -699,7 +699,7 @@ f"> */ /* > Mathematical Software, 32(2):180-194, June 2006. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlahr2_(integer *n, integer *k, integer *nb, +/* Subroutine */ void zlahr2_(integer *n, integer *k, integer *nb, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *t, integer *ldt, doublecomplex *y, integer *ldy) { @@ -710,7 +710,7 @@ f"> */ /* Local variables */ integer i__; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, @@ -725,7 +725,7 @@ f"> */ doublecomplex *, integer *), ztrmv_(char *, char *, char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex ei; - extern /* Subroutine */ int zlarfg_(integer *, doublecomplex *, + extern /* Subroutine */ void zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *), zlacgv_(integer *, doublecomplex *, integer *), zlacpy_(char *, integer *, integer *, 
doublecomplex *, integer *, doublecomplex *, integer *); @@ -756,7 +756,7 @@ f"> */ /* Function Body */ if (*n <= 1) { - return 0; + return; } i__1 = *nb; @@ -899,7 +899,7 @@ f"> */ ztrmm_("RIGHT", "Upper", "NO TRANSPOSE", "NON-UNIT", k, nb, &c_b2, &t[ t_offset], ldt, &y[y_offset], ldy); - return 0; + return; /* End of ZLAHR2 */ diff --git a/lapack-netlib/SRC/zlaic1.c b/lapack-netlib/SRC/zlaic1.c index 9a09702c71..dcb06dc7c2 100644 --- a/lapack-netlib/SRC/zlaic1.c +++ b/lapack-netlib/SRC/zlaic1.c @@ -648,7 +648,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlaic1_(integer *job, integer *j, doublecomplex *x, +/* Subroutine */ void zlaic1_(integer *job, integer *j, doublecomplex *x, doublereal *sest, doublecomplex *w, doublecomplex *gamma, doublereal * sestpr, doublecomplex *s, doublecomplex *c__) { @@ -724,7 +724,7 @@ f"> */ c__->r = z__1.r, c__->i = z__1.i; *sestpr = s1 * tmp; } - return 0; + return; } else if (absgam <= eps * absest) { s->r = 1., s->i = 0.; c__->r = 0., c__->i = 0.; @@ -732,7 +732,7 @@ f"> */ s1 = absest / tmp; s2 = absalp / tmp; *sestpr = tmp * sqrt(s1 * s1 + s2 * s2); - return 0; + return; } else if (absalp <= eps * absest) { s1 = absgam; s2 = absest; @@ -745,7 +745,7 @@ f"> */ c__->r = 1., c__->i = 0.; *sestpr = s1; } - return 0; + return; } else if (absest <= eps * absalp || absest <= eps * absgam) { s1 = absgam; s2 = absalp; @@ -770,7 +770,7 @@ f"> */ z__1.r = z__2.r / scl, z__1.i = z__2.i / scl; c__->r = z__1.r, c__->i = z__1.i; } - return 0; + return; } else { /* normal case */ @@ -819,7 +819,7 @@ f"> */ z__1.r = cosine.r / tmp, z__1.i = cosine.i / tmp; c__->r = z__1.r, c__->i = z__1.i; *sestpr = sqrt(t + 1.) 
* absest; - return 0; + return; } } else if (*job == 2) { @@ -860,12 +860,12 @@ f"> */ s->r = z__1.r, s->i = z__1.i; z__1.r = c__->r / tmp, z__1.i = c__->i / tmp; c__->r = z__1.r, c__->i = z__1.i; - return 0; + return; } else if (absgam <= eps * absest) { s->r = 0., s->i = 0.; c__->r = 1., c__->i = 0.; *sestpr = absgam; - return 0; + return; } else if (absalp <= eps * absest) { s1 = absgam; s2 = absest; @@ -878,7 +878,7 @@ f"> */ c__->r = 0., c__->i = 0.; *sestpr = s2; } - return 0; + return; } else if (absest <= eps * absalp || absest <= eps * absgam) { s1 = absgam; s2 = absalp; @@ -909,7 +909,7 @@ f"> */ z__1.r = z__2.r / scl, z__1.i = z__2.i / scl; c__->r = z__1.r, c__->i = z__1.i; } - return 0; + return; } else { /* normal case */ @@ -992,11 +992,11 @@ f"> */ s->r = z__1.r, s->i = z__1.i; z__1.r = cosine.r / tmp, z__1.i = cosine.i / tmp; c__->r = z__1.r, c__->i = z__1.i; - return 0; + return; } } - return 0; + return; /* End of ZLAIC1 */ diff --git a/lapack-netlib/SRC/zlaic1.f b/lapack-netlib/SRC/zlaic1.f index 72948cde9f..47927e7789 100644 --- a/lapack-netlib/SRC/zlaic1.f +++ b/lapack-netlib/SRC/zlaic1.f @@ -348,9 +348,9 @@ SUBROUTINE ZLAIC1( JOB, J, X, SEST, W, GAMMA, SESTPR, S, C ) B = ( ZETA2*ZETA2+ZETA1*ZETA1-ONE )*HALF C = ZETA1*ZETA1 IF( B.GE.ZERO ) THEN - T = -C / ( B+SQRT( B*B+C ) ) + T = DBLE( -C / ( B+SQRT( B*B+C ) ) ) ELSE - T = B - SQRT( B*B+C ) + T = DBLE( B - SQRT( B*B+C ) ) END IF SINE = -( ALPHA / ABSEST ) / T COSINE = -( GAMMA / ABSEST ) / ( ONE+T ) diff --git a/lapack-netlib/SRC/zlals0.c b/lapack-netlib/SRC/zlals0.c index 598cc23a42..3a3dc3fade 100644 --- a/lapack-netlib/SRC/zlals0.c +++ b/lapack-netlib/SRC/zlals0.c @@ -786,7 +786,7 @@ f"> */ /* > Osni Marques, LBNL/NERSC, USA \n */ /* ===================================================================== */ -/* Subroutine */ int zlals0_(integer *icompq, integer *nl, integer *nr, +/* Subroutine */ void zlals0_(integer *icompq, integer *nl, integer *nr, integer *sqre, integer *nrhs, 
doublecomplex *b, integer *ldb, doublecomplex *bx, integer *ldbx, integer *perm, integer *givptr, integer *givcol, integer *ldgcol, doublereal *givnum, integer *ldgnum, @@ -808,18 +808,18 @@ f"> */ extern doublereal dnrm2_(integer *, doublereal *, integer *); integer i__, j, m, n; doublereal diflj, difrj, dsigj; - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), zdrot_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *); extern doublereal dlamc3_(doublereal *, doublereal *); - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal dj; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal dsigjp; - extern /* Subroutine */ int zdscal_(integer *, doublereal *, + extern /* Subroutine */ void zdscal_(integer *, doublereal *, doublecomplex *, integer *), zlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublecomplex * , integer *, integer *), zlacpy_(char *, integer *, @@ -893,7 +893,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZLALS0", &i__1, (ftnlen)6); - return 0; + return; } m = n + *sqre; @@ -1161,7 +1161,7 @@ f"> */ } } - return 0; + return; /* End of ZLALS0 */ diff --git a/lapack-netlib/SRC/zlalsa.c b/lapack-netlib/SRC/zlalsa.c index d17016e7d5..0452472c6c 100644 --- a/lapack-netlib/SRC/zlalsa.c +++ b/lapack-netlib/SRC/zlalsa.c @@ -779,7 +779,7 @@ f"> */ /* > Osni Marques, LBNL/NERSC, USA \n */ /* ===================================================================== */ -/* Subroutine */ int zlalsa_(integer *icompq, integer *smlsiz, integer *n, +/* Subroutine */ void zlalsa_(integer *icompq, integer *smlsiz, integer *n, integer *nrhs, 
doublecomplex *b, integer *ldb, doublecomplex *bx, integer *ldbx, doublereal *u, integer *ldu, doublereal *vt, integer * k, doublereal *difl, doublereal *difr, doublereal *z__, doublereal * @@ -797,11 +797,11 @@ f"> */ /* Local variables */ integer jcol, nlvl, sqre, jrow, i__, j, jimag; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer jreal, inode, ndiml, ndimr, i1; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlals0_(integer *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, integer *, @@ -809,9 +809,9 @@ f"> */ doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *); integer ic, lf, nd, ll, nl, nr; - extern /* Subroutine */ int dlasdt_(integer *, integer *, integer *, - integer *, integer *, integer *, integer *), xerbla_(char *, - integer *, ftnlen); + extern /* Subroutine */ void dlasdt_(integer *, integer *, integer *, + integer *, integer *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer im1, nlf, nrf, lvl, ndb1, nlp1, lvl2, nrp1; @@ -890,7 +890,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZLALSA", &i__1, (ftnlen)6); - return 0; + return; } /* Book-keeping and setting up the computation tree. */ @@ -1051,7 +1051,7 @@ f"> */ /* Finally go through the left singular vector matrices of all */ /* the other subproblems bottom-up on the tree. 
*/ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); sqre = 0; for (lvl = nlvl; lvl >= 1; --lvl) { @@ -1065,7 +1065,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; @@ -1110,7 +1110,7 @@ f"> */ ll = 1; } else { i__2 = lvl - 1; - lf = pow_ii(&c__2, &i__2); + lf = pow_ii(c__2, i__2); ll = (lf << 1) - 1; } i__2 = lf; @@ -1270,7 +1270,7 @@ f"> */ L330: - return 0; + return; /* End of ZLALSA */ diff --git a/lapack-netlib/SRC/zlalsd.c b/lapack-netlib/SRC/zlalsd.c index df0842062b..1f2c72a918 100644 --- a/lapack-netlib/SRC/zlalsd.c +++ b/lapack-netlib/SRC/zlalsd.c @@ -703,7 +703,7 @@ f"> */ /* > Osni Marques, LBNL/NERSC, USA \n */ /* ===================================================================== */ -/* Subroutine */ int zlalsd_(char *uplo, integer *smlsiz, integer *n, integer +/* Subroutine */ void zlalsd_(char *uplo, integer *smlsiz, integer *n, integer *nrhs, doublereal *d__, doublereal *e, doublecomplex *b, integer *ldb, doublereal *rcond, integer *rank, doublecomplex *work, doublereal * rwork, integer *iwork, integer *info) @@ -720,18 +720,18 @@ f"> */ k; doublereal r__; integer s, u, jimag; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer z__, jreal, irwib, poles, sizei, irwrb, nsize; - extern /* Subroutine */ int zdrot_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zdrot_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *), zcopy_( integer *, doublecomplex *, integer *, doublecomplex *, integer *) ; integer irwvt, icmpq1, icmpq2; doublereal cs; extern doublereal dlamch_(char *); - extern /* Subroutine */ int dlasda_(integer *, integer *, integer *, + extern /* Subroutine */ void 
dlasda_(integer *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, integer *, integer *, @@ -739,23 +739,24 @@ f"> */ integer *); integer bx; doublereal sn; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); extern integer idamax_(integer *, doublereal *, integer *); integer st; - extern /* Subroutine */ int dlasdq_(char *, integer *, integer *, integer + extern /* Subroutine */ void dlasdq_(char *, integer *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer vt; - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *), dlartg_(doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *), xerbla_(char *, integer *, ftnlen); + doublereal *); + extern int xerbla_(char *, integer *, ftnlen); integer givcol; extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *); - extern /* Subroutine */ int zlalsa_(integer *, integer *, integer *, + extern /* Subroutine */ void zlalsa_(integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, @@ -808,7 +809,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZLALSD", &i__1, (ftnlen)6); - return 0; + return; } eps = dlamch_("Epsilon"); @@ -826,7 +827,7 @@ f"> */ /* Quick return if possible. */ if (*n == 0) { - return 0; + return; } else if (*n == 1) { if (d__[1] == 0.) 
{ zlaset_("A", &c__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); @@ -836,7 +837,7 @@ f"> */ b_offset], ldb, info); d__[1] = abs(d__[1]); } - return 0; + return; } /* Rotate the matrix if it is lower bidiagonal. */ @@ -879,7 +880,7 @@ f"> */ orgnrm = dlanst_("M", n, &d__[1], &e[1]); if (orgnrm == 0.) { zlaset_("A", n, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); - return 0; + return; } dlascl_("G", &c__0, &c__0, &orgnrm, &c_b10, n, &c__1, &d__[1], n, info); @@ -901,7 +902,7 @@ f"> */ dlasdq_("U", &c__0, n, n, n, &c__0, &d__[1], &e[1], &rwork[irwvt], n, &rwork[irwu], n, &rwork[irwwrk], &c__1, &rwork[irwwrk], info); if (*info != 0) { - return 0; + return; } /* In the real version, B is passed to DLASDQ and multiplied */ @@ -1026,7 +1027,7 @@ f"> */ zlascl_("G", &c__0, &c__0, &orgnrm, &c_b10, n, nrhs, &b[b_offset], ldb, info); - return 0; + return; } /* Book-keeping and setting up some constants. */ @@ -1127,7 +1128,7 @@ f"> */ rwork[nrwork], &c__1, &rwork[nrwork], info) ; if (*info != 0) { - return 0; + return; } /* In the real version, B is passed to DLASDQ and multiplied */ @@ -1195,7 +1196,7 @@ f"> */ givnum + st1], &rwork[c__ + st1], &rwork[s + st1], & rwork[nrwork], &iwork[iwk], info); if (*info != 0) { - return 0; + return; } bxst = bx + st1; zlalsa_(&icmpq2, smlsiz, &nsize, nrhs, &b[st + b_dim1], ldb, & @@ -1206,7 +1207,7 @@ f"> */ st1], &rwork[givnum + st1], &rwork[c__ + st1], &rwork[ s + st1], &rwork[nrwork], &iwork[iwk], info); if (*info != 0) { - return 0; + return; } } st = i__ + 1; @@ -1312,7 +1313,7 @@ f"> */ givnum + st1], &rwork[c__ + st1], &rwork[s + st1], &rwork[ nrwork], &iwork[iwk], info); if (*info != 0) { - return 0; + return; } } /* L320: */ @@ -1325,7 +1326,7 @@ f"> */ zlascl_("G", &c__0, &c__0, &orgnrm, &c_b10, n, nrhs, &b[b_offset], ldb, info); - return 0; + return; /* End of ZLALSD */ diff --git a/lapack-netlib/SRC/zlamswlq.c b/lapack-netlib/SRC/zlamswlq.c index 2394ebbdde..af40d70f5d 100644 --- a/lapack-netlib/SRC/zlamswlq.c +++ 
b/lapack-netlib/SRC/zlamswlq.c @@ -713,7 +713,7 @@ static integer c__0 = 0; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlamswlq_(char *side, char *trans, integer *m, integer * +/* Subroutine */ void zlamswlq_(char *side, char *trans, integer *m, integer * n, integer *k, integer *mb, integer *nb, doublecomplex *a, integer * lda, doublecomplex *t, integer *ldt, doublecomplex *c__, integer *ldc, doublecomplex *work, integer *lwork, integer *info) @@ -731,7 +731,7 @@ static integer c__0 = 0; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical notran, lquery; integer ctr; - extern /* Subroutine */ int zgemlqt_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemlqt_(char *, char *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex * , integer *, doublecomplex *, integer *, doublecomplex *, integer *), ztpmlqt_(char *, char *, integer *, integer *, @@ -800,10 +800,10 @@ static integer c__0 = 0; i__1 = -(*info); xerbla_("ZLAMSWLQ", &i__1, (ftnlen)8); work[1].r = (doublereal) lw, work[1].i = 0.; - return 0; + return; } else if (lquery) { work[1].r = (doublereal) lw, work[1].i = 0.; - return 0; + return; } /* Quick return if possible */ @@ -811,7 +811,7 @@ static integer c__0 = 0; /* Computing MIN */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*k) == 0) { - return 0; + return; } /* Computing MAX */ @@ -819,7 +819,7 @@ static integer c__0 = 0; if (*nb <= *k || *nb >= f2cmax(i__1,*k)) { zgemlqt_(side, trans, m, n, k, mb, &a[a_offset], lda, &t[t_offset], ldt, &c__[c_offset], ldc, &work[1], info); - return 0; + return; } if (left && tran) { @@ -962,7 +962,7 @@ static integer c__0 = 0; } work[1].r = (doublereal) lw, work[1].i = 0.; - return 0; + return; /* End of ZLAMSWLQ */ diff --git a/lapack-netlib/SRC/zlamtsqr.c b/lapack-netlib/SRC/zlamtsqr.c index 5c16c87854..cf2fd49e38 100644 --- a/lapack-netlib/SRC/zlamtsqr.c +++ 
b/lapack-netlib/SRC/zlamtsqr.c @@ -706,7 +706,7 @@ static integer c__0 = 0; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlamtsqr_(char *side, char *trans, integer *m, integer * +/* Subroutine */ void zlamtsqr_(char *side, char *trans, integer *m, integer * n, integer *k, integer *mb, integer *nb, doublecomplex *a, integer * lda, doublecomplex *t, integer *ldt, doublecomplex *c__, integer *ldc, doublecomplex *work, integer *lwork, integer *info) @@ -716,7 +716,7 @@ static integer c__0 = 0; i__3; /* Local variables */ - extern /* Subroutine */ int ztpmqrt_(char *, char *, integer *, integer *, + extern /* Subroutine */ void ztpmqrt_(char *, char *, integer *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); @@ -728,7 +728,7 @@ static integer c__0 = 0; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical notran, lquery; integer ctr; - extern /* Subroutine */ int zgemqrt_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemqrt_(char *, char *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex * , integer *, doublecomplex *, integer *, doublecomplex *, integer *); @@ -799,9 +799,9 @@ static integer c__0 = 0; if (*info != 0) { i__1 = -(*info); xerbla_("ZLAMTSQR", &i__1, (ftnlen)8); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -809,7 +809,7 @@ static integer c__0 = 0; /* Computing MIN */ i__1 = f2cmin(*m,*n); if (f2cmin(i__1,*k) == 0) { - return 0; + return; } /* Computing MAX */ @@ -817,7 +817,7 @@ static integer c__0 = 0; if (*mb <= *k || *mb >= f2cmax(i__1,*k)) { zgemqrt_(side, trans, m, n, k, nb, &a[a_offset], lda, &t[t_offset], ldt, &c__[c_offset], ldc, &work[1], info); - return 0; + return; } if (left && notran) { @@ 
-959,7 +959,7 @@ static integer c__0 = 0; } work[1].r = (doublereal) lw, work[1].i = 0.; - return 0; + return; /* End of ZLAMTSQR */ diff --git a/lapack-netlib/SRC/zlangb.c b/lapack-netlib/SRC/zlangb.c index db5a323dc6..6cd82d7ad1 100644 --- a/lapack-netlib/SRC/zlangb.c +++ b/lapack-netlib/SRC/zlangb.c @@ -647,13 +647,13 @@ doublereal zlangb_(char *norm, integer *n, integer *kl, integer *ku, /* Local variables */ doublereal temp; - extern /* Subroutine */ int dcombssq_(doublereal *, doublereal *); + extern /* Subroutine */ void dcombssq_(doublereal *, doublereal *); integer i__, j, k, l; extern logical lsame_(char *, char *); doublereal value; extern logical disnan_(doublereal *); doublereal colssq[2]; - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); doublereal sum, ssq[2]; diff --git a/lapack-netlib/SRC/zlange.c b/lapack-netlib/SRC/zlange.c index 8166ec0aa1..4d5c755392 100644 --- a/lapack-netlib/SRC/zlange.c +++ b/lapack-netlib/SRC/zlange.c @@ -638,13 +638,13 @@ doublereal zlange_(char *norm, integer *m, integer *n, doublecomplex *a, /* Local variables */ doublereal temp; - extern /* Subroutine */ int dcombssq_(doublereal *, doublereal *); + extern /* Subroutine */ void dcombssq_(doublereal *, doublereal *); integer i__, j; extern logical lsame_(char *, char *); doublereal value; extern logical disnan_(doublereal *); doublereal colssq[2]; - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); doublereal sum, ssq[2]; diff --git a/lapack-netlib/SRC/zlangt.c b/lapack-netlib/SRC/zlangt.c index 98c9a41728..ef6b240a63 100644 --- a/lapack-netlib/SRC/zlangt.c +++ b/lapack-netlib/SRC/zlangt.c @@ -634,7 +634,7 @@ doublereal zlangt_(char *norm, integer *n, doublecomplex *dl, doublecomplex * extern logical 
lsame_(char *, char *); doublereal anorm; extern logical disnan_(doublereal *); - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); doublereal sum; diff --git a/lapack-netlib/SRC/zlanhb.c b/lapack-netlib/SRC/zlanhb.c index a9ad39a759..9187e2fd9e 100644 --- a/lapack-netlib/SRC/zlanhb.c +++ b/lapack-netlib/SRC/zlanhb.c @@ -654,13 +654,13 @@ doublereal zlanhb_(char *norm, char *uplo, integer *n, integer *k, /* Local variables */ doublereal absa; - extern /* Subroutine */ int dcombssq_(doublereal *, doublereal *); + extern /* Subroutine */ void dcombssq_(doublereal *, doublereal *); integer i__, j, l; extern logical lsame_(char *, char *); doublereal value; extern logical disnan_(doublereal *); doublereal colssq[2]; - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); doublereal sum, ssq[2]; diff --git a/lapack-netlib/SRC/zlanhe.c b/lapack-netlib/SRC/zlanhe.c index ed494b948b..4e4392f446 100644 --- a/lapack-netlib/SRC/zlanhe.c +++ b/lapack-netlib/SRC/zlanhe.c @@ -647,13 +647,13 @@ doublereal zlanhe_(char *norm, char *uplo, integer *n, doublecomplex *a, /* Local variables */ doublereal absa; - extern /* Subroutine */ int dcombssq_(doublereal *, doublereal *); + extern /* Subroutine */ void dcombssq_(doublereal *, doublereal *); integer i__, j; extern logical lsame_(char *, char *); doublereal value; extern logical disnan_(doublereal *); doublereal colssq[2]; - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); doublereal sum, ssq[2]; diff --git a/lapack-netlib/SRC/zlanhf.c b/lapack-netlib/SRC/zlanhf.c index dd23ad40ea..cce5b2bd57 100644 --- a/lapack-netlib/SRC/zlanhf.c +++ 
b/lapack-netlib/SRC/zlanhf.c @@ -776,7 +776,7 @@ doublereal zlanhf_(char *norm, char *transr, char *uplo, integer *n, integer n1; doublereal aa; extern logical disnan_(doublereal *); - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); integer lda, ifm, noe, ilu; diff --git a/lapack-netlib/SRC/zlanhp.c b/lapack-netlib/SRC/zlanhp.c index 52239c1e00..753514afb5 100644 --- a/lapack-netlib/SRC/zlanhp.c +++ b/lapack-netlib/SRC/zlanhp.c @@ -640,13 +640,13 @@ doublereal zlanhp_(char *norm, char *uplo, integer *n, doublecomplex *ap, /* Local variables */ doublereal absa; - extern /* Subroutine */ int dcombssq_(doublereal *, doublereal *); + extern /* Subroutine */ void dcombssq_(doublereal *, doublereal *); integer i__, j, k; extern logical lsame_(char *, char *); doublereal value; extern logical disnan_(doublereal *); doublereal colssq[2]; - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); doublereal sum, ssq[2]; diff --git a/lapack-netlib/SRC/zlanhs.c b/lapack-netlib/SRC/zlanhs.c index d5de0e08cb..a6d952e98d 100644 --- a/lapack-netlib/SRC/zlanhs.c +++ b/lapack-netlib/SRC/zlanhs.c @@ -631,13 +631,13 @@ doublereal zlanhs_(char *norm, integer *n, doublecomplex *a, integer *lda, doublereal ret_val; /* Local variables */ - extern /* Subroutine */ int dcombssq_(doublereal *, doublereal *); + extern /* Subroutine */ void dcombssq_(doublereal *, doublereal *); integer i__, j; extern logical lsame_(char *, char *); doublereal value; extern logical disnan_(doublereal *); doublereal colssq[2]; - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); doublereal sum, ssq[2]; diff --git 
a/lapack-netlib/SRC/zlanht.c b/lapack-netlib/SRC/zlanht.c index c7b2552ead..b47df2e45b 100644 --- a/lapack-netlib/SRC/zlanht.c +++ b/lapack-netlib/SRC/zlanht.c @@ -627,7 +627,7 @@ doublereal zlanht_(char *norm, integer *n, doublereal *d__, doublecomplex *e) extern logical lsame_(char *, char *); doublereal anorm; extern logical disnan_(doublereal *); - extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlassq_(integer *, doublereal *, integer *, doublereal *, doublereal *), zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); doublereal sum; diff --git a/lapack-netlib/SRC/zlansb.c b/lapack-netlib/SRC/zlansb.c index a012be6a8a..c1291302b2 100644 --- a/lapack-netlib/SRC/zlansb.c +++ b/lapack-netlib/SRC/zlansb.c @@ -652,13 +652,13 @@ doublereal zlansb_(char *norm, char *uplo, integer *n, integer *k, /* Local variables */ doublereal absa; - extern /* Subroutine */ int dcombssq_(doublereal *, doublereal *); + extern /* Subroutine */ void dcombssq_(doublereal *, doublereal *); integer i__, j, l; extern logical lsame_(char *, char *); doublereal value; extern logical disnan_(doublereal *); doublereal colssq[2]; - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); doublereal sum, ssq[2]; diff --git a/lapack-netlib/SRC/zlansp.c b/lapack-netlib/SRC/zlansp.c index 3356a767b3..a27b7d730c 100644 --- a/lapack-netlib/SRC/zlansp.c +++ b/lapack-netlib/SRC/zlansp.c @@ -638,13 +638,13 @@ doublereal zlansp_(char *norm, char *uplo, integer *n, doublecomplex *ap, /* Local variables */ doublereal absa; - extern /* Subroutine */ int dcombssq_(doublereal *, doublereal *); + extern /* Subroutine */ void dcombssq_(doublereal *, doublereal *); integer i__, j, k; extern logical lsame_(char *, char *); doublereal value; extern logical disnan_(doublereal *); doublereal colssq[2]; - extern /* 
Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); doublereal sum, ssq[2]; diff --git a/lapack-netlib/SRC/zlansy.c b/lapack-netlib/SRC/zlansy.c index b5d8084376..536511c94c 100644 --- a/lapack-netlib/SRC/zlansy.c +++ b/lapack-netlib/SRC/zlansy.c @@ -646,13 +646,13 @@ doublereal zlansy_(char *norm, char *uplo, integer *n, doublecomplex *a, /* Local variables */ doublereal absa; - extern /* Subroutine */ int dcombssq_(doublereal *, doublereal *); + extern /* Subroutine */ void dcombssq_(doublereal *, doublereal *); integer i__, j; extern logical lsame_(char *, char *); doublereal value; extern logical disnan_(doublereal *); doublereal colssq[2]; - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); doublereal sum, ssq[2]; diff --git a/lapack-netlib/SRC/zlantb.c b/lapack-netlib/SRC/zlantb.c index 957a993304..076dcc3804 100644 --- a/lapack-netlib/SRC/zlantb.c +++ b/lapack-netlib/SRC/zlantb.c @@ -662,14 +662,14 @@ doublereal zlantb_(char *norm, char *uplo, char *diag, integer *n, integer *k, doublereal ret_val; /* Local variables */ - extern /* Subroutine */ int dcombssq_(doublereal *, doublereal *); + extern /* Subroutine */ void dcombssq_(doublereal *, doublereal *); integer i__, j, l; logical udiag; extern logical lsame_(char *, char *); doublereal value; extern logical disnan_(doublereal *); doublereal colssq[2]; - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); doublereal sum, ssq[2]; diff --git a/lapack-netlib/SRC/zlantp.c b/lapack-netlib/SRC/zlantp.c index c7f8eaafbe..6e504eb69e 100644 --- a/lapack-netlib/SRC/zlantp.c +++ b/lapack-netlib/SRC/zlantp.c @@ -647,14 +647,14 @@ doublereal 
zlantp_(char *norm, char *uplo, char *diag, integer *n, doublereal ret_val; /* Local variables */ - extern /* Subroutine */ int dcombssq_(doublereal *, doublereal *); + extern /* Subroutine */ void dcombssq_(doublereal *, doublereal *); integer i__, j, k; logical udiag; extern logical lsame_(char *, char *); doublereal value; extern logical disnan_(doublereal *); doublereal colssq[2]; - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); doublereal sum, ssq[2]; diff --git a/lapack-netlib/SRC/zlantr.c b/lapack-netlib/SRC/zlantr.c index bfd89e6574..778689d9ab 100644 --- a/lapack-netlib/SRC/zlantr.c +++ b/lapack-netlib/SRC/zlantr.c @@ -663,14 +663,14 @@ doublereal zlantr_(char *norm, char *uplo, char *diag, integer *m, integer *n, doublereal ret_val; /* Local variables */ - extern /* Subroutine */ int dcombssq_(doublereal *, doublereal *); + extern /* Subroutine */ void dcombssq_(doublereal *, doublereal *); integer i__, j; logical udiag; extern logical lsame_(char *, char *); doublereal value; extern logical disnan_(doublereal *); doublereal colssq[2]; - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); doublereal sum, ssq[2]; diff --git a/lapack-netlib/SRC/zlapll.c b/lapack-netlib/SRC/zlapll.c index 9033034185..7eb7598ce1 100644 --- a/lapack-netlib/SRC/zlapll.c +++ b/lapack-netlib/SRC/zlapll.c @@ -609,7 +609,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlapll_(integer *n, doublecomplex *x, integer *incx, +/* Subroutine */ void zlapll_(integer *n, doublecomplex *x, integer *incx, doublecomplex *y, integer *incy, doublereal *ssmin) { /* System generated locals */ @@ -618,16 +618,16 @@ f"> */ doublecomplex 
z__1, z__2, z__3, z__4; /* Local variables */ - extern /* Subroutine */ int dlas2_(doublereal *, doublereal *, doublereal + extern /* Subroutine */ void dlas2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublecomplex c__; extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal ssmax; - extern /* Subroutine */ int zaxpy_(integer *, doublecomplex *, + extern /* Subroutine */ void zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex a11, a12, a22; - extern /* Subroutine */ int zlarfg_(integer *, doublecomplex *, + extern /* Subroutine */ void zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); doublecomplex tau; @@ -650,7 +650,7 @@ f"> */ /* Function Body */ if (*n <= 1) { *ssmin = 0.; - return 0; + return; } /* Compute the QR factorization of the N-by-2 matrix ( X Y ) */ @@ -681,7 +681,7 @@ f"> */ d__3 = z_abs(&a22); dlas2_(&d__1, &d__2, &d__3, ssmin, &ssmax); - return 0; + return; /* End of ZLAPLL */ diff --git a/lapack-netlib/SRC/zlapmr.c b/lapack-netlib/SRC/zlapmr.c index 794de8c4a8..3895f65197 100644 --- a/lapack-netlib/SRC/zlapmr.c +++ b/lapack-netlib/SRC/zlapmr.c @@ -613,7 +613,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlapmr_(logical *forwrd, integer *m, integer *n, +/* Subroutine */ void zlapmr_(logical *forwrd, integer *m, integer *n, doublecomplex *x, integer *ldx, integer *k) { /* System generated locals */ @@ -641,7 +641,7 @@ f"> */ /* Function Body */ if (*m <= 1) { - return 0; + return; } i__1 = *m; @@ -735,7 +735,7 @@ f"> */ } - return 0; + return; /* End of ZLAPMT */ diff --git a/lapack-netlib/SRC/zlapmt.c b/lapack-netlib/SRC/zlapmt.c index d3359be4c7..93a398986c 100644 --- a/lapack-netlib/SRC/zlapmt.c +++ b/lapack-netlib/SRC/zlapmt.c @@ -613,7 +613,7 @@ 
f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlapmt_(logical *forwrd, integer *m, integer *n, +/* Subroutine */ void zlapmt_(logical *forwrd, integer *m, integer *n, doublecomplex *x, integer *ldx, integer *k) { /* System generated locals */ @@ -641,7 +641,7 @@ f"> */ /* Function Body */ if (*n <= 1) { - return 0; + return; } i__1 = *n; @@ -735,7 +735,7 @@ f"> */ } - return 0; + return; /* End of ZLAPMT */ diff --git a/lapack-netlib/SRC/zlaqgb.c b/lapack-netlib/SRC/zlaqgb.c index 0ca7d57ecd..f27b900968 100644 --- a/lapack-netlib/SRC/zlaqgb.c +++ b/lapack-netlib/SRC/zlaqgb.c @@ -669,7 +669,7 @@ f"> */ /* > \ingroup complex16GBauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlaqgb_(integer *m, integer *n, integer *kl, integer *ku, +/* Subroutine */ void zlaqgb_(integer *m, integer *n, integer *kl, integer *ku, doublecomplex *ab, integer *ldab, doublereal *r__, doublereal *c__, doublereal *rowcnd, doublereal *colcnd, doublereal *amax, char *equed) { @@ -705,7 +705,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. 
*/ @@ -793,7 +793,7 @@ f"> */ *(unsigned char *)equed = 'B'; } - return 0; + return; /* End of ZLAQGB */ diff --git a/lapack-netlib/SRC/zlaqge.c b/lapack-netlib/SRC/zlaqge.c index f5a8368a9d..3842ed5c91 100644 --- a/lapack-netlib/SRC/zlaqge.c +++ b/lapack-netlib/SRC/zlaqge.c @@ -652,7 +652,7 @@ f"> */ /* > \ingroup complex16GEauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlaqge_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zlaqge_(integer *m, integer *n, doublecomplex *a, integer *lda, doublereal *r__, doublereal *c__, doublereal *rowcnd, doublereal *colcnd, doublereal *amax, char *equed) { @@ -688,7 +688,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. */ @@ -764,7 +764,7 @@ f"> */ *(unsigned char *)equed = 'B'; } - return 0; + return; /* End of ZLAQGE */ diff --git a/lapack-netlib/SRC/zlaqhb.c b/lapack-netlib/SRC/zlaqhb.c index 36b4a9f22f..bc31968c00 100644 --- a/lapack-netlib/SRC/zlaqhb.c +++ b/lapack-netlib/SRC/zlaqhb.c @@ -650,7 +650,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlaqhb_(char *uplo, integer *n, integer *kd, +/* Subroutine */ void zlaqhb_(char *uplo, integer *n, integer *kd, doublecomplex *ab, integer *ldab, doublereal *s, doublereal *scond, doublereal *amax, char *equed) { @@ -687,7 +687,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. 
*/ @@ -756,7 +756,7 @@ f"> */ *(unsigned char *)equed = 'Y'; } - return 0; + return; /* End of ZLAQHB */ diff --git a/lapack-netlib/SRC/zlaqhe.c b/lapack-netlib/SRC/zlaqhe.c index 90deb53d31..9940976380 100644 --- a/lapack-netlib/SRC/zlaqhe.c +++ b/lapack-netlib/SRC/zlaqhe.c @@ -643,7 +643,7 @@ f"> */ /* > \ingroup complex16HEauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlaqhe_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zlaqhe_(char *uplo, integer *n, doublecomplex *a, integer *lda, doublereal *s, doublereal *scond, doublereal *amax, char *equed) { @@ -680,7 +680,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. */ @@ -745,7 +745,7 @@ f"> */ *(unsigned char *)equed = 'Y'; } - return 0; + return; /* End of ZLAQHE */ diff --git a/lapack-netlib/SRC/zlaqhp.c b/lapack-netlib/SRC/zlaqhp.c index 3d5ff62787..843d69036a 100644 --- a/lapack-netlib/SRC/zlaqhp.c +++ b/lapack-netlib/SRC/zlaqhp.c @@ -635,7 +635,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlaqhp_(char *uplo, integer *n, doublecomplex *ap, +/* Subroutine */ void zlaqhp_(char *uplo, integer *n, doublecomplex *ap, doublereal *s, doublereal *scond, doublereal *amax, char *equed) { /* System generated locals */ @@ -671,7 +671,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. 
*/ @@ -740,7 +740,7 @@ f"> */ *(unsigned char *)equed = 'Y'; } - return 0; + return; /* End of ZLAQHP */ diff --git a/lapack-netlib/SRC/zlaqp2.c b/lapack-netlib/SRC/zlaqp2.c index 5519f75206..a85160bbc0 100644 --- a/lapack-netlib/SRC/zlaqp2.c +++ b/lapack-netlib/SRC/zlaqp2.c @@ -661,7 +661,7 @@ f"> */ /* > \endhtmlonly */ /* ===================================================================== */ -/* Subroutine */ int zlaqp2_(integer *m, integer *n, integer *offset, +/* Subroutine */ void zlaqp2_(integer *m, integer *n, integer *offset, doublecomplex *a, integer *lda, integer *jpvt, doublecomplex *tau, doublereal *vn1, doublereal *vn2, doublecomplex *work) { @@ -675,7 +675,7 @@ f"> */ integer i__, j; doublereal tol3z; integer offpi, itemp; - extern /* Subroutine */ int zlarf_(char *, integer *, integer *, + extern /* Subroutine */ void zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); @@ -683,7 +683,7 @@ f"> */ char *); integer mn; extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int zlarfg_(integer *, doublecomplex *, + extern /* Subroutine */ void zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); doublecomplex aii; integer pvt; @@ -800,7 +800,7 @@ f"> */ /* L20: */ } - return 0; + return; /* End of ZLAQP2 */ diff --git a/lapack-netlib/SRC/zlaqps.c b/lapack-netlib/SRC/zlaqps.c index e1f3beaa0e..19fda2a091 100644 --- a/lapack-netlib/SRC/zlaqps.c +++ b/lapack-netlib/SRC/zlaqps.c @@ -692,7 +692,7 @@ f"> */ /* > \endhtmlonly */ /* ===================================================================== */ -/* Subroutine */ int zlaqps_(integer *m, integer *n, integer *offset, integer +/* Subroutine */ void zlaqps_(integer *m, integer *n, integer *offset, integer *nb, integer *kb, doublecomplex *a, integer *lda, integer *jpvt, doublecomplex *tau, 
doublereal *vn1, doublereal *vn2, doublecomplex * auxv, doublecomplex *f, integer *ldf) @@ -707,7 +707,7 @@ f"> */ integer j, k; doublereal tol3z; integer itemp; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zgemv_(char *, integer *, integer *, @@ -720,7 +720,7 @@ f"> */ integer rk; extern integer idamax_(integer *, doublereal *, integer *); integer lsticc; - extern /* Subroutine */ int zlarfg_(integer *, doublecomplex *, + extern /* Subroutine */ void zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); integer lastrk; doublecomplex akk; @@ -939,7 +939,7 @@ f"> */ goto L60; } - return 0; + return; /* End of ZLAQPS */ diff --git a/lapack-netlib/SRC/zlaqr0.c b/lapack-netlib/SRC/zlaqr0.c index 06b58fd1f9..4c8ece3724 100644 --- a/lapack-netlib/SRC/zlaqr0.c +++ b/lapack-netlib/SRC/zlaqr0.c @@ -762,7 +762,7 @@ f"> */ /* > of Matrix Analysis, volume 23, pages 948--973, 2002. 
*/ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlaqr0_(logical *wantt, logical *wantz, integer *n, +/* Subroutine */ void zlaqr0_(logical *wantt, logical *wantz, integer *n, integer *ilo, integer *ihi, doublecomplex *h__, integer *ldh, doublecomplex *w, integer *iloz, integer *ihiz, doublecomplex *z__, integer *ldz, doublecomplex *work, integer *lwork, integer *info) @@ -781,7 +781,7 @@ f"> */ doublereal s; integer itmax, nsmax, nwmax, kwtop; doublecomplex aa, bb, cc, dd; - extern /* Subroutine */ int zlaqr3_(logical *, logical *, integer *, + extern /* Subroutine */ void zlaqr3_(logical *, logical *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, integer *, @@ -802,7 +802,7 @@ f"> */ doublecomplex rtdisc; integer nwupbd; logical sorted; - extern /* Subroutine */ int zlahqr_(logical *, logical *, integer *, + extern /* Subroutine */ void zlahqr_(logical *, logical *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *), zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, @@ -852,7 +852,7 @@ f"> */ if (*n == 0) { work[1].r = 1., work[1].i = 0.; - return 0; + return; } if (*n <= 15) { @@ -936,7 +936,7 @@ f"> */ d__1 = (doublereal) lwkopt; z__1.r = d__1, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; } /* ==== ZLAHQR/ZLAQR0 crossover point ==== */ @@ -1365,6 +1365,6 @@ f"> */ /* ==== End of ZLAQR0 ==== */ - return 0; + return; } /* zlaqr0_ */ diff --git a/lapack-netlib/SRC/zlaqr1.c b/lapack-netlib/SRC/zlaqr1.c index 2232d286b9..c0b702645c 100644 --- a/lapack-netlib/SRC/zlaqr1.c +++ b/lapack-netlib/SRC/zlaqr1.c @@ -617,7 +617,7 @@ f"> */ /* > University of Kansas, USA */ /* > */ /* 
===================================================================== */ -/* Subroutine */ int zlaqr1_(integer *n, doublecomplex *h__, integer *ldh, +/* Subroutine */ void zlaqr1_(integer *n, doublecomplex *h__, integer *ldh, doublecomplex *s1, doublecomplex *s2, doublecomplex *v) { /* System generated locals */ @@ -649,7 +649,7 @@ f"> */ /* Function Body */ if (*n != 2 && *n != 3) { - return 0; + return; } if (*n == 2) { @@ -754,6 +754,6 @@ f"> */ v[3].r = z__1.r, v[3].i = z__1.i; } } - return 0; + return; } /* zlaqr1_ */ diff --git a/lapack-netlib/SRC/zlaqr2.c b/lapack-netlib/SRC/zlaqr2.c index e7c352293a..dee1cf49bc 100644 --- a/lapack-netlib/SRC/zlaqr2.c +++ b/lapack-netlib/SRC/zlaqr2.c @@ -786,7 +786,7 @@ f"> */ /* > University of Kansas, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlaqr2_(logical *wantt, logical *wantz, integer *n, +/* Subroutine */ void zlaqr2_(logical *wantt, logical *wantz, integer *n, integer *ktop, integer *kbot, integer *nw, doublecomplex *h__, integer *ldh, integer *iloz, integer *ihiz, doublecomplex *z__, integer *ldz, integer *ns, integer *nd, doublecomplex *sh, @@ -804,21 +804,21 @@ f"> */ doublecomplex beta; integer kcol, info, ifst, ilst, ltop, krow, i__, j; doublecomplex s; - extern /* Subroutine */ int zlarf_(char *, integer *, integer *, + extern /* Subroutine */ void zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); integer infqr; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer kwtop; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, 
doublecomplex *, integer *), dlabad_(doublereal *, doublereal *); extern doublereal dlamch_(char *); integer jw; doublereal safmin, safmax; - extern /* Subroutine */ int zgehrd_(integer *, integer *, integer *, + extern /* Subroutine */ void zgehrd_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *), zlahqr_(logical *, @@ -829,11 +829,11 @@ f"> */ zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); doublereal smlnum; - extern /* Subroutine */ int ztrexc_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void ztrexc_(char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, integer *); integer lwkopt; - extern /* Subroutine */ int zunmhr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunmhr_(char *, char *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer * ); @@ -908,7 +908,7 @@ f"> */ d__1 = (doublereal) lwkopt; z__1.r = d__1, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; } /* ==== Nothing to do ... */ @@ -917,11 +917,11 @@ f"> */ *nd = 0; work[1].r = 1., work[1].i = 0.; if (*ktop > *kbot) { - return 0; + return; } /* ... nor for an empty deflation window. ==== */ if (*nw < 1) { - return 0; + return; } /* ==== Machine constants ==== */ @@ -968,7 +968,7 @@ f"> */ } } work[1].r = 1., work[1].i = 0.; - return 0; + return; } /* ==== Convert to spike-triangular form. 
(In case of a */ @@ -1214,6 +1214,6 @@ f"> */ /* ==== End of ZLAQR2 ==== */ - return 0; + return; } /* zlaqr2_ */ diff --git a/lapack-netlib/SRC/zlaqr3.c b/lapack-netlib/SRC/zlaqr3.c index 8ac0b9a29a..a07baaa37c 100644 --- a/lapack-netlib/SRC/zlaqr3.c +++ b/lapack-netlib/SRC/zlaqr3.c @@ -784,7 +784,7 @@ f"> */ /* > University of Kansas, USA */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlaqr3_(logical *wantt, logical *wantz, integer *n, +/* Subroutine */ void zlaqr3_(logical *wantt, logical *wantz, integer *n, integer *ktop, integer *kbot, integer *nw, doublecomplex *h__, integer *ldh, integer *iloz, integer *ihiz, doublecomplex *z__, integer *ldz, integer *ns, integer *nd, doublecomplex *sh, @@ -802,16 +802,16 @@ f"> */ doublecomplex beta; integer kcol, info, nmin, ifst, ilst, ltop, krow, i__, j; doublecomplex s; - extern /* Subroutine */ int zlarf_(char *, integer *, integer *, + extern /* Subroutine */ void zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); integer infqr; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer kwtop; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), dlabad_(doublereal *, doublereal *), zlaqr4_(logical *, logical *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, @@ -822,7 +822,7 @@ f"> */ doublereal safmin, safmax; extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zgehrd_(integer *, integer *, 
integer *, + extern /* Subroutine */ void zgehrd_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *), zlahqr_(logical *, @@ -833,11 +833,11 @@ f"> */ zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); doublereal smlnum; - extern /* Subroutine */ int ztrexc_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void ztrexc_(char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, integer *); integer lwkopt; - extern /* Subroutine */ int zunmhr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunmhr_(char *, char *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer * ); @@ -920,7 +920,7 @@ f"> */ d__1 = (doublereal) lwkopt; z__1.r = d__1, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; } /* ==== Nothing to do ... */ @@ -929,11 +929,11 @@ f"> */ *nd = 0; work[1].r = 1., work[1].i = 0.; if (*ktop > *kbot) { - return 0; + return; } /* ... nor for an empty deflation window. ==== */ if (*nw < 1) { - return 0; + return; } /* ==== Machine constants ==== */ @@ -980,7 +980,7 @@ f"> */ } } work[1].r = 1., work[1].i = 0.; - return 0; + return; } /* ==== Convert to spike-triangular form. (In case of a */ @@ -1234,6 +1234,6 @@ f"> */ /* ==== End of ZLAQR3 ==== */ - return 0; + return; } /* zlaqr3_ */ diff --git a/lapack-netlib/SRC/zlaqr4.c b/lapack-netlib/SRC/zlaqr4.c index 999bdc91e8..97be063dce 100644 --- a/lapack-netlib/SRC/zlaqr4.c +++ b/lapack-netlib/SRC/zlaqr4.c @@ -768,7 +768,7 @@ f"> */ /* > of Matrix Analysis, volume 23, pages 948--973, 2002. 
*/ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlaqr4_(logical *wantt, logical *wantz, integer *n, +/* Subroutine */ void zlaqr4_(logical *wantt, logical *wantz, integer *n, integer *ilo, integer *ihi, doublecomplex *h__, integer *ldh, doublecomplex *w, integer *iloz, integer *ihiz, doublecomplex *z__, integer *ldz, doublecomplex *work, integer *lwork, integer *info) @@ -787,7 +787,7 @@ f"> */ doublereal s; integer itmax, nsmax, nwmax, kwtop; doublecomplex aa, bb, cc, dd; - extern /* Subroutine */ int zlaqr2_(logical *, logical *, integer *, + extern /* Subroutine */ void zlaqr2_(logical *, logical *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, integer *, @@ -805,7 +805,7 @@ f"> */ doublecomplex rtdisc; integer nwupbd; logical sorted; - extern /* Subroutine */ int zlahqr_(logical *, logical *, integer *, + extern /* Subroutine */ void zlahqr_(logical *, logical *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *), zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, @@ -855,7 +855,7 @@ f"> */ if (*n == 0) { work[1].r = 1., work[1].i = 0.; - return 0; + return; } if (*n <= 15) { @@ -939,7 +939,7 @@ f"> */ d__1 = (doublereal) lwkopt; z__1.r = d__1, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; } /* ==== ZLAHQR/ZLAQR0 crossover point ==== */ @@ -1362,6 +1362,6 @@ f"> */ /* ==== End of ZLAQR4 ==== */ - return 0; + return; } /* zlaqr4_ */ diff --git a/lapack-netlib/SRC/zlaqr5.c b/lapack-netlib/SRC/zlaqr5.c index 4086989686..1aacd4c7e0 100644 --- a/lapack-netlib/SRC/zlaqr5.c +++ b/lapack-netlib/SRC/zlaqr5.c @@ -773,7 +773,7 @@ f"> */ /* > ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). 
*/ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlaqr5_(logical *wantt, logical *wantz, integer *kacc22, +/* Subroutine */ void zlaqr5_(logical *wantt, logical *wantz, integer *kacc22, integer *n, integer *ktop, integer *kbot, integer *nshfts, doublecomplex *s, doublecomplex *h__, integer *ldh, integer *iloz, integer *ihiz, doublecomplex *z__, integer *ldz, doublecomplex *v, @@ -795,24 +795,24 @@ f"> */ doublecomplex alpha; logical accum; integer ndcol, incol, krcol, nbmps; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer i2, k1, i4; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); doublereal h11, h12, h21, h22; - extern /* Subroutine */ int zlaqr1_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlaqr1_(integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, doublecomplex *); integer m22; extern doublereal dlamch_(char *); integer ns, nu; doublecomplex vt[3]; doublereal safmin, safmax; - extern /* Subroutine */ int zlarfg_(integer *, doublecomplex *, + extern /* Subroutine */ void zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); doublecomplex refsum; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); @@ -856,14 +856,14 @@ f"> */ /* Function Body */ if (*nshfts < 2) { - return 0; + return; } /* ==== If the active block is empty or 1-by-1, then there */ /* . is nothing to do. 
==== */ if (*ktop >= *kbot) { - return 0; + return; } /* ==== NSHFTS is supposed to be even, but if it is odd, */ @@ -1904,6 +1904,6 @@ f"> */ /* ==== End of ZLAQR5 ==== */ - return 0; + return; } /* zlaqr5_ */ diff --git a/lapack-netlib/SRC/zlaqr5.f b/lapack-netlib/SRC/zlaqr5.f index 3185508bc0..d8c521349e 100644 --- a/lapack-netlib/SRC/zlaqr5.f +++ b/lapack-netlib/SRC/zlaqr5.f @@ -279,7 +279,7 @@ SUBROUTINE ZLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, PARAMETER ( RZERO = 0.0d0, RONE = 1.0d0 ) * .. * .. Local Scalars .. - COMPLEX*16 ALPHA, BETA, CDUM, REFSUM + COMPLEX*16 ALPHA, BETA, CDUM, REFSUM, T1, T2, T3 DOUBLE PRECISION H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, @@ -424,12 +424,12 @@ SUBROUTINE ZLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, * ==== Perform update from right within * . computational window. ==== * + T1 = V( 1, M22 ) + T2 = T1*DCONJG( V( 2, M22 ) ) DO 30 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) + REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 30 CONTINUE * * ==== Perform update from left within @@ -442,12 +442,13 @@ SUBROUTINE ZLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, ELSE JBOT = KBOT END IF + T1 = DCONJG( V( 1, M22 ) ) + T2 = T1*V( 2, M22 ) DO 40 J = K+1, JBOT - REFSUM = DCONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + REFSUM = H( K+1, J ) + + $ DCONJG( V( 2, M22 ) )*H( K+2, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 40 CONTINUE * * ==== The following convergence test requires that @@ -532,11 +533,13 @@ SUBROUTINE ZLAQR5( WANTT, WANTZ, KACC22, N, KTOP, 
KBOT, NSHFTS, S, * . Mth bulge. Exploit fact that first two elements * . of row are actually zero. ==== * - REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) - H( K+3, K ) = -REFSUM - H( K+3, K+1 ) = -REFSUM*DCONJG( V( 2, M ) ) - H( K+3, K+2 ) = H( K+3, K+2 ) - - $ REFSUM*DCONJG( V( 3, M ) ) + T1 = V( 1, M ) + T2 = T1*DCONJG( V( 2, M ) ) + T3 = T1*DCONJG( V( 3, M ) ) + REFSUM = V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM*T1 + H( K+3, K+1 ) = -REFSUM*T2 + H( K+3, K+2 ) = H( K+3, K+2 ) - REFSUM*T3 * * ==== Calculate reflection to move * . Mth bulge one step. ==== @@ -571,12 +574,13 @@ SUBROUTINE ZLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, $ S( 2*M ), VT ) ALPHA = VT( 1 ) CALL ZLARFG( 3, ALPHA, VT( 2 ), 1, VT( 1 ) ) - REFSUM = DCONJG( VT( 1 ) )* - $ ( H( K+1, K )+DCONJG( VT( 2 ) )* - $ H( K+2, K ) ) + T1 = DCONJG( VT( 1 ) ) + T2 = T1*VT( 2 ) + T3 = T1*VT( 3 ) + REFSUM = H( K+1, K )+DCONJG( VT( 2 ) )*H( K+2, K ) * - IF( CABS1( H( K+2, K )-REFSUM*VT( 2 ) )+ - $ CABS1( REFSUM*VT( 3 ) ).GT.ULP* + IF( CABS1( H( K+2, K )-REFSUM*T2 )+ + $ CABS1( REFSUM*T3 ).GT.ULP* $ ( CABS1( H( K, K ) )+CABS1( H( K+1, $ K+1 ) )+CABS1( H( K+2, K+2 ) ) ) ) THEN * @@ -594,7 +598,7 @@ SUBROUTINE ZLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, * . Replace the old reflector with * . the new one. ==== * - H( K+1, K ) = H( K+1, K ) - REFSUM + H( K+1, K ) = H( K+1, K ) - REFSUM*T1 H( K+2, K ) = ZERO H( K+3, K ) = ZERO V( 1, M ) = VT( 1 ) @@ -610,25 +614,29 @@ SUBROUTINE ZLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, * . deflation check. We still delay most of the * . updates from the left for efficiency. 
==== * + T1 = V( 1, M ) + T2 = T1*DCONJG( V( 2, M ) ) + T3 = T1*DCONJG( V( 3, M ) ) DO 70 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) + REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 ) + $ + V( 3, M )*H( J, K+3 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 + H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3 70 CONTINUE * * ==== Perform update from left for subsequent * . column. ==== * - REFSUM = DCONJG( V( 1, M ) )*( H( K+1, K+1 ) - $ +DCONJG( V( 2, M ) )*H( K+2, K+1 ) - $ +DCONJG( V( 3, M ) )*H( K+3, K+1 ) ) - H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM - H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) - H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) + T1 = DCONJG( V( 1, M ) ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) + REFSUM = H( K+1, K+1 ) + $ + DCONJG( V( 2, M ) )*H( K+2, K+1 ) + $ + DCONJG( V( 3, M ) )*H( K+3, K+1 ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1 + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2 + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3 * * ==== The following convergence test requires that * . 
the tradition small-compared-to-nearby-diagonals @@ -688,13 +696,15 @@ SUBROUTINE ZLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, * DO 100 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = DCONJG( V( 1, M ) ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT - REFSUM = DCONJG( V( 1, M ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M ) )* - $ H( K+2, J )+DCONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, J ) + DCONJG( V( 2, M ) )*H( K+2, J ) + $ + DCONJG( V( 3, M ) )*H( K+3, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 + H( K+3, J ) = H( K+3, J ) - REFSUM*T3 90 CONTINUE 100 CONTINUE * @@ -712,14 +722,15 @@ SUBROUTINE ZLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, I2 = MAX( 1, KTOP-INCOL ) I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + T1 = V( 1, M ) + T2 = T1*DCONJG( V( 2, M ) ) + T3 = T1*DCONJG( V( 3, M ) ) DO 110 J = I2, I4 - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) + REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 ) + $ + V( 3, M )*U( J, KMS+3 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3 110 CONTINUE 120 CONTINUE ELSE IF( WANTZ ) THEN @@ -730,14 +741,15 @@ SUBROUTINE ZLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, * DO 140 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*DCONJG( V( 2, M ) ) + T3 = T1*DCONJG( V( 3, M ) ) DO 130 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) 
- - $ REFSUM*DCONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) + REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 ) + $ + V( 3, M )*Z( J, K+3 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3 130 CONTINUE 140 CONTINUE END IF diff --git a/lapack-netlib/SRC/zlaqsb.c b/lapack-netlib/SRC/zlaqsb.c index 42622920ff..eac5347541 100644 --- a/lapack-netlib/SRC/zlaqsb.c +++ b/lapack-netlib/SRC/zlaqsb.c @@ -650,7 +650,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlaqsb_(char *uplo, integer *n, integer *kd, +/* Subroutine */ void zlaqsb_(char *uplo, integer *n, integer *kd, doublecomplex *ab, integer *ldab, doublereal *s, doublereal *scond, doublereal *amax, char *equed) { @@ -687,7 +687,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. */ @@ -748,7 +748,7 @@ f"> */ *(unsigned char *)equed = 'Y'; } - return 0; + return; /* End of ZLAQSB */ diff --git a/lapack-netlib/SRC/zlaqsp.c b/lapack-netlib/SRC/zlaqsp.c index 7db6a0d61f..22758cefa7 100644 --- a/lapack-netlib/SRC/zlaqsp.c +++ b/lapack-netlib/SRC/zlaqsp.c @@ -636,7 +636,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlaqsp_(char *uplo, integer *n, doublecomplex *ap, +/* Subroutine */ void zlaqsp_(char *uplo, integer *n, doublecomplex *ap, doublereal *s, doublereal *scond, doublereal *amax, char *equed) { /* System generated locals */ @@ -672,7 +672,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. 
*/ @@ -733,7 +733,7 @@ f"> */ *(unsigned char *)equed = 'Y'; } - return 0; + return; /* End of ZLAQSP */ diff --git a/lapack-netlib/SRC/zlaqsy.c b/lapack-netlib/SRC/zlaqsy.c index 6ac767d913..8089d2eb67 100644 --- a/lapack-netlib/SRC/zlaqsy.c +++ b/lapack-netlib/SRC/zlaqsy.c @@ -643,7 +643,7 @@ f"> */ /* > \ingroup complex16SYauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlaqsy_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zlaqsy_(char *uplo, integer *n, doublecomplex *a, integer *lda, doublereal *s, doublereal *scond, doublereal *amax, char *equed) { @@ -680,7 +680,7 @@ f"> */ /* Function Body */ if (*n <= 0) { *(unsigned char *)equed = 'N'; - return 0; + return; } /* Initialize LARGE and SMALL. */ @@ -737,7 +737,7 @@ f"> */ *(unsigned char *)equed = 'Y'; } - return 0; + return; /* End of ZLAQSY */ diff --git a/lapack-netlib/SRC/zlaqz0.f b/lapack-netlib/SRC/zlaqz0.f index 2616f20b5b..3e20200ed4 100644 --- a/lapack-netlib/SRC/zlaqz0.f +++ b/lapack-netlib/SRC/zlaqz0.f @@ -300,7 +300,8 @@ RECURSIVE SUBROUTINE ZLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A, PARAMETER( ZERO = 0.0D0, ONE = 1.0D0, HALF = 0.5D0 ) * Local scalars - DOUBLE PRECISION :: SMLNUM, ULP, SAFMIN, SAFMAX, C1, TEMPR + DOUBLE PRECISION :: SMLNUM, ULP, SAFMIN, SAFMAX, C1, TEMPR, + $ BNORM, BTOL COMPLEX*16 :: ESHIFT, S1, TEMP INTEGER :: ISTART, ISTOP, IITER, MAXIT, ISTART2, K, LD, NSHIFTS, $ NBLOCK, NW, NMIN, NIBBLE, N_UNDEFLATED, N_DEFLATED, @@ -313,7 +314,7 @@ RECURSIVE SUBROUTINE ZLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A, * External Functions EXTERNAL :: XERBLA, ZHGEQZ, ZLAQZ2, ZLAQZ3, ZLASET, DLABAD, $ ZLARTG, ZROT - DOUBLE PRECISION, EXTERNAL :: DLAMCH + DOUBLE PRECISION, EXTERNAL :: DLAMCH, ZLANHS LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -467,6 +468,9 @@ RECURSIVE SUBROUTINE ZLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A, ULP = DLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( DBLE( N )/ULP ) + BNORM = ZLANHS( 
'F', IHI-ILO+1, B( ILO, ILO ), LDB, RWORK ) + BTOL = MAX( SAFMIN, ULP*BNORM ) + ISTART = ILO ISTOP = IHI MAXIT = 30*( IHI-ILO+1 ) @@ -529,15 +533,8 @@ RECURSIVE SUBROUTINE ZLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A, * slow down the method when many infinite eigenvalues are present K = ISTOP DO WHILE ( K.GE.ISTART2 ) - TEMPR = ZERO - IF( K .LT. ISTOP ) THEN - TEMPR = TEMPR+ABS( B( K, K+1 ) ) - END IF - IF( K .GT. ISTART2 ) THEN - TEMPR = TEMPR+ABS( B( K-1, K ) ) - END IF - IF( ABS( B( K, K ) ) .LT. MAX( SMLNUM, ULP*TEMPR ) ) THEN + IF( ABS( B( K, K ) ) .LT. BTOL ) THEN * A diagonal element of B is negligable, move it * to the top and deflate it @@ -649,7 +646,7 @@ RECURSIVE SUBROUTINE ZLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A, NS = MIN( NSHIFTS, ISTOP-ISTART2 ) NS = MIN( NS, N_UNDEFLATED ) - SHIFTPOS = ISTOP-N_DEFLATED-N_UNDEFLATED+1 + SHIFTPOS = ISTOP-N_UNDEFLATED+1 IF ( MOD( LD, 6 ) .EQ. 0 ) THEN * diff --git a/lapack-netlib/SRC/zlar1v.c b/lapack-netlib/SRC/zlar1v.c index 0d13d45b12..fa9c6147a2 100644 --- a/lapack-netlib/SRC/zlar1v.c +++ b/lapack-netlib/SRC/zlar1v.c @@ -738,7 +738,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int zlar1v_(integer *n, integer *b1, integer *bn, doublereal +/* Subroutine */ void zlar1v_(integer *n, integer *b1, integer *bn, doublereal *lambda, doublereal *d__, doublereal *l, doublereal *ld, doublereal * lld, doublereal *pivmin, doublereal *gaptol, doublecomplex *z__, logical *wantnc, integer *negcnt, doublereal *ztz, doublereal *mingma, @@ -1091,7 +1091,7 @@ f"> */ *rqcorr = *mingma * tmp; - return 0; + return; /* End of ZLAR1V */ diff --git a/lapack-netlib/SRC/zlar2v.c b/lapack-netlib/SRC/zlar2v.c index 0b03b91d1b..8654cdde45 100644 --- a/lapack-netlib/SRC/zlar2v.c +++ b/lapack-netlib/SRC/zlar2v.c @@ -621,7 +621,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* 
===================================================================== */ -/* Subroutine */ int zlar2v_(integer *n, doublecomplex *x, doublecomplex *y, +/* Subroutine */ void zlar2v_(integer *n, doublecomplex *x, doublecomplex *y, doublecomplex *z__, integer *incx, doublereal *c__, doublecomplex *s, integer *incc) { @@ -709,7 +709,7 @@ f"> */ ic += *incc; /* L10: */ } - return 0; + return; /* End of ZLAR2V */ diff --git a/lapack-netlib/SRC/zlarcm.c b/lapack-netlib/SRC/zlarcm.c index c2f784dd81..5ce71fdf97 100644 --- a/lapack-netlib/SRC/zlarcm.c +++ b/lapack-netlib/SRC/zlarcm.c @@ -628,7 +628,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlarcm_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void zlarcm_(integer *m, integer *n, doublereal *a, integer * lda, doublecomplex *b, integer *ldb, doublecomplex *c__, integer *ldc, doublereal *rwork) { @@ -640,7 +640,7 @@ f"> */ /* Local variables */ integer i__, j, l; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); @@ -670,7 +670,7 @@ f"> */ /* Function Body */ if (*m == 0 || *n == 0) { - return 0; + return; } i__1 = *n; @@ -725,7 +725,7 @@ f"> */ /* L80: */ } - return 0; + return; /* End of ZLARCM */ diff --git a/lapack-netlib/SRC/zlarf.c b/lapack-netlib/SRC/zlarf.c index 8269b34991..67f8c9756a 100644 --- a/lapack-netlib/SRC/zlarf.c +++ b/lapack-netlib/SRC/zlarf.c @@ -643,7 +643,7 @@ static integer c__1 = 1; /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlarf_(char *side, integer *m, integer *n, doublecomplex +/* Subroutine */ void zlarf_(char *side, integer *m, integer *n, doublecomplex *v, 
integer *incv, doublecomplex *tau, doublecomplex *c__, integer * ldc, doublecomplex *work) { @@ -655,7 +655,7 @@ static integer c__1 = 1; integer i__; extern logical lsame_(char *, char *); integer lastc; - extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgerc_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, @@ -752,7 +752,7 @@ static integer c__1 = 1; c_offset], ldc); } } - return 0; + return; /* End of ZLARF */ diff --git a/lapack-netlib/SRC/zlarfb.c b/lapack-netlib/SRC/zlarfb.c index bb1bfea3b5..14d36bc1bd 100644 --- a/lapack-netlib/SRC/zlarfb.c +++ b/lapack-netlib/SRC/zlarfb.c @@ -710,7 +710,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlarfb_(char *side, char *trans, char *direct, char * +/* Subroutine */ void zlarfb_(char *side, char *trans, char *direct, char * storev, integer *m, integer *n, integer *k, doublecomplex *v, integer *ldv, doublecomplex *t, integer *ldt, doublecomplex *c__, integer * ldc, doublecomplex *work, integer *ldwork) @@ -723,7 +723,7 @@ f"> */ /* Local variables */ integer i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zcopy_(integer *, doublecomplex *, @@ -761,7 +761,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0) { - return 0; + return; } if (lsame_(trans, "N")) { @@ -1382,7 +1382,7 @@ f"> */ } } - return 0; + return; /* End of ZLARFB */ diff --git a/lapack-netlib/SRC/zlarfb_gett.c b/lapack-netlib/SRC/zlarfb_gett.c index 
14f4bd66dc..71f4a749db 100644 --- a/lapack-netlib/SRC/zlarfb_gett.c +++ b/lapack-netlib/SRC/zlarfb_gett.c @@ -903,7 +903,7 @@ gett.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlarfb_gett_(char *ident, integer *m, integer *n, +/* Subroutine */ void zlarfb_gett_(char *ident, integer *m, integer *n, integer *k, doublecomplex *t, integer *ldt, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *work, integer * ldwork) @@ -916,7 +916,7 @@ gett.f"> */ /* Local variables */ integer i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zcopy_(integer *, doublecomplex *, @@ -952,7 +952,7 @@ gett.f"> */ /* Function Body */ if (*m < 0 || *n <= 0 || *k == 0 || *k > *n) { - return 0; + return; } lnotident = ! 
lsame_(ident, "I"); @@ -1149,7 +1149,7 @@ gett.f"> */ } } - return 0; + return; /* End of ZLARFB_GETT */ diff --git a/lapack-netlib/SRC/zlarfg.c b/lapack-netlib/SRC/zlarfg.c index 3e1171350c..9696b3ef4a 100644 --- a/lapack-netlib/SRC/zlarfg.c +++ b/lapack-netlib/SRC/zlarfg.c @@ -619,7 +619,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlarfg_(integer *n, doublecomplex *alpha, doublecomplex * +/* Subroutine */ void zlarfg_(integer *n, doublecomplex *alpha, doublecomplex * x, integer *incx, doublecomplex *tau) { /* System generated locals */ @@ -631,13 +631,13 @@ f"> */ doublereal beta; integer j; doublereal alphi, alphr; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); doublereal xnorm; extern doublereal dlapy3_(doublereal *, doublereal *, doublereal *), dznrm2_(integer *, doublecomplex *, integer *), dlamch_(char *); doublereal safmin; - extern /* Subroutine */ int zdscal_(integer *, doublereal *, + extern /* Subroutine */ void zdscal_(integer *, doublereal *, doublecomplex *, integer *); doublereal rsafmn; extern /* Double Complex */ VOID zladiv_(doublecomplex *, doublecomplex *, @@ -660,7 +660,7 @@ f"> */ /* Function Body */ if (*n <= 0) { tau->r = 0., tau->i = 0.; - return 0; + return; } i__1 = *n - 1; @@ -727,7 +727,7 @@ f"> */ alpha->r = beta, alpha->i = 0.; } - return 0; + return; /* End of ZLARFG */ diff --git a/lapack-netlib/SRC/zlarfgp.c b/lapack-netlib/SRC/zlarfgp.c index e8ae915923..00d3f2042a 100644 --- a/lapack-netlib/SRC/zlarfgp.c +++ b/lapack-netlib/SRC/zlarfgp.c @@ -617,7 +617,7 @@ static doublecomplex c_b5 = {1.,0.}; /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlarfgp_(integer *n, doublecomplex *alpha, doublecomplex +/* Subroutine */ void 
zlarfgp_(integer *n, doublecomplex *alpha, doublecomplex *x, integer *incx, doublecomplex *tau) { /* System generated locals */ @@ -629,14 +629,14 @@ static doublecomplex c_b5 = {1.,0.}; doublereal beta; integer j; doublereal alphi, alphr; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); doublecomplex savealpha; doublereal xnorm; extern doublereal dlapy2_(doublereal *, doublereal *), dlapy3_(doublereal *, doublereal *, doublereal *), dznrm2_(integer *, doublecomplex * , integer *), dlamch_(char *); - extern /* Subroutine */ int zdscal_(integer *, doublereal *, + extern /* Subroutine */ void zdscal_(integer *, doublereal *, doublecomplex *, integer *); doublereal bignum; extern /* Double Complex */ VOID zladiv_(doublecomplex *, doublecomplex *, @@ -660,7 +660,7 @@ static doublecomplex c_b5 = {1.,0.}; /* Function Body */ if (*n <= 0) { tau->r = 0., tau->i = 0.; - return 0; + return; } i__1 = *n - 1; @@ -817,7 +817,7 @@ static doublecomplex c_b5 = {1.,0.}; alpha->r = beta, alpha->i = 0.; } - return 0; + return; /* End of ZLARFGP */ diff --git a/lapack-netlib/SRC/zlarft.c b/lapack-netlib/SRC/zlarft.c index e6a90565a4..f24e62d380 100644 --- a/lapack-netlib/SRC/zlarft.c +++ b/lapack-netlib/SRC/zlarft.c @@ -677,7 +677,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlarft_(char *direct, char *storev, integer *n, integer * +/* Subroutine */ void zlarft_(char *direct, char *storev, integer *n, integer * k, doublecomplex *v, integer *ldv, doublecomplex *tau, doublecomplex * t, integer *ldt) { @@ -688,17 +688,17 @@ f"> */ /* Local variables */ integer i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, 
doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer lastv; - extern /* Subroutine */ int ztrmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ztrmv_(char *, char *, char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer prevlastv; - extern /* Subroutine */ int mecago_(); + extern /* Subroutine */ void mecago_(); /* -- LAPACK auxiliary routine (version 3.7.0) -- */ @@ -723,7 +723,7 @@ f"> */ /* Function Body */ if (*n == 0) { - return 0; + return; } if (lsame_(direct, "F")) { @@ -925,7 +925,7 @@ f"> */ } } } - return 0; + return; /* End of ZLARFT */ diff --git a/lapack-netlib/SRC/zlarfx.c b/lapack-netlib/SRC/zlarfx.c index 84944c44f9..782699a663 100644 --- a/lapack-netlib/SRC/zlarfx.c +++ b/lapack-netlib/SRC/zlarfx.c @@ -633,7 +633,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlarfx_(char *side, integer *m, integer *n, +/* Subroutine */ void zlarfx_(char *side, integer *m, integer *n, doublecomplex *v, doublecomplex *tau, doublecomplex *c__, integer * ldc, doublecomplex *work) { @@ -646,7 +646,7 @@ f"> */ /* Local variables */ integer j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zlarf_(char *, integer *, integer *, + extern /* Subroutine */ void zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); doublecomplex t1, t2, t3, t4, t5, t6, t7, t8, t9, v1, v2, v3, v4, v5, v6, @@ -671,7 +671,7 @@ f"> */ /* Function Body */ if (tau->r == 0. && tau->i == 0.) 
{ - return 0; + return; } if (lsame_(side, "L")) { @@ -2595,7 +2595,7 @@ f"> */ goto L410; } L410: - return 0; + return; /* End of ZLARFX */ diff --git a/lapack-netlib/SRC/zlarfy.c b/lapack-netlib/SRC/zlarfy.c index 14675aa2ec..e5df36f34b 100644 --- a/lapack-netlib/SRC/zlarfy.c +++ b/lapack-netlib/SRC/zlarfy.c @@ -620,7 +620,7 @@ static integer c__1 = 1; /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlarfy_(char *uplo, integer *n, doublecomplex *v, +/* Subroutine */ void zlarfy_(char *uplo, integer *n, doublecomplex *v, integer *incv, doublecomplex *tau, doublecomplex *c__, integer *ldc, doublecomplex *work) { @@ -629,13 +629,13 @@ static integer c__1 = 1; doublecomplex z__1, z__2, z__3, z__4; /* Local variables */ - extern /* Subroutine */ int zher2_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zher2_(char *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex alpha; extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zhemv_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhemv_(char *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zaxpy_( integer *, doublecomplex *, doublecomplex *, integer *, @@ -660,7 +660,7 @@ static integer c__1 = 1; /* Function Body */ if (tau->r == 0. && tau->i == 0.) 
{ - return 0; + return; } /* Form w:= C * v */ @@ -682,7 +682,7 @@ static integer c__1 = 1; z__1.r = -tau->r, z__1.i = -tau->i; zher2_(uplo, n, &z__1, &v[1], incv, &work[1], &c__1, &c__[c_offset], ldc); - return 0; + return; /* End of ZLARFY */ diff --git a/lapack-netlib/SRC/zlargv.c b/lapack-netlib/SRC/zlargv.c index ac437252da..2fc7080e2f 100644 --- a/lapack-netlib/SRC/zlargv.c +++ b/lapack-netlib/SRC/zlargv.c @@ -631,7 +631,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlargv_(integer *n, doublecomplex *x, integer *incx, +/* Subroutine */ void zlargv_(integer *n, doublecomplex *x, integer *incx, doublecomplex *y, integer *incy, doublereal *c__, integer *incc) { /* System generated locals */ @@ -870,7 +870,7 @@ f"> */ ix += *incx; /* L60: */ } - return 0; + return; /* End of ZLARGV */ diff --git a/lapack-netlib/SRC/zlarnv.c b/lapack-netlib/SRC/zlarnv.c index 3da6ebf742..9b7d4b173d 100644 --- a/lapack-netlib/SRC/zlarnv.c +++ b/lapack-netlib/SRC/zlarnv.c @@ -608,7 +608,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlarnv_(integer *idist, integer *iseed, integer *n, +/* Subroutine */ void zlarnv_(integer *idist, integer *iseed, integer *n, doublecomplex *x) { /* System generated locals */ @@ -620,7 +620,7 @@ f"> */ integer i__; doublereal u[128]; integer il, iv; - extern /* Subroutine */ int dlaruv_(integer *, integer *, doublereal *); + extern /* Subroutine */ void dlaruv_(integer *, integer *, doublereal *); /* -- LAPACK auxiliary routine (version 3.7.0) -- */ @@ -723,7 +723,7 @@ f"> */ } /* L60: */ } - return 0; + return; /* End of ZLARNV */ diff --git a/lapack-netlib/SRC/zlarrv.c b/lapack-netlib/SRC/zlarrv.c index 7960a3bbdb..8dda3d097c 100644 --- a/lapack-netlib/SRC/zlarrv.c +++ b/lapack-netlib/SRC/zlarrv.c @@ -799,7 +799,7 @@ f"> */ /* > Christof Voemel, University of 
California, Berkeley, USA */ /* ===================================================================== */ -/* Subroutine */ int zlarrv_(integer *n, doublereal *vl, doublereal *vu, +/* Subroutine */ void zlarrv_(integer *n, doublereal *vl, doublereal *vu, doublereal *d__, doublereal *l, doublereal *pivmin, integer *isplit, integer *m, integer *dol, integer *dou, doublereal *minrgp, doublereal *rtol1, doublereal *rtol2, doublereal *w, doublereal *werr, @@ -828,13 +828,13 @@ f"> */ doublereal resid; logical eskip; doublereal right; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer nclus, zfrom; doublereal rqtol; integer iindc1, iindc2, indin1, indin2, miniwsize; logical stp2ii; - extern /* Subroutine */ int zlar1v_(integer *, integer *, integer *, + extern /* Subroutine */ void zlar1v_(integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublecomplex *, logical *, integer *, doublereal *, doublereal *, integer *, @@ -850,7 +850,7 @@ f"> */ logical needbs; integer indlld; doublereal sgndef, mingma; - extern /* Subroutine */ int dlarrb_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlarrb_(integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *, integer *); @@ -860,7 +860,7 @@ f"> */ doublereal savgap; integer ndepth; doublereal ssigma; - extern /* Subroutine */ int dlarrf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlarrf_(integer *, doublereal *, doublereal *, doublereal *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, @@ -868,7 +868,7 @@ f"> */ logical 
usedbs; integer iindwk, offset; doublereal gaptol; - extern /* Subroutine */ int zdscal_(integer *, doublereal *, + extern /* Subroutine */ void zdscal_(integer *, doublereal *, doublecomplex *, integer *); integer newcls, oldfst, indwrk, windex, oldlst; logical usedrq; @@ -879,7 +879,7 @@ f"> */ logical tryrqc; integer isupmx; doublereal rqcorr; - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); doublereal gap, eps, tau, tol, tmp; integer zto; @@ -917,7 +917,7 @@ f"> */ /* Quick return if possible */ if (*n <= 0 || *m <= 0) { - return 0; + return; } /* The first N entries of WORK are reserved for the eigenvalues */ @@ -1070,7 +1070,7 @@ f"> */ /* This is a crude protection against infinitely deep trees */ if (ndepth > *m) { *info = -2; - return 0; + return; } /* breadth first processing of the current level of the representation */ /* tree: OLDNCL = number of clusters on current level */ @@ -1156,7 +1156,7 @@ f"> */ iindwk], pivmin, &spdiam, &in, &iinfo); if (iinfo != 0) { *info = -1; - return 0; + return; } /* We also recompute the extremal gaps. 
W holds all eigenvalues */ /* of the unshifted matrix and must be used for computation */ @@ -1343,7 +1343,7 @@ f"> */ iwork[k] = newlst; } else { *info = -2; - return 0; + return; } } else { @@ -1447,7 +1447,7 @@ f"> */ iindwk], pivmin, &spdiam, &itmp1, &iinfo); if (iinfo != 0) { *info = -3; - return 0; + return; } lambda = work[windex]; /* Reset twist index from inaccurate LAMBDA to */ @@ -1542,7 +1542,7 @@ f"> */ goto L120; } else { *info = 5; - return 0; + return; } } else { stp2ii = FALSE_; @@ -1637,7 +1637,7 @@ f"> */ ; } - return 0; + return; /* End of ZLARRV */ diff --git a/lapack-netlib/SRC/zlarscl2.c b/lapack-netlib/SRC/zlarscl2.c index 1f7be77b28..2493468e78 100644 --- a/lapack-netlib/SRC/zlarscl2.c +++ b/lapack-netlib/SRC/zlarscl2.c @@ -600,7 +600,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zlarscl2_(integer *m, integer *n, doublereal *d__, +/* Subroutine */ void zlarscl2_(integer *m, integer *n, doublereal *d__, doublecomplex *x, integer *ldx) { /* System generated locals */ @@ -638,6 +638,6 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ x[i__3].r = z__1.r, x[i__3].i = z__1.i; } } - return 0; + return; } /* zlarscl2_ */ diff --git a/lapack-netlib/SRC/zlarscl2.f b/lapack-netlib/SRC/zlarscl2.f index 4a1e1603a4..e618659067 100644 --- a/lapack-netlib/SRC/zlarscl2.f +++ b/lapack-netlib/SRC/zlarscl2.f @@ -1,4 +1,4 @@ -*> \brief \b ZLARSCL2 performs reciprocal diagonal scaling on a vector. +*> \brief \b ZLARSCL2 performs reciprocal diagonal scaling on a matrix. 
* * =========== DOCUMENTATION =========== * @@ -34,7 +34,7 @@ *> *> \verbatim *> -*> ZLARSCL2 performs a reciprocal diagonal scaling on an vector: +*> ZLARSCL2 performs a reciprocal diagonal scaling on a matrix: *> x <-- inv(D) * x *> where the DOUBLE PRECISION diagonal matrix D is stored as a vector. *> @@ -66,14 +66,14 @@ *> \param[in,out] X *> \verbatim *> X is COMPLEX*16 array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zlartg.c b/lapack-netlib/SRC/zlartg.c index b0b5621d19..f8f7260258 100644 --- a/lapack-netlib/SRC/zlartg.c +++ b/lapack-netlib/SRC/zlartg.c @@ -614,7 +614,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlartg_(doublecomplex *f, doublecomplex *g, doublereal * +/* Subroutine */ void zlartg_(doublecomplex *f, doublecomplex *g, doublereal * cs, doublecomplex *sn, doublecomplex *r__) { /* System generated locals */ @@ -681,7 +681,7 @@ f"> */ *cs = 1.; sn->r = 0., sn->i = 0.; r__->r = f->r, r__->i = f->i; - return 0; + return; } L20: --count; @@ -722,7 +722,7 @@ f"> */ d__2 = -d_imag(&gs) / d__; z__1.r = d__1, z__1.i = d__2; sn->r = z__1.r, sn->i = z__1.i; - return 0; + return; } d__1 = fs.r; d__2 = d_imag(&fs); @@ -811,7 +811,7 @@ f"> */ } } } - return 0; + return; /* End of ZLARTG */ diff --git a/lapack-netlib/SRC/zlartg.f90 b/lapack-netlib/SRC/zlartg.f90 index 337a4dda85..a4f9bd4b00 100644 --- a/lapack-netlib/SRC/zlartg.f90 +++ b/lapack-netlib/SRC/zlartg.f90 @@ -11,8 +11,8 @@ ! SUBROUTINE ZLARTG( F, G, C, S, R ) ! ! .. Scalar Arguments .. -! REAL(wp) C -! COMPLEX(wp) F, G, R, S +! REAL(wp) C +! 
COMPLEX(wp) F, G, R, S ! .. ! !> \par Purpose: @@ -30,7 +30,7 @@ !> The mathematical formulas used for C and S are !> !> sgn(x) = { x / |x|, x != 0 -!> { 1, x = 0 +!> { 1, x = 0 !> !> R = sgn(F) * sqrt(|F|**2 + |G|**2) !> @@ -38,6 +38,10 @@ !> !> S = sgn(F) * conjg(G) / sqrt(|F|**2 + |G|**2) !> +!> Special conditions: +!> If G=0, then C=1 and S=0. +!> If F=0, then C=0 and S is chosen so that R is real. +!> !> When F and G are real, the formulas simplify to C = F/R and !> S = G/R, and the returned values of C, S, and R should be !> identical to those returned by DLARTG. @@ -46,11 +50,8 @@ !> to avoid overflow or underflow in computing the square root of the !> sum of squares. !> -!> This is a faster version of the BLAS1 routine ZROTG, except for -!> the following differences: -!> F and G are unchanged on return. -!> If G=0, then C=1 and S=0. -!> If F=0, then C=0 and S is chosen so that R is real. +!> This is the same routine ZROTG fom BLAS1, except that +!> F and G are unchanged on return. !> !> Below, wp=>dp stands for double precision from LA_CONSTANTS module. !> \endverbatim @@ -91,22 +92,19 @@ ! Authors: ! ======== ! -!> \author Edward Anderson, Lockheed Martin +!> \author Weslley Pereira, University of Colorado Denver, USA ! -!> \date August 2016 +!> \date December 2021 ! !> \ingroup OTHERauxiliary ! -!> \par Contributors: -! ================== -!> -!> Weslley Pereira, University of Colorado Denver, USA -! !> \par Further Details: ! ===================== !> !> \verbatim !> +!> Based on the algorithm from +!> !> Anderson E. (2017) !> Algorithm 978: Safe Scaling in the Level 1 BLAS !> ACM Trans Math Softw 44:1--28 @@ -117,7 +115,7 @@ subroutine ZLARTG( f, g, c, s, r ) use LA_CONSTANTS, & only: wp=>dp, zero=>dzero, one=>done, two=>dtwo, czero=>zzero, & - rtmin=>drtmin, rtmax=>drtmax, safmin=>dsafmin, safmax=>dsafmax + safmin=>dsafmin, safmax=>dsafmax ! ! -- LAPACK auxiliary routine -- ! -- LAPACK is a software package provided by Univ. 
of Tennessee, -- @@ -129,7 +127,7 @@ subroutine ZLARTG( f, g, c, s, r ) complex(wp) f, g, r, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, f2, g1, g2, h2, p, u, uu, v, vv, w + real(wp) :: d, f1, f2, g1, g2, h2, u, v, w, rtmin, rtmax complex(wp) :: fs, gs, t ! .. ! .. Intrinsic Functions .. @@ -141,6 +139,9 @@ subroutine ZLARTG( f, g, c, s, r ) ! .. Statement Function definitions .. ABSSQ( t ) = real( t )**2 + aimag( t )**2 ! .. +! .. Constants .. + rtmin = sqrt( safmin ) +! .. ! .. Executable Statements .. ! if( g == czero ) then @@ -149,30 +150,43 @@ subroutine ZLARTG( f, g, c, s, r ) r = f else if( f == czero ) then c = zero - g1 = max( abs(real(g)), abs(aimag(g)) ) - if( g1 > rtmin .and. g1 < rtmax ) then + if( real(g) == zero ) then + r = abs(aimag(g)) + s = conjg( g ) / r + elseif( aimag(g) == zero ) then + r = abs(real(g)) + s = conjg( g ) / r + else + g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/2 ) + if( g1 > rtmin .and. g1 < rtmax ) then ! ! Use unscaled algorithm ! - g2 = ABSSQ( g ) - d = sqrt( g2 ) - s = conjg( g ) / d - r = d - else +! The following two lines can be replaced by `d = abs( g )`. +! This algorithm do not use the intrinsic complex abs. + g2 = ABSSQ( g ) + d = sqrt( g2 ) + s = conjg( g ) / d + r = d + else ! ! Use scaled algorithm ! - u = min( safmax, max( safmin, g1 ) ) - uu = one / u - gs = g*uu - g2 = ABSSQ( gs ) - d = sqrt( g2 ) - s = conjg( gs ) / d - r = d*u + u = min( safmax, max( safmin, g1 ) ) + gs = g / u +! The following two lines can be replaced by `d = abs( gs )`. +! This algorithm do not use the intrinsic complex abs. + g2 = ABSSQ( gs ) + d = sqrt( g2 ) + s = conjg( gs ) / d + r = d*u + end if end if else f1 = max( abs(real(f)), abs(aimag(f)) ) g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/4 ) if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. g1 < rtmax ) then ! 
@@ -181,32 +195,51 @@ subroutine ZLARTG( f, g, c, s, r ) f2 = ABSSQ( f ) g2 = ABSSQ( g ) h2 = f2 + g2 - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = f / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( g ) * ( f / sqrt( f2*h2 ) ) + else + s = conjg( g ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. + d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = f / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = f * ( h2 / d ) + end if + s = conjg( g ) * ( f / d ) end if - p = 1 / d - c = f2*p - s = conjg( g )*( f*p ) - r = f*( h2*p ) else ! ! Use scaled algorithm ! u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - gs = g*uu + gs = g / u g2 = ABSSQ( gs ) - if( f1*uu < rtmin ) then + if( f1 / u < rtmin ) then ! ! f is not well-scaled when scaled by g1. ! Use a different scaling for f. ! v = min( safmax, max( safmin, f1 ) ) - vv = one / v - w = v * uu - fs = f*vv + w = v / u + fs = f / v f2 = ABSSQ( fs ) h2 = f2*w**2 + g2 else @@ -214,19 +247,43 @@ subroutine ZLARTG( f, g, c, s, r ) ! Otherwise use the same scaling for f and g. ! w = one - fs = f*uu + fs = f / u f2 = ABSSQ( fs ) h2 = f2 + g2 end if - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = fs / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! 
safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( gs ) * ( fs / sqrt( f2*h2 ) ) + else + s = conjg( gs ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. + d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = fs / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = fs * ( h2 / d ) + end if + s = conjg( gs ) * ( fs / d ) end if - p = 1 / d - c = ( f2*p )*w - s = conjg( gs )*( fs*p ) - r = ( fs*( h2*p ) )*u + ! Rescale c and r + c = c * w + r = r * u end if end if return diff --git a/lapack-netlib/SRC/zlartv.c b/lapack-netlib/SRC/zlartv.c index 850e0934fe..b578b8bf74 100644 --- a/lapack-netlib/SRC/zlartv.c +++ b/lapack-netlib/SRC/zlartv.c @@ -617,7 +617,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlartv_(integer *n, doublecomplex *x, integer *incx, +/* Subroutine */ void zlartv_(integer *n, doublecomplex *x, integer *incx, doublecomplex *y, integer *incy, doublereal *c__, doublecomplex *s, integer *incc) { @@ -676,7 +676,7 @@ f"> */ ic += *incc; /* L10: */ } - return 0; + return; /* End of ZLARTV */ diff --git a/lapack-netlib/SRC/zlarz.c b/lapack-netlib/SRC/zlarz.c index 8057204459..e4a4ea2541 100644 --- a/lapack-netlib/SRC/zlarz.c +++ b/lapack-netlib/SRC/zlarz.c @@ -661,7 +661,7 @@ static integer c__1 = 1; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlarz_(char *side, integer *m, integer *n, integer *l, +/* Subroutine */ void zlarz_(char *side, integer *m, integer *n, integer *l, doublecomplex *v, integer *incv, doublecomplex 
*tau, doublecomplex * c__, integer *ldc, doublecomplex *work) { @@ -671,7 +671,7 @@ static integer c__1 = 1; /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgerc_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, @@ -762,7 +762,7 @@ static integer c__1 = 1; } - return 0; + return; /* End of ZLARZ */ diff --git a/lapack-netlib/SRC/zlarzb.c b/lapack-netlib/SRC/zlarzb.c index fbfaad6d39..5416b5e348 100644 --- a/lapack-netlib/SRC/zlarzb.c +++ b/lapack-netlib/SRC/zlarzb.c @@ -696,7 +696,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlarzb_(char *side, char *trans, char *direct, char * +/* Subroutine */ void zlarzb_(char *side, char *trans, char *direct, char * storev, integer *m, integer *n, integer *k, integer *l, doublecomplex *v, integer *ldv, doublecomplex *t, integer *ldt, doublecomplex *c__, integer *ldc, doublecomplex *work, integer *ldwork) @@ -709,14 +709,15 @@ f"> */ /* Local variables */ integer info, i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), ztrmm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *), xerbla_(char *, integer *, ftnlen), - zlacgv_(integer *, doublecomplex *, integer *); + doublecomplex *, integer *, doublecomplex *, integer *); + 
extern int xerbla_(char *, integer *, ftnlen); + extern void zlacgv_(integer *, doublecomplex *, integer *); char transt[1]; @@ -747,7 +748,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0) { - return 0; + return; } /* Check for currently supported options */ @@ -761,7 +762,7 @@ f"> */ if (info != 0) { i__1 = -info; xerbla_("ZLARZB", &i__1, (ftnlen)6); - return 0; + return; } if (lsame_(trans, "N")) { @@ -902,7 +903,7 @@ f"> */ } - return 0; + return; /* End of ZLARZB */ diff --git a/lapack-netlib/SRC/zlarzt.c b/lapack-netlib/SRC/zlarzt.c index 80533ff39d..036c7209ed 100644 --- a/lapack-netlib/SRC/zlarzt.c +++ b/lapack-netlib/SRC/zlarzt.c @@ -699,7 +699,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlarzt_(char *direct, char *storev, integer *n, integer * +/* Subroutine */ void zlarzt_(char *direct, char *storev, integer *n, integer * k, doublecomplex *v, integer *ldv, doublecomplex *tau, doublecomplex * t, integer *ldt) { @@ -710,13 +710,13 @@ f"> */ /* Local variables */ integer info, i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), ztrmv_(char *, char *, char *, integer *, doublecomplex *, - integer *, doublecomplex *, integer *), - xerbla_(char *, integer *, ftnlen), zlacgv_(integer *, - doublecomplex *, integer *); + integer *, doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zlacgv_(integer *, doublecomplex *, integer *); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -749,7 +749,7 @@ f"> */ if (info != 0) { i__1 = -info; xerbla_("ZLARZT", &i__1, (ftnlen)6); - return 0; + return; } for (i__ = *k; i__ >= 1; --i__) { @@ -794,7 +794,7 @@ f"> */ } /* L20: */ } - 
return 0; + return; /* End of ZLARZT */ diff --git a/lapack-netlib/SRC/zlascl.c b/lapack-netlib/SRC/zlascl.c index 4845cdb955..73b2f29ffa 100644 --- a/lapack-netlib/SRC/zlascl.c +++ b/lapack-netlib/SRC/zlascl.c @@ -652,7 +652,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlascl_(char *type__, integer *kl, integer *ku, +/* Subroutine */ void zlascl_(char *type__, integer *kl, integer *ku, doublereal *cfrom, doublereal *cto, integer *m, integer *n, doublecomplex *a, integer *lda, integer *info) { @@ -744,13 +744,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZLASCL", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *m == 0) { - return 0; + return; } /* Get machine parameters */ @@ -934,7 +934,7 @@ f"> */ goto L10; } - return 0; + return; /* End of ZLASCL */ diff --git a/lapack-netlib/SRC/zlascl.f b/lapack-netlib/SRC/zlascl.f index 3d53f5ae60..4cce5ff5e0 100644 --- a/lapack-netlib/SRC/zlascl.f +++ b/lapack-netlib/SRC/zlascl.f @@ -272,6 +272,8 @@ SUBROUTINE ZLASCL( TYPE, KL, KU, CFROM, CTO, M, N, A, LDA, INFO ) ELSE MUL = CTOC / CFROMC DONE = .TRUE. + IF (MUL .EQ. 
ONE) + $ RETURN END IF END IF * diff --git a/lapack-netlib/SRC/zlascl2.c b/lapack-netlib/SRC/zlascl2.c index 8c095dca00..076bab027c 100644 --- a/lapack-netlib/SRC/zlascl2.c +++ b/lapack-netlib/SRC/zlascl2.c @@ -600,7 +600,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zlascl2_(integer *m, integer *n, doublereal *d__, +/* Subroutine */ void zlascl2_(integer *m, integer *n, doublereal *d__, doublecomplex *x, integer *ldx) { /* System generated locals */ @@ -638,6 +638,6 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ x[i__3].r = z__1.r, x[i__3].i = z__1.i; } } - return 0; + return; } /* zlascl2_ */ diff --git a/lapack-netlib/SRC/zlascl2.f b/lapack-netlib/SRC/zlascl2.f index c4e6992fbe..26406c3636 100644 --- a/lapack-netlib/SRC/zlascl2.f +++ b/lapack-netlib/SRC/zlascl2.f @@ -1,4 +1,4 @@ -*> \brief \b ZLASCL2 performs diagonal scaling on a vector. +*> \brief \b ZLASCL2 performs diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -34,7 +34,7 @@ *> *> \verbatim *> -*> ZLASCL2 performs a diagonal scaling on a vector: +*> ZLASCL2 performs a diagonal scaling on a matrix: *> x <-- D * x *> where the DOUBLE PRECISION diagonal matrix D is stored as a vector. *> @@ -66,14 +66,14 @@ *> \param[in,out] X *> \verbatim *> X is COMPLEX*16 array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. 
*> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zlaset.c b/lapack-netlib/SRC/zlaset.c index eabfe2a796..9e3f21d728 100644 --- a/lapack-netlib/SRC/zlaset.c +++ b/lapack-netlib/SRC/zlaset.c @@ -616,7 +616,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlaset_(char *uplo, integer *m, integer *n, +/* Subroutine */ void zlaset_(char *uplo, integer *m, integer *n, doublecomplex *alpha, doublecomplex *beta, doublecomplex *a, integer * lda) { @@ -712,7 +712,7 @@ f"> */ } } - return 0; + return; /* End of ZLASET */ diff --git a/lapack-netlib/SRC/zlasr.c b/lapack-netlib/SRC/zlasr.c index c1d8fc5b43..983f7b3d3f 100644 --- a/lapack-netlib/SRC/zlasr.c +++ b/lapack-netlib/SRC/zlasr.c @@ -709,7 +709,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlasr_(char *side, char *pivot, char *direct, integer *m, +/* Subroutine */ void zlasr_(char *side, char *pivot, char *direct, integer *m, integer *n, doublereal *c__, doublereal *s, doublecomplex *a, integer *lda) { @@ -763,13 +763,13 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } if (info != 0) { xerbla_("ZLASR ", &info, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } if (lsame_(side, "L")) { @@ -1161,7 +1161,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } - return 0; + return; /* End of ZLASR */ diff --git a/lapack-netlib/SRC/zlassq.c b/lapack-netlib/SRC/zlassq.c index 480673c162..5e9fade410 100644 --- a/lapack-netlib/SRC/zlassq.c +++ b/lapack-netlib/SRC/zlassq.c @@ -615,7 +615,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* 
===================================================================== */ -/* Subroutine */ int zlassq_(integer *n, doublecomplex *x, integer *incx, +/* Subroutine */ void zlassq_(integer *n, doublecomplex *x, integer *incx, doublereal *scale, doublereal *sumsq) { /* System generated locals */ @@ -676,7 +676,7 @@ f"> */ } } - return 0; + return; /* End of ZLASSQ */ diff --git a/lapack-netlib/SRC/zlaswlq.c b/lapack-netlib/SRC/zlaswlq.c index 14cebe5bfc..aa46bfa292 100644 --- a/lapack-netlib/SRC/zlaswlq.c +++ b/lapack-netlib/SRC/zlaswlq.c @@ -671,7 +671,7 @@ static integer c__0 = 0; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlaswlq_(integer *m, integer *n, integer *mb, integer * +/* Subroutine */ void zlaswlq_(integer *m, integer *n, integer *mb, integer * nb, doublecomplex *a, integer *lda, doublecomplex *t, integer *ldt, doublecomplex *work, integer *lwork, integer *info) { @@ -680,11 +680,12 @@ static integer c__0 = 0; /* Local variables */ integer i__, ii, kk; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zgelqt_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zgelqt_( integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); logical lquery; - extern /* Subroutine */ int ztplqt_(integer *, integer *, integer *, + extern /* Subroutine */ void ztplqt_(integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer ctr; @@ -738,15 +739,15 @@ static integer c__0 = 0; if (*info != 0) { i__1 = -(*info); xerbla_("ZLASWLQ", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* The LQ Decomposition */ @@ -754,7 +755,7 @@ static integer c__0 = 0; if (*m >= *n || *nb 
<= *m || *nb >= *n) { zgelqt_(m, n, mb, &a[a_offset], lda, &t[t_offset], ldt, &work[1], info); - return 0; + return; } kk = (*n - *m) % (*nb - *m); @@ -787,7 +788,7 @@ static integer c__0 = 0; i__2 = *m * *mb; work[1].r = (doublereal) i__2, work[1].i = 0.; - return 0; + return; /* End of ZLASWLQ */ diff --git a/lapack-netlib/SRC/zlaswp.c b/lapack-netlib/SRC/zlaswp.c index cc1aab79a6..84616be2b0 100644 --- a/lapack-netlib/SRC/zlaswp.c +++ b/lapack-netlib/SRC/zlaswp.c @@ -624,7 +624,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlaswp_(integer *n, doublecomplex *a, integer *lda, +/* Subroutine */ void zlaswp_(integer *n, doublecomplex *a, integer *lda, integer *k1, integer *k2, integer *ipiv, integer *incx) { /* System generated locals */ @@ -665,7 +665,7 @@ f"> */ i2 = *k1; inc = -1; } else { - return 0; + return; } n32 = *n / 32 << 5; @@ -722,7 +722,7 @@ f"> */ } } - return 0; + return; /* End of ZLASWP */ diff --git a/lapack-netlib/SRC/zlasyf.c b/lapack-netlib/SRC/zlasyf.c index e414864be0..f52e18b6a2 100644 --- a/lapack-netlib/SRC/zlasyf.c +++ b/lapack-netlib/SRC/zlasyf.c @@ -692,7 +692,7 @@ f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zlasyf_(char *uplo, integer *n, integer *nb, integer *kb, +/* Subroutine */ void zlasyf_(char *uplo, integer *n, integer *nb, integer *kb, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *w, integer *ldw, integer *info) { @@ -706,17 +706,17 @@ f"> */ doublecomplex t; doublereal alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer 
*); integer kstep; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); doublecomplex r1; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex d11, d21, d22; @@ -1538,7 +1538,7 @@ f"> */ *kb = k - 1; } - return 0; + return; /* End of ZLASYF */ diff --git a/lapack-netlib/SRC/zlasyf_aa.c b/lapack-netlib/SRC/zlasyf_aa.c index d9b7b267a0..6a769f996f 100644 --- a/lapack-netlib/SRC/zlasyf_aa.c +++ b/lapack-netlib/SRC/zlasyf_aa.c @@ -659,7 +659,7 @@ aa.f"> */ /* > \ingroup complex16SYcomputational */ /* ===================================================================== */ -/* Subroutine */ int zlasyf_aa_(char *uplo, integer *j1, integer *m, integer +/* Subroutine */ void zlasyf_aa_(char *uplo, integer *j1, integer *m, integer *nb, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex * h__, integer *ldh, doublecomplex *work) { @@ -671,19 +671,19 @@ aa.f"> */ integer j, k; doublecomplex alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer i1, k1, i2; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, 
doublecomplex *, integer *, doublecomplex *, integer *); integer mj; extern integer izamax_(integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); doublecomplex piv; @@ -1078,7 +1078,7 @@ aa.f"> */ L40: ; } - return 0; + return; /* End of ZLASYF_AA */ diff --git a/lapack-netlib/SRC/zlasyf_rk.c b/lapack-netlib/SRC/zlasyf_rk.c index f12fa9c9a0..a8690e3b09 100644 --- a/lapack-netlib/SRC/zlasyf_rk.c +++ b/lapack-netlib/SRC/zlasyf_rk.c @@ -776,7 +776,7 @@ rk.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zlasyf_rk_(char *uplo, integer *n, integer *nb, integer +/* Subroutine */ void zlasyf_rk_(char *uplo, integer *n, integer *nb, integer *kb, doublecomplex *a, integer *lda, doublecomplex *e, integer *ipiv, doublecomplex *w, integer *ldw, integer *info) { @@ -792,19 +792,19 @@ rk.f"> */ doublereal alpha; extern logical lsame_(char *, char *); doublereal dtemp, sfmin; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); integer itemp; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer kstep; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); doublecomplex r1; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void 
zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex d11, d12, d21, d22; @@ -1713,7 +1713,7 @@ rk.f"> */ } - return 0; + return; /* End of ZLASYF_RK */ diff --git a/lapack-netlib/SRC/zlasyf_rook.c b/lapack-netlib/SRC/zlasyf_rook.c index 9abebc804a..09ff4c3040 100644 --- a/lapack-netlib/SRC/zlasyf_rook.c +++ b/lapack-netlib/SRC/zlasyf_rook.c @@ -698,7 +698,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zlasyf_rook_(char *uplo, integer *n, integer *nb, +/* Subroutine */ void zlasyf_rook_(char *uplo, integer *n, integer *nb, integer *kb, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *w, integer *ldw, integer *info) { @@ -714,19 +714,19 @@ rook.f"> */ doublereal alpha; extern logical lsame_(char *, char *); doublereal dtemp, sfmin; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); integer itemp; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer kstep; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); doublecomplex r1; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex d11, d12, d21, d22; @@ -1638,7 
+1638,7 @@ rook.f"> */ *kb = k - 1; } - return 0; + return; /* End of ZLASYF_ROOK */ diff --git a/lapack-netlib/SRC/zlat2c.c b/lapack-netlib/SRC/zlat2c.c index dd70c7e6fd..a11ce7f970 100644 --- a/lapack-netlib/SRC/zlat2c.c +++ b/lapack-netlib/SRC/zlat2c.c @@ -620,7 +620,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlat2c_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zlat2c_(char *uplo, integer *n, doublecomplex *a, integer *lda, complex *sa, integer *ldsa, integer *info) { /* System generated locals */ @@ -697,7 +697,7 @@ f"> */ } L50: - return 0; + return; /* End of ZLAT2C */ diff --git a/lapack-netlib/SRC/zlat2c.f b/lapack-netlib/SRC/zlat2c.f index 1d607dcea1..a413b05c14 100644 --- a/lapack-netlib/SRC/zlat2c.f +++ b/lapack-netlib/SRC/zlat2c.f @@ -130,7 +130,7 @@ SUBROUTINE ZLAT2C( UPLO, N, A, LDA, SA, LDSA, INFO ) LOGICAL UPPER * .. * .. Intrinsic Functions .. - INTRINSIC DBLE, DIMAG + INTRINSIC DBLE, DIMAG, CMPLX * .. * .. External Functions .. 
REAL SLAMCH @@ -151,7 +151,7 @@ SUBROUTINE ZLAT2C( UPLO, N, A, LDA, SA, LDSA, INFO ) INFO = 1 GO TO 50 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = CMPLX( A( I, J ) ) 10 CONTINUE 20 CONTINUE ELSE @@ -164,7 +164,7 @@ SUBROUTINE ZLAT2C( UPLO, N, A, LDA, SA, LDSA, INFO ) INFO = 1 GO TO 50 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = CMPLX( A( I, J ) ) 30 CONTINUE 40 CONTINUE END IF diff --git a/lapack-netlib/SRC/zlatbs.c b/lapack-netlib/SRC/zlatbs.c index 04925e90c1..f862c1488d 100644 --- a/lapack-netlib/SRC/zlatbs.c +++ b/lapack-netlib/SRC/zlatbs.c @@ -756,7 +756,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlatbs_(char *uplo, char *trans, char *diag, char * +/* Subroutine */ void zlatbs_(char *uplo, char *trans, char *diag, char * normin, integer *n, integer *kd, doublecomplex *ab, integer *ldab, doublecomplex *x, doublereal *scale, doublereal *cnorm, integer *info) { @@ -773,7 +773,7 @@ f"> */ doublecomplex tjjs; doublereal xmax, grow; integer i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); integer maind; extern logical lsame_(char *, char *); @@ -786,14 +786,15 @@ f"> */ logical upper; extern /* Double Complex */ VOID zdotu_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); - extern /* Subroutine */ int ztbsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ztbsv_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), dlabad_( doublereal *, doublereal *); extern doublereal dlamch_(char *); doublereal xj; extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* 
Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); doublereal bignum; extern integer izamax_(integer *, doublecomplex *, integer *); @@ -851,13 +852,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZLATBS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine machine dependent parameters to control overflow. */ @@ -1752,7 +1753,7 @@ f"> */ dscal_(n, &d__1, &cnorm[1], &c__1); } - return 0; + return; /* End of ZLATBS */ diff --git a/lapack-netlib/SRC/zlatbs.f b/lapack-netlib/SRC/zlatbs.f index b7b2cb8aec..bdffa1ea98 100644 --- a/lapack-netlib/SRC/zlatbs.f +++ b/lapack-netlib/SRC/zlatbs.f @@ -278,7 +278,7 @@ SUBROUTINE ZLATBS( UPLO, TRANS, DIAG, NORMIN, N, KD, AB, LDAB, X, $ ZDOTU, ZLADIV * .. * .. External Subroutines .. - EXTERNAL DSCAL, XERBLA, ZAXPY, ZDSCAL, ZTBSV, DLABAD + EXTERNAL DSCAL, XERBLA, ZAXPY, ZDSCAL, ZTBSV * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DCONJG, DIMAG, MAX, MIN @@ -324,17 +324,14 @@ SUBROUTINE ZLATBS( UPLO, TRANS, DIAG, NORMIN, N, KD, AB, LDAB, X, * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * * Determine machine dependent parameters to control overflow. * - SMLNUM = DLAMCH( 'Safe minimum' ) - BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) - SMLNUM = SMLNUM / DLAMCH( 'Precision' ) + SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * diff --git a/lapack-netlib/SRC/zlatdf.c b/lapack-netlib/SRC/zlatdf.c index 7debf0ecce..125fc1752d 100644 --- a/lapack-netlib/SRC/zlatdf.c +++ b/lapack-netlib/SRC/zlatdf.c @@ -685,7 +685,7 @@ f"> */ /* > 1995. 
*/ /* ===================================================================== */ -/* Subroutine */ int zlatdf_(integer *ijob, integer *n, doublecomplex *z__, +/* Subroutine */ void zlatdf_(integer *ijob, integer *n, doublecomplex *z__, integer *ldz, doublecomplex *rhs, doublereal *rdsum, doublereal * rdscal, integer *ipiv, integer *jpiv) { @@ -698,26 +698,27 @@ f"> */ doublecomplex temp, work[8]; integer i__, j, k; doublereal scale; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); doublecomplex pmone; extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal rtemp, sminu, rwork[2]; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal splus; - extern /* Subroutine */ int zaxpy_(integer *, doublecomplex *, + extern /* Subroutine */ void zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), zgesc2_( integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, doublereal *); doublecomplex bm, bp, xm[2], xp[2]; - extern /* Subroutine */ int zgecon_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zgecon_(char *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, doublereal *, integer *); extern doublereal dzasum_(integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, - doublereal *, doublereal *), zlaswp_(integer *, doublecomplex *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, + doublereal *, doublereal *); + extern int zlaswp_(integer *, doublecomplex *, integer *, integer *, integer *, integer *, integer *); @@ -878,7 +879,7 @@ f"> */ /* Compute the 
sum of squares */ zlassq_(n, &rhs[1], &c__1, rdscal, rdsum); - return 0; + return; } /* ENTRY IJOB = 2 */ @@ -910,7 +911,7 @@ f"> */ /* Compute the sum of squares */ zlassq_(n, &rhs[1], &c__1, rdscal, rdsum); - return 0; + return; /* End of ZLATDF */ diff --git a/lapack-netlib/SRC/zlatps.c b/lapack-netlib/SRC/zlatps.c index 0319be7db5..f73d72e719 100644 --- a/lapack-netlib/SRC/zlatps.c +++ b/lapack-netlib/SRC/zlatps.c @@ -744,7 +744,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlatps_(char *uplo, char *trans, char *diag, char * +/* Subroutine */ void zlatps_(char *uplo, char *trans, char *diag, char * normin, integer *n, doublecomplex *ap, doublecomplex *x, doublereal * scale, doublereal *cnorm, integer *info) { @@ -761,7 +761,7 @@ f"> */ doublecomplex tjjs; doublereal xmax, grow; integer i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); doublereal tscal; @@ -773,7 +773,7 @@ f"> */ logical upper; extern /* Double Complex */ VOID zdotu_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zaxpy_(integer *, doublecomplex *, + extern /* Subroutine */ void zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), ztpsv_( char *, char *, char *, integer *, doublecomplex *, doublecomplex *, integer *), dlabad_(doublereal *, @@ -782,7 +782,8 @@ f"> */ integer ip; doublereal xj; extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); doublereal bignum; extern integer izamax_(integer *, doublecomplex *, integer 
*); @@ -834,13 +835,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZLATPS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine machine dependent parameters to control overflow. */ @@ -1715,7 +1716,7 @@ f"> */ dscal_(n, &d__1, &cnorm[1], &c__1); } - return 0; + return; /* End of ZLATPS */ diff --git a/lapack-netlib/SRC/zlatrd.c b/lapack-netlib/SRC/zlatrd.c index 581211ac02..81a3479f7f 100644 --- a/lapack-netlib/SRC/zlatrd.c +++ b/lapack-netlib/SRC/zlatrd.c @@ -715,7 +715,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlatrd_(char *uplo, integer *n, integer *nb, +/* Subroutine */ void zlatrd_(char *uplo, integer *n, integer *nb, doublecomplex *a, integer *lda, doublereal *e, doublecomplex *tau, doublecomplex *w, integer *ldw) { @@ -728,11 +728,11 @@ f"> */ integer i__; doublecomplex alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zhemv_(char *, integer *, doublecomplex *, doublecomplex *, @@ -741,7 +741,7 @@ f"> */ doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); integer iw; - extern /* Subroutine */ int zlarfg_(integer *, doublecomplex *, + extern /* Subroutine */ void zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *), zlacgv_(integer *, doublecomplex *, integer *); @@ -769,7 +769,7 @@ f"> */ /* Function Body 
*/ if (*n <= 0) { - return 0; + return; } if (lsame_(uplo, "U")) { @@ -976,7 +976,7 @@ f"> */ } } - return 0; + return; /* End of ZLATRD */ diff --git a/lapack-netlib/SRC/zlatrs.c b/lapack-netlib/SRC/zlatrs.c index eec839a778..71122e2cc1 100644 --- a/lapack-netlib/SRC/zlatrs.c +++ b/lapack-netlib/SRC/zlatrs.c @@ -753,7 +753,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlatrs_(char *uplo, char *trans, char *diag, char * +/* Subroutine */ void zlatrs_(char *uplo, char *trans, char *diag, char * normin, integer *n, doublecomplex *a, integer *lda, doublecomplex *x, doublereal *scale, doublereal *cnorm, integer *info) { @@ -770,7 +770,7 @@ f"> */ doublecomplex tjjs; doublereal xmax, grow; integer i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); doublereal tscal; @@ -782,7 +782,7 @@ f"> */ logical upper; extern /* Double Complex */ VOID zdotu_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zaxpy_(integer *, doublecomplex *, + extern /* Subroutine */ void zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), ztrsv_( char *, char *, char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), dlabad_( @@ -790,7 +790,8 @@ f"> */ extern doublereal dlamch_(char *); doublereal xj; extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); doublereal bignum; extern integer izamax_(integer *, doublecomplex *, integer *); @@ -846,13 +847,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); 
xerbla_("ZLATRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine machine dependent parameters to control overflow. */ @@ -1705,7 +1706,7 @@ f"> */ dscal_(n, &d__1, &cnorm[1], &c__1); } - return 0; + return; /* End of ZLATRS */ diff --git a/lapack-netlib/SRC/zlatrs.f b/lapack-netlib/SRC/zlatrs.f index 91bb688ece..2276ace875 100644 --- a/lapack-netlib/SRC/zlatrs.f +++ b/lapack-netlib/SRC/zlatrs.f @@ -274,7 +274,7 @@ SUBROUTINE ZLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X, SCALE, $ ZDOTU, ZLADIV * .. * .. External Subroutines .. - EXTERNAL DSCAL, XERBLA, ZAXPY, ZDSCAL, ZTRSV, DLABAD + EXTERNAL DSCAL, XERBLA, ZAXPY, ZDSCAL, ZTRSV * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DCONJG, DIMAG, MAX, MIN @@ -318,17 +318,14 @@ SUBROUTINE ZLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X, SCALE, * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * * Determine machine dependent parameters to control overflow. * - SMLNUM = DLAMCH( 'Safe minimum' ) - BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) - SMLNUM = SMLNUM / DLAMCH( 'Precision' ) + SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * @@ -360,8 +357,74 @@ SUBROUTINE ZLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X, SCALE, IF( TMAX.LE.BIGNUM*HALF ) THEN TSCAL = ONE ELSE - TSCAL = HALF / ( SMLNUM*TMAX ) - CALL DSCAL( N, TSCAL, CNORM, 1 ) +* +* Avoid NaN generation if entries in CNORM exceed the +* overflow threshold +* + IF ( TMAX.LE.DLAMCH('Overflow') ) THEN +* Case 1: All entries in CNORM are valid floating-point numbers + TSCAL = HALF / ( SMLNUM*TMAX ) + CALL DSCAL( N, TSCAL, CNORM, 1 ) + ELSE +* Case 2: At least one column norm of A cannot be +* represented as a floating-point number. Find the +* maximum offdiagonal absolute value +* max( |Re(A(I,J))|, |Im(A(I,J)| ). If this entry is +* not +/- Infinity, use this value as TSCAL. 
+ TMAX = ZERO + IF( UPPER ) THEN +* +* A is upper triangular. +* + DO J = 2, N + DO I = 1, J - 1 + TMAX = MAX( TMAX, ABS( DBLE( A( I, J ) ) ), + $ ABS( DIMAG(A ( I, J ) ) ) ) + END DO + END DO + ELSE +* +* A is lower triangular. +* + DO J = 1, N - 1 + DO I = J + 1, N + TMAX = MAX( TMAX, ABS( DBLE( A( I, J ) ) ), + $ ABS( DIMAG(A ( I, J ) ) ) ) + END DO + END DO + END IF +* + IF( TMAX.LE.DLAMCH('Overflow') ) THEN + TSCAL = ONE / ( SMLNUM*TMAX ) + DO J = 1, N + IF( CNORM( J ).LE.DLAMCH('Overflow') ) THEN + CNORM( J ) = CNORM( J )*TSCAL + ELSE +* Recompute the 1-norm of each column without +* introducing Infinity in the summation. + TSCAL = TWO * TSCAL + CNORM( J ) = ZERO + IF( UPPER ) THEN + DO I = 1, J - 1 + CNORM( J ) = CNORM( J ) + + $ TSCAL * CABS2( A( I, J ) ) + END DO + ELSE + DO I = J + 1, N + CNORM( J ) = CNORM( J ) + + $ TSCAL * CABS2( A( I, J ) ) + END DO + END IF + TSCAL = TSCAL * HALF + END IF + END DO + ELSE +* At least one entry of A is not a valid floating-point +* entry. Rely on TRSV to propagate Inf and NaN. 
+ CALL ZTRSV( UPLO, TRANS, DIAG, N, A, LDA, X, 1 ) + RETURN + END IF + END IF END IF * * Compute a bound on the computed solution vector to see if the diff --git a/lapack-netlib/SRC/zlatrs3.c b/lapack-netlib/SRC/zlatrs3.c new file mode 100644 index 0000000000..0afc8d26cb --- /dev/null +++ b/lapack-netlib/SRC/zlatrs3.c @@ -0,0 +1,1283 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef 
Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} + +/* procedure parameter types for -A and -C++ */ + +#define 
F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 
|| x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b ZLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. + */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE ZLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, */ +/* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) */ + +/* CHARACTER DIAG, NORMIN, TRANS, UPLO */ +/* INTEGER INFO, LDA, LWORK, LDX, N, NRHS */ +/* DOUBLE PRECISION CNORM( * ), SCALE( * ), WORK( * ) */ +/* COMPLEX*16 A( LDA, * ), X( LDX, * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > ZLATRS3 solves one of the triangular systems */ +/* > */ +/* > A * X = B * diag(scale), A**T * X = B * diag(scale), or */ +/* > A**H * X = B * diag(scale) */ +/* > */ +/* > with scaling to prevent overflow. Here A is an upper or lower */ +/* > triangular matrix, A**T denotes the transpose of A, A**H denotes the */ +/* > conjugate transpose of A. X and B are n-by-nrhs matrices and scale */ +/* > is an nrhs-element vector of scaling factors. 
A scaling factor scale(j) */ +/* > is usually less than or equal to 1, chosen such that X(:,j) is less */ +/* > than the overflow threshold. If the matrix A is singular (A(j,j) = 0 */ +/* > for some j), then a non-trivial solution to A*X = 0 is returned. If */ +/* > the system is so badly scaled that the solution cannot be represented */ +/* > as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. */ +/* > */ +/* > This is a BLAS-3 version of LATRS for solving several right */ +/* > hand sides simultaneously. */ +/* > */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] UPLO */ +/* > \verbatim */ +/* > UPLO is CHARACTER*1 */ +/* > Specifies whether the matrix A is upper or lower triangular. */ +/* > = 'U': Upper triangular */ +/* > = 'L': Lower triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > Specifies the operation applied to A. */ +/* > = 'N': Solve A * x = s*b (No transpose) */ +/* > = 'T': Solve A**T* x = s*b (Transpose) */ +/* > = 'C': Solve A**T* x = s*b (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] DIAG */ +/* > \verbatim */ +/* > DIAG is CHARACTER*1 */ +/* > Specifies whether or not the matrix A is unit triangular. */ +/* > = 'N': Non-unit triangular */ +/* > = 'U': Unit triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NORMIN */ +/* > \verbatim */ +/* > NORMIN is CHARACTER*1 */ +/* > Specifies whether CNORM has been set or not. */ +/* > = 'Y': CNORM contains the column norms on entry */ +/* > = 'N': CNORM is not set on entry. On exit, the norms will */ +/* > be computed and stored in CNORM. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of X. NRHS >= 0. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX*16 array, dimension (LDA,N) */ +/* > The triangular matrix A. If UPLO = 'U', the leading n by n */ +/* > upper triangular part of the array A contains the upper */ +/* > triangular matrix, and the strictly lower triangular part of */ +/* > A is not referenced. If UPLO = 'L', the leading n by n lower */ +/* > triangular part of the array A contains the lower triangular */ +/* > matrix, and the strictly upper triangular part of A is not */ +/* > referenced. If DIAG = 'U', the diagonal elements of A are */ +/* > also not referenced and are assumed to be 1. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is COMPLEX*16 array, dimension (LDX,NRHS) */ +/* > On entry, the right hand side B of the triangular system. */ +/* > On exit, X is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDX */ +/* > \verbatim */ +/* > LDX is INTEGER */ +/* > The leading dimension of the array X. LDX >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is DOUBLE PRECISION array, dimension (NRHS) */ +/* > The scaling factor s(k) is for the triangular system */ +/* > A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). */ +/* > If SCALE = 0, the matrix A is singular or badly scaled. */ +/* > If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) */ +/* > that is an exact or approximate solution to A*x(:,k) = 0 */ +/* > is returned. If the system so badly scaled that solution */ +/* > cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 */ +/* > is returned. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] CNORM */ +/* > \verbatim */ +/* > CNORM is DOUBLE PRECISION array, dimension (N) */ +/* > */ +/* > If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */ +/* > contains the norm of the off-diagonal part of the j-th column */ +/* > of A. If TRANS = 'N', CNORM(j) must be greater than or equal */ +/* > to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */ +/* > must be greater than or equal to the 1-norm. */ +/* > */ +/* > If NORMIN = 'N', CNORM is an output argument and CNORM(j) */ +/* > returns the 1-norm of the offdiagonal part of the j-th column */ +/* > of A. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is DOUBLE PRECISION array, dimension (LWORK). */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal size of */ +/* > WORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > LWORK is INTEGER */ +/* > LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where */ +/* > NBA = (N + NB - 1)/NB and NB is the optimal block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -k, the k-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleOTHERauxiliary */ +/* > \par Further Details: */ +/* ===================== */ +/* \verbatim */ +/* The algorithm follows the structure of a block triangular solve. 
*/ +/* The diagonal block is solved with a call to the robust the triangular */ +/* solver LATRS for every right-hand side RHS = 1, ..., NRHS */ +/* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), */ +/* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. */ +/* The linear block updates operate on block columns of X, */ +/* B( I, K ) - op(A( I, J )) * X( J, K ) */ +/* and use GEMM. To avoid overflow in the linear block update, the worst case */ +/* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed */ +/* such that */ +/* || s * B( I, RHS )||_oo */ +/* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold */ + +/* Once all columns of a block column have been rescaled (BLAS-1), the linear */ +/* update is executed with GEMM without overflow. */ + +/* To limit rescaling, local scale factors track the scaling of column segments. */ +/* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA */ +/* per right-hand side column RHS = 1, ..., NRHS. The global scale factor */ +/* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) */ +/* I = 1, ..., NBA. */ +/* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) */ +/* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The */ +/* linear update of potentially inconsistently scaled vector segments */ +/* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) */ +/* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, */ +/* if necessary, rescales the blocks prior to calling GEMM. */ + +/* \endverbatim */ +/* ===================================================================== */ +/* References: */ +/* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). */ +/* Parallel robust solution of triangular linear systems. Concurrency */ +/* and Computation: Practice and Experience, 31(19), e5064. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
*/ + +/* ===================================================================== */ +/* Subroutine */ void zlatrs3_(char *uplo, char *trans, char *diag, char * + normin, integer *n, integer *nrhs, doublecomplex *a, integer *lda, + doublecomplex *x, integer *ldx, doublereal *scale, doublereal *cnorm, + doublereal *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, + i__6, i__7, i__8; + doublereal d__1, d__2; + doublecomplex z__1; + + /* Local variables */ + integer iinc, jinc; + doublereal scal, anrm, bnrm; + integer awrk; + doublereal tmax, xnrm[32]; + integer i__, j, k; + doublereal w[64]; + extern logical lsame_(char *, char *); + doublereal rscal; + integer lanrm, ilast, jlast; + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *); + integer i1; + logical upper; + integer i2, j1, j2, k1, k2, nb, ii, kk; + extern doublereal dlamch_(char *); + integer lscale; + doublereal scaloc, scamin; + extern doublereal dlarmm_(doublereal *, doublereal *, doublereal *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen ); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, + integer *, doublereal *); + doublereal bignum; + extern /* Subroutine */ void zdscal_(integer *, doublereal *, + doublecomplex *, integer *); + integer ifirst; + logical notran; + integer jfirst; + doublereal smlnum; + logical nounit; + extern /* Subroutine */ void zlatrs_(char *, char *, char *, char *, + integer *, doublecomplex *, integer *, doublecomplex *, + doublereal *, doublereal *, integer *); + logical lquery; + integer nba, lds, nbx, rhs; + + + +/* 
===================================================================== */ + + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + --scale; + --cnorm; + --work; + + /* Function Body */ + *info = 0; + upper = lsame_(uplo, "U"); + notran = lsame_(trans, "N"); + nounit = lsame_(diag, "N"); + lquery = *lwork == -1; + +/* Partition A and X into blocks. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "ZLATRS", "", n, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + nb = f2cmin(64,nb); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*nrhs + 31) / 32; + nbx = f2cmax(i__1,i__2); + +/* Compute the workspace */ + +/* The workspace comprises two parts. */ +/* The first part stores the local scale factors. Each simultaneously */ +/* computed right-hand side requires one local scale factor per block */ +/* row. WORK( I + KK * LDS ) is the scale factor of the vector */ +/* segment associated with the I-th block row and the KK-th vector */ +/* in the block column. */ +/* Computing MAX */ + i__1 = nba, i__2 = f2cmin(*nrhs,32); + lscale = nba * f2cmax(i__1,i__2); + lds = nba; +/* The second part stores upper bounds of the triangular A. There are */ +/* a total of NBA x NBA blocks, of which only the upper triangular */ +/* part or the lower triangular part is referenced. The upper bound of */ +/* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). */ + lanrm = nba * nba; + awrk = lscale; + work[1] = (doublereal) (lscale + lanrm); + +/* Test the input parameters. */ + + if (! upper && ! lsame_(uplo, "L")) { + *info = -1; + } else if (! notran && ! lsame_(trans, "T") && ! + lsame_(trans, "C")) { + *info = -2; + } else if (! nounit && ! lsame_(diag, "U")) { + *info = -3; + } else if (! lsame_(normin, "Y") && ! 
lsame_(normin, + "N")) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*nrhs < 0) { + *info = -6; + } else if (*lda < f2cmax(1,*n)) { + *info = -8; + } else if (*ldx < f2cmax(1,*n)) { + *info = -10; + } else if (! lquery && (doublereal) (*lwork) < work[1]) { + *info = -14; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZLATRS3", &i__1, 7); + return; + } else if (lquery) { + return; + } + +/* Initialize scaling factors */ + + i__1 = *nrhs; + for (kk = 1; kk <= i__1; ++kk) { + scale[kk] = 1.; + } + +/* Quick return if possible */ + + if (f2cmin(*n,*nrhs) == 0) { + return; + } + +/* Determine machine dependent constant to control overflow. */ + + bignum = dlamch_("Overflow"); + smlnum = dlamch_("Safe Minimum"); + +/* Use unblocked code for small problems */ + + if (*nrhs < 2) { + zlatrs_(uplo, trans, diag, normin, n, &a[a_offset], lda, &x[x_dim1 + + 1], &scale[1], &cnorm[1], info); + i__1 = *nrhs; + for (k = 2; k <= i__1; ++k) { + zlatrs_(uplo, trans, diag, "Y", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return; + } + +/* Compute norms of blocks of A excluding diagonal blocks and find */ +/* the block with the largest norm TMAX. */ + + tmax = 0.; + i__1 = nba; + for (j = 1; j <= i__1; ++j) { + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + if (upper) { + ifirst = 1; + ilast = j - 1; + } else { + ifirst = j + 1; + ilast = nba; + } + i__2 = ilast; + for (i__ = ifirst; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*n) + 1; + +/* Compute upper bound of A( I1:I2-1, J1:J2-1 ). 
*/ + + if (notran) { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = zlange_("I", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + i__ + (j - 1) * nba] = anrm; + } else { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = zlange_("1", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + j + (i__ - 1) * nba] = anrm; + } + tmax = f2cmax(tmax,anrm); + } + } + + if (! (tmax <= dlamch_("Overflow"))) { + +/* Some matrix entries have huge absolute value. At least one upper */ +/* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point */ +/* number, either due to overflow in LANGE or due to Inf in A. */ +/* Fall back to LATRS. Set normin = 'N' for every right-hand side to */ +/* force computation of TSCAL in LATRS to avoid the likely overflow */ +/* in the computation of the column norms CNORM. */ + + i__1 = *nrhs; + for (k = 1; k <= i__1; ++k) { + zlatrs_(uplo, trans, diag, "N", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return; + } + +/* Every right-hand side requires workspace to store NBA local scale */ +/* factors. To save workspace, X is computed successively in block columns */ +/* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient */ +/* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. */ + i__1 = nbx; + for (k = 1; k <= i__1; ++k) { +/* Loop over block columns (index = K) of X and, for column-wise scalings, */ +/* over individual columns (index = KK). 
*/ +/* K1: column index of the first column in X( J, K ) */ +/* K2: column index of the first column in X( J, K+1 ) */ +/* so the K2 - K1 is the column count of the block X( J, K ) */ + k1 = (k - 1 << 5) + 1; +/* Computing MIN */ + i__2 = k << 5; + k2 = f2cmin(i__2,*nrhs) + 1; + +/* Initialize local scaling factors of current block column X( J, K ) */ + + i__2 = k2 - k1; + for (kk = 1; kk <= i__2; ++kk) { + i__3 = nba; + for (i__ = 1; i__ <= i__3; ++i__) { + work[i__ + kk * lds] = 1.; + } + } + + if (notran) { + +/* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = nba; + jlast = 1; + jinc = -1; + } else { + jfirst = 1; + jlast = nba; + jinc = 1; + } + } else { + +/* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ +/* where op(A) = A**T or op(A) = A**H */ + + if (upper) { + jfirst = 1; + jlast = nba; + jinc = 1; + } else { + jfirst = nba; + jlast = 1; + jinc = -1; + } + } + i__2 = jlast; + i__3 = jinc; + for (j = jfirst; i__3 < 0 ? j >= i__2 : j <= i__2; j += i__3) { +/* J1: row index of the first row in A( J, J ) */ +/* J2: row index of the first row in A( J+1, J+1 ) */ +/* so that J2 - J1 is the row count of the block A( J, J ) */ + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) */ + + i__4 = k2 - k1; + for (kk = 1; kk <= i__4; ++kk) { + rhs = k1 + kk - 1; + if (kk == 1) { + i__5 = j2 - j1; + zlatrs_(uplo, trans, diag, "N", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } else { + i__5 = j2 - j1; + zlatrs_(uplo, trans, diag, "Y", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } +/* Find largest absolute value entry in the vector segment */ +/* X( J1:J2-1, RHS ) as an upper bound for the worst case */ +/* growth in the linear updates. 
*/ + i__5 = j2 - j1; + xnrm[kk - 1] = zlange_("I", &i__5, &c__1, &x[j1 + rhs * + x_dim1], ldx, w); + + if (scaloc == 0.) { +/* LATRS found that A is singular through A(j,j) = 0. */ +/* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 */ +/* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is */ +/* set by LATRS. */ + scale[rhs] = 0.; + i__5 = j1 - 1; + for (ii = 1; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0., x[i__6].i = 0.; + } + i__5 = *n; + for (ii = j2; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0., x[i__6].i = 0.; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.; + } + scaloc = 1.; + } else if (scaloc * work[j + kk * lds] == 0.) { +/* LATRS computed a valid scale factor, but combined with */ +/* the current scaling the solution does not have a */ +/* scale factor > 0. */ + +/* Set WORK( J+KK*LDS ) to smallest valid scale */ +/* factor and increase SCALOC accordingly. */ + scal = work[j + kk * lds] / smlnum; + scaloc *= scal; + work[j + kk * lds] = smlnum; +/* If LATRS overestimated the growth, x may be */ +/* rescaled to preserve a valid combined scale */ +/* factor WORK( J, KK ) > 0. */ + rscal = 1. / scaloc; + if (xnrm[kk - 1] * rscal <= bignum) { + xnrm[kk - 1] *= rscal; + i__5 = j2 - j1; + zdscal_(&i__5, &rscal, &x[j1 + rhs * x_dim1], &c__1); + scaloc = 1.; + } else { +/* The system op(A) * x = b is badly scaled and its */ +/* solution cannot be represented as (1/scale) * x. */ +/* Set x to zero. This approach deviates from LATRS */ +/* where a completely meaningless non-zero vector */ +/* is returned that is not a solution to op(A) * x = b. */ + scale[rhs] = 0.; + i__5 = *n; + for (ii = 1; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0., x[i__6].i = 0.; + } +/* Discard the local scale factors. 
*/ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.; + } + scaloc = 1.; + } + } + scaloc *= work[j + kk * lds]; + work[j + kk * lds] = scaloc; + } + +/* Linear block updates */ + + if (notran) { + if (upper) { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } else { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } + } else { + if (upper) { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } else { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } + } + + i__4 = ilast; + i__5 = iinc; + for (i__ = ifirst; i__5 < 0 ? i__ >= i__4 : i__ <= i__4; i__ += + i__5) { +/* I1: row index of the first column in X( I, K ) */ +/* I2: row index of the first column in X( I+1, K ) */ +/* so the I2 - I1 is the row count of the block X( I, K ) */ + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__6 = i__ * nb; + i2 = f2cmin(i__6,*n) + 1; + +/* Prepare the linear update to be executed with GEMM. */ +/* For each column, compute a consistent scaling, a */ +/* scaling factor to survive the linear update, and */ +/* rescale the column segments, if necesssary. Then */ +/* the linear update is safely executed. */ + + i__6 = k2 - k1; + for (kk = 1; kk <= i__6; ++kk) { + rhs = k1 + kk - 1; +/* Compute consistent scaling */ +/* Computing MIN */ + d__1 = work[i__ + kk * lds], d__2 = work[j + kk * lds]; + scamin = f2cmin(d__1,d__2); + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__7 = i2 - i1; + bnrm = zlange_("I", &i__7, &c__1, &x[i1 + rhs * x_dim1], + ldx, w); + bnrm *= scamin / work[i__ + kk * lds]; + xnrm[kk - 1] *= scamin / work[j + kk * lds]; + anrm = work[awrk + i__ + (j - 1) * nba]; + scaloc = dlarmm_(&anrm, &xnrm[kk - 1], &bnrm); + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to X( I, KK ) and X( J, KK ). */ + + scal = scamin / work[i__ + kk * lds] * scaloc; + if (scal != 1.) 
{ + i__7 = i2 - i1; + zdscal_(&i__7, &scal, &x[i1 + rhs * x_dim1], &c__1); + work[i__ + kk * lds] = scamin * scaloc; + } + + scal = scamin / work[j + kk * lds] * scaloc; + if (scal != 1.) { + i__7 = j2 - j1; + zdscal_(&i__7, &scal, &x[j1 + rhs * x_dim1], &c__1); + work[j + kk * lds] = scamin * scaloc; + } + } + + if (notran) { + +/* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + z__1.r = -1., z__1.i = 0.; + zgemm_("N", "N", &i__6, &i__7, &i__8, &z__1, &a[i1 + j1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b1, & + x[i1 + k1 * x_dim1], ldx); + } else if (lsame_(trans, "T")) { + +/* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + z__1.r = -1., z__1.i = 0.; + zgemm_("T", "N", &i__6, &i__7, &i__8, &z__1, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b1, & + x[i1 + k1 * x_dim1], ldx); + } else { + +/* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + z__1.r = -1., z__1.i = 0.; + zgemm_("C", "N", &i__6, &i__7, &i__8, &z__1, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b1, & + x[i1 + k1 * x_dim1], ldx); + } + } + } + +/* Reduce local scaling factors */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { +/* Computing MIN */ + d__1 = scale[rhs], d__2 = work[i__ + kk * lds]; + scale[rhs] = f2cmin(d__1,d__2); + } + } + +/* Realize consistent scaling */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + if (scale[rhs] != 1. && scale[rhs] != 0.) { + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__5 = i__ * nb; + i2 = f2cmin(i__5,*n) + 1; + scal = scale[rhs] / work[i__ + kk * lds]; + if (scal != 1.) 
{ + i__5 = i2 - i1; + zdscal_(&i__5, &scal, &x[i1 + rhs * x_dim1], &c__1); + } + } + } + } + } + return; + +/* End of ZLATRS3 */ + +} /* zlatrs3_ */ + diff --git a/lapack-netlib/SRC/zlatrs3.f b/lapack-netlib/SRC/zlatrs3.f new file mode 100644 index 0000000000..fc1be0517a --- /dev/null +++ b/lapack-netlib/SRC/zlatrs3.f @@ -0,0 +1,667 @@ +*> \brief \b ZLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. +* +* Definition: +* =========== +* +* SUBROUTINE ZLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* DOUBLE PRECISION CNORM( * ), SCALE( * ), WORK( * ) +* COMPLEX*16 A( LDA, * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale), A**T * X = B * diag(scale), or +*> A**H * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A, A**H denotes the +*> conjugate transpose of A. X and B are n-by-nrhs matrices and scale +*> is an nrhs-element vector of scaling factors. A scaling factor scale(j) +*> is usually less than or equal to 1, chosen such that X(:,j) is less +*> than the overflow threshold. If the matrix A is singular (A(j,j) = 0 +*> for some j), then a non-trivial solution to A*X = 0 is returned. If +*> the system is so badly scaled that the solution cannot be represented +*> as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. 
+*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is COMPLEX*16 array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. 
+*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is DOUBLE PRECISION array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. 
+*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. 
+* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE ZLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( LDX, * ) + DOUBLE PRECISION CNORM( * ), SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + COMPLEX*16 CZERO, CONE + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) + PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + DOUBLE PRECISION W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, ZLANGE, DLARMM + EXTERNAL ILAENV, LSAME, DLAMCH, ZLANGE, DLARMM +* .. +* .. External Subroutines .. + EXTERNAL ZLATRS, ZDSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. 
+* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks. +* + NB = MAX( NBMIN, ILAENV( 1, 'ZLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I + KK * LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters. +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. 
LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. +* + BIGNUM = DLAMCH( 'Overflow' ) + SMLNUM = DLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL ZLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL ZLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. +* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = ZLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = ZLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1) * NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.DLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. 
+* + DO K = 1, NRHS + CALL ZLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). +* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2 - K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* where op(A) = A**T or op(A) = A**H +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF + + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL ZLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL ZLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value 
entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = ZLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is +* set by LATRS. + SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = CZERO + END DO + DO II = J2, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL ZDSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. 
+ DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = ZLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*( SCAMIN / WORK( J+KK*LDS) ) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = DLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to X( I, KK ) and X( J, KK ). 
+* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL ZDSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL ZDSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL ZGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE IF( LSAME( TRANS, 'T' ) ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) +* + CALL ZGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) +* + CALL ZGEMM( 'C', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO + +* +* Reduce local scaling factors +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. 
ZERO ) THEN + DO I = 1, NBA + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL ZDSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of ZLATRS3 +* + END diff --git a/lapack-netlib/SRC/zlatrz.c b/lapack-netlib/SRC/zlatrz.c index 870dae0ed5..69be8eaa8f 100644 --- a/lapack-netlib/SRC/zlatrz.c +++ b/lapack-netlib/SRC/zlatrz.c @@ -649,7 +649,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlatrz_(integer *m, integer *n, integer *l, +/* Subroutine */ void zlatrz_(integer *m, integer *n, integer *l, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex * work) { @@ -660,7 +660,7 @@ f"> */ /* Local variables */ integer i__; doublecomplex alpha; - extern /* Subroutine */ int zlarz_(char *, integer *, integer *, integer * + extern /* Subroutine */ void zlarz_(char *, integer *, integer *, integer * , doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *), zlarfg_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *), @@ -687,7 +687,7 @@ f"> */ /* Function Body */ if (*m == 0) { - return 0; + return; } else if (*m == *n) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { @@ -695,7 +695,7 @@ f"> */ tau[i__2].r = 0., tau[i__2].i = 0.; /* L10: */ } - return 0; + return; } for (i__ = *m; i__ >= 1; --i__) { @@ -727,7 +727,7 @@ f"> */ /* L20: */ } - return 0; + return; /* End of ZLATRZ */ diff --git a/lapack-netlib/SRC/zlatsqr.c b/lapack-netlib/SRC/zlatsqr.c index 4c95d337df..7e98a62037 100644 --- a/lapack-netlib/SRC/zlatsqr.c +++ b/lapack-netlib/SRC/zlatsqr.c @@ -673,7 +673,7 @@ static integer c__0 = 0; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlatsqr_(integer *m, integer *n, integer *mb, integer * +/* Subroutine */ void 
zlatsqr_(integer *m, integer *n, integer *mb, integer * nb, doublecomplex *a, integer *lda, doublecomplex *t, integer *ldt, doublecomplex *work, integer *lwork, integer *info) { @@ -682,11 +682,12 @@ static integer c__0 = 0; /* Local variables */ integer i__, ii, kk; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zgeqrt_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zgeqrt_( integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); logical lquery; - extern /* Subroutine */ int ztpqrt_(integer *, integer *, integer *, + extern /* Subroutine */ void ztpqrt_(integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer ctr; @@ -739,15 +740,15 @@ static integer c__0 = 0; if (*info != 0) { i__1 = -(*info); xerbla_("ZLATSQR", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* The QR Decomposition */ @@ -755,7 +756,7 @@ static integer c__0 = 0; if (*mb <= *n || *mb >= *m) { zgeqrt_(m, n, nb, &a[a_offset], lda, &t[t_offset], ldt, &work[1], info); - return 0; + return; } kk = (*m - *n) % (*mb - *n); ii = *m - kk + 1; @@ -787,7 +788,7 @@ static integer c__0 = 0; i__2 = *n * *nb; work[1].r = (doublereal) i__2, work[1].i = 0.; - return 0; + return; /* End of ZLATSQR */ diff --git a/lapack-netlib/SRC/zlaunhr_col_getrfnp.c b/lapack-netlib/SRC/zlaunhr_col_getrfnp.c index e052235961..ed22ca82ec 100644 --- a/lapack-netlib/SRC/zlaunhr_col_getrfnp.c +++ b/lapack-netlib/SRC/zlaunhr_col_getrfnp.c @@ -661,7 +661,7 @@ _col_getrfnp.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zlaunhr_col_getrfnp_(integer *m, integer *n, +/* Subroutine */ void zlaunhr_col_getrfnp_(integer *m, 
integer *n, doublecomplex *a, integer *lda, doublecomplex *d__, integer *info) { /* System generated locals */ @@ -669,10 +669,10 @@ _col_getrfnp.f"> */ doublecomplex z__1; /* Local variables */ - extern /* Subroutine */ int zlaunhr_col_getrfnp2_(integer *, integer *, + extern /* Subroutine */ void zlaunhr_col_getrfnp2_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer j, iinfo; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), ztrsm_(char *, char *, char *, char *, @@ -713,13 +713,13 @@ _col_getrfnp.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZLAUNHR_COL_GETRFNP", &i__1, (ftnlen)19); - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* Determine the block size for this environment. 
*/ @@ -771,7 +771,7 @@ _col_getrfnp.f"> */ } } } - return 0; + return; /* End of ZLAUNHR_COL_GETRFNP */ diff --git a/lapack-netlib/SRC/zlaunhr_col_getrfnp2.c b/lapack-netlib/SRC/zlaunhr_col_getrfnp2.c index b2c9989c98..acf2670245 100644 --- a/lapack-netlib/SRC/zlaunhr_col_getrfnp2.c +++ b/lapack-netlib/SRC/zlaunhr_col_getrfnp2.c @@ -682,7 +682,7 @@ _col_getrfnp2.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zlaunhr_col_getrfnp2_(integer *m, integer *n, +/* Subroutine */ void zlaunhr_col_getrfnp2_(integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *d__, integer *info) { /* System generated locals */ @@ -693,13 +693,13 @@ _col_getrfnp2.f"> */ /* Local variables */ integer i__, iinfo; doublereal sfmin; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer n1, n2; - extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); extern doublereal dlamch_(char *); @@ -735,13 +735,13 @@ _col_getrfnp2.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZLAUNHR_COL_GETRFNP2", &i__1, (ftnlen)20); - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } if (*m == 1) { @@ -844,7 +844,7 @@ _col_getrfnp2.f"> */ lda, &d__[n1 + 1], &iinfo); } - return 0; + return; /* End of ZLAUNHR_COL_GETRFNP2 */ diff --git a/lapack-netlib/SRC/zlauu2.c b/lapack-netlib/SRC/zlauu2.c index 434b278b5e..9a8dbea6d6 100644 --- a/lapack-netlib/SRC/zlauu2.c +++ b/lapack-netlib/SRC/zlauu2.c @@ -618,7 +618,7 @@ 
f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlauu2_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zlauu2_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *info) { /* System generated locals */ @@ -631,11 +631,12 @@ f"> */ extern logical lsame_(char *, char *); extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *), zlacgv_( integer *, doublecomplex *, integer *); doublereal aii; @@ -670,13 +671,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZLAUU2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -742,7 +743,7 @@ f"> */ } } - return 0; + return; /* End of ZLAUU2 */ diff --git a/lapack-netlib/SRC/zlauum.c b/lapack-netlib/SRC/zlauum.c index 71d003eb40..b8331beee1 100644 --- a/lapack-netlib/SRC/zlauum.c +++ b/lapack-netlib/SRC/zlauum.c @@ -619,7 +619,7 @@ f"> */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zlauum_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zlauum_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *info) { /* System generated locals */ @@ -628,19 +628,20 @@ f"> */ /* Local variables */ integer i__; extern logical lsame_(char *, char *); - extern /* 
Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zherk_(char *, char *, integer *, integer *, doublereal *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrmm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); integer ib, nb; - extern /* Subroutine */ int zlauu2_(char *, integer *, doublecomplex *, - integer *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void zlauu2_(char *, integer *, doublecomplex *, + integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); @@ -674,13 +675,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZLAUUM", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine the block size for this environment. 
*/ @@ -758,7 +759,7 @@ f"> */ } } - return 0; + return; /* End of ZLAUUM */ diff --git a/lapack-netlib/SRC/zpbcon.c b/lapack-netlib/SRC/zpbcon.c index 228ab1cbef..36d2dac743 100644 --- a/lapack-netlib/SRC/zpbcon.c +++ b/lapack-netlib/SRC/zpbcon.c @@ -645,7 +645,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zpbcon_(char *uplo, integer *n, integer *kd, +/* Subroutine */ void zpbcon_(char *uplo, integer *n, integer *kd, doublecomplex *ab, integer *ldab, doublereal *anorm, doublereal * rcond, doublecomplex *work, doublereal *rwork, integer *info) { @@ -659,7 +659,7 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, doublecomplex *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *); integer ix; @@ -667,7 +667,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; extern integer izamax_(integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zlatbs_(char *, char *, char *, char *, + extern /* Subroutine */ void zlatbs_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublereal *, doublereal *, integer *), zdrscl_(integer *, doublereal *, doublecomplex *, integer *); @@ -710,7 +710,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPBCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -718,9 +718,9 @@ f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm == 0.) 
{ - return 0; + return; } smlnum = dlamch_("Safe minimum"); @@ -781,7 +781,7 @@ f"> */ L20: - return 0; + return; /* End of ZPBCON */ diff --git a/lapack-netlib/SRC/zpbequ.c b/lapack-netlib/SRC/zpbequ.c index 386cf5543f..eecf4b2f82 100644 --- a/lapack-netlib/SRC/zpbequ.c +++ b/lapack-netlib/SRC/zpbequ.c @@ -639,7 +639,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zpbequ_(char *uplo, integer *n, integer *kd, +/* Subroutine */ void zpbequ_(char *uplo, integer *n, integer *kd, doublecomplex *ab, integer *ldab, doublereal *s, doublereal *scond, doublereal *amax, integer *info) { @@ -687,7 +687,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPBEQU", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -695,7 +695,7 @@ f"> */ if (*n == 0) { *scond = 1.; *amax = 0.; - return 0; + return; } if (upper) { @@ -734,7 +734,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { if (s[i__] <= 0.) 
{ *info = i__; - return 0; + return; } /* L20: */ } @@ -753,7 +753,7 @@ f"> */ *scond = sqrt(smin) / sqrt(*amax); } - return 0; + return; /* End of ZPBEQU */ diff --git a/lapack-netlib/SRC/zpbrfs.c b/lapack-netlib/SRC/zpbrfs.c index acd0e1f80e..7e1166d9d0 100644 --- a/lapack-netlib/SRC/zpbrfs.c +++ b/lapack-netlib/SRC/zpbrfs.c @@ -702,7 +702,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zpbrfs_(char *uplo, integer *n, integer *kd, integer * +/* Subroutine */ void zpbrfs_(char *uplo, integer *n, integer *kd, integer * nrhs, doublecomplex *ab, integer *ldab, doublecomplex *afb, integer * ldafb, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *ferr, doublereal *berr, doublecomplex *work, doublereal * @@ -721,12 +721,12 @@ f"> */ doublereal s; extern logical lsame_(char *, char *); integer isave[3]; - extern /* Subroutine */ int zhbmv_(char *, integer *, integer *, + extern /* Subroutine */ void zhbmv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer count; logical upper; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), zlacn2_( integer *, doublecomplex *, doublecomplex *, doublereal *, @@ -737,7 +737,7 @@ f"> */ doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal lstres; - extern /* Subroutine */ int zpbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void zpbtrs_(char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); doublereal eps; @@ -795,7 +795,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); 
xerbla_("ZPBRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -807,7 +807,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -1051,7 +1051,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of ZPBRFS */ diff --git a/lapack-netlib/SRC/zpbstf.c b/lapack-netlib/SRC/zpbstf.c index e4002b19f1..c46dcb8c35 100644 --- a/lapack-netlib/SRC/zpbstf.c +++ b/lapack-netlib/SRC/zpbstf.c @@ -667,7 +667,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zpbstf_(char *uplo, integer *n, integer *kd, +/* Subroutine */ void zpbstf_(char *uplo, integer *n, integer *kd, doublecomplex *ab, integer *ldab, integer *info) { /* System generated locals */ @@ -675,13 +675,14 @@ f"> */ doublereal d__1; /* Local variables */ - extern /* Subroutine */ int zher_(char *, integer *, doublereal *, + extern /* Subroutine */ void zher_(char *, integer *, doublereal *, doublecomplex *, integer *, doublecomplex *, integer *); integer j, m; extern logical lsame_(char *, char *); logical upper; integer km; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *), zlacgv_( integer *, doublecomplex *, integer *); doublereal ajj; @@ -719,13 +720,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPBSTF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Computing MAX */ @@ -871,11 +872,11 @@ f"> */ /* L40: */ } } - return 0; + return; L50: *info = j; - return 0; + return; /* End of ZPBSTF */ diff --git a/lapack-netlib/SRC/zpbsv.c b/lapack-netlib/SRC/zpbsv.c index 5dae2eda22..a9a19b7143 100644 --- a/lapack-netlib/SRC/zpbsv.c +++ b/lapack-netlib/SRC/zpbsv.c @@ -673,7 +673,7 @@ 
static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zpbsv_(char *uplo, integer *n, integer *kd, integer * +/* Subroutine */ void zpbsv_(char *uplo, integer *n, integer *kd, integer * nrhs, doublecomplex *ab, integer *ldab, doublecomplex *b, integer * ldb, integer *info) { @@ -682,7 +682,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zpbtrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zpbtrf_( char *, integer *, integer *, doublecomplex *, integer *, integer *), zpbtrs_(char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -725,7 +726,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZPBSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the Cholesky factorization A = U**H *U or A = L*L**H. 
*/ @@ -739,7 +740,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ info); } - return 0; + return; /* End of ZPBSV */ diff --git a/lapack-netlib/SRC/zpbsvx.c b/lapack-netlib/SRC/zpbsvx.c index 8d61aca138..83b9238833 100644 --- a/lapack-netlib/SRC/zpbsvx.c +++ b/lapack-netlib/SRC/zpbsvx.c @@ -853,7 +853,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zpbsvx_(char *fact, char *uplo, integer *n, integer *kd, +/* Subroutine */ void zpbsvx_(char *fact, char *uplo, integer *n, integer *kd, integer *nrhs, doublecomplex *ab, integer *ldab, doublecomplex *afb, integer *ldafb, char *equed, doublereal *s, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *rcond, doublereal * @@ -873,7 +873,7 @@ f"> */ doublereal scond, anorm; logical equil, rcequ, upper; integer j1, j2; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern doublereal dlamch_(char *); logical nofact; @@ -881,11 +881,11 @@ f"> */ extern doublereal zlanhb_(char *, char *, integer *, integer *, doublecomplex *, integer *, doublereal *); doublereal bignum; - extern /* Subroutine */ int zlaqhb_(char *, integer *, integer *, + extern /* Subroutine */ void zlaqhb_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublereal *, char *); integer infequ; - extern /* Subroutine */ int zpbcon_(char *, integer *, integer *, + extern /* Subroutine */ void zpbcon_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, doublereal *, integer *), zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex * @@ -898,7 +898,7 @@ f"> */ integer *), zpbtrf_(char *, integer *, integer *, doublecomplex *, integer *, integer *); doublereal smlnum; - 
extern /* Subroutine */ int zpbtrs_(char *, integer *, integer *, integer + extern /* Subroutine */ void zpbtrs_(char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -998,7 +998,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPBSVX", &i__1, (ftnlen)6); - return 0; + return; } if (equil) { @@ -1069,7 +1069,7 @@ f"> */ if (*info > 0) { *rcond = 0.; - return 0; + return; } } @@ -1125,7 +1125,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of ZPBSVX */ diff --git a/lapack-netlib/SRC/zpbtf2.c b/lapack-netlib/SRC/zpbtf2.c index 3b5c345457..f050b96021 100644 --- a/lapack-netlib/SRC/zpbtf2.c +++ b/lapack-netlib/SRC/zpbtf2.c @@ -657,7 +657,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zpbtf2_(char *uplo, integer *n, integer *kd, +/* Subroutine */ void zpbtf2_(char *uplo, integer *n, integer *kd, doublecomplex *ab, integer *ldab, integer *info) { /* System generated locals */ @@ -665,13 +665,14 @@ f"> */ doublereal d__1; /* Local variables */ - extern /* Subroutine */ int zher_(char *, integer *, doublereal *, + extern /* Subroutine */ void zher_(char *, integer *, doublereal *, doublecomplex *, integer *, doublecomplex *, integer *); integer j; extern logical lsame_(char *, char *); logical upper; integer kn; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *), zlacgv_( integer *, doublecomplex *, integer *); doublereal ajj; @@ -709,13 +710,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPBTF2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Computing MAX */ @@ -793,11 +794,11 @@ f"> */ /* L20: */ } } - return 0; + return; L30: *info = j; - return 0; + return; /* 
End of ZPBTF2 */ diff --git a/lapack-netlib/SRC/zpbtrf.c b/lapack-netlib/SRC/zpbtrf.c index 04fa2320cc..268d1c1017 100644 --- a/lapack-netlib/SRC/zpbtrf.c +++ b/lapack-netlib/SRC/zpbtrf.c @@ -660,7 +660,7 @@ f"> */ /* > Peter Mayes and Giuseppe Radicati, IBM ECSEC, Rome, March 23, 1989 */ /* ===================================================================== */ -/* Subroutine */ int zpbtrf_(char *uplo, integer *n, integer *kd, +/* Subroutine */ void zpbtrf_(char *uplo, integer *n, integer *kd, doublecomplex *ab, integer *ldab, integer *info) { /* System generated locals */ @@ -671,21 +671,22 @@ f"> */ doublecomplex work[1056] /* was [33][32] */; integer i__, j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zherk_(char *, char *, integer *, integer *, doublereal *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *); integer i2, i3; - extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), zpbtf2_(char *, integer *, integer *, doublecomplex *, integer *, integer *); integer ib, nb, ii, jj; extern /* Subroutine */ int zpotf2_(char *, integer *, doublecomplex *, - integer *, integer *), xerbla_(char *, integer *, ftnlen); + integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); @@ -720,13 +721,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPBTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* 
Determine the block size for this environment */ @@ -1035,10 +1036,10 @@ f"> */ } } } - return 0; + return; L150: - return 0; + return; /* End of ZPBTRF */ diff --git a/lapack-netlib/SRC/zpbtrs.c b/lapack-netlib/SRC/zpbtrs.c index 9bbda7bb58..3e8d2cf798 100644 --- a/lapack-netlib/SRC/zpbtrs.c +++ b/lapack-netlib/SRC/zpbtrs.c @@ -634,7 +634,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zpbtrs_(char *uplo, integer *n, integer *kd, integer * +/* Subroutine */ void zpbtrs_(char *uplo, integer *n, integer *kd, integer * nrhs, doublecomplex *ab, integer *ldab, doublecomplex *b, integer * ldb, integer *info) { @@ -645,8 +645,8 @@ f"> */ integer j; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int ztbsv_(char *, char *, char *, integer *, - integer *, doublecomplex *, integer *, doublecomplex *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void ztbsv_(char *, char *, char *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -687,13 +687,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPBTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -734,7 +734,7 @@ f"> */ } } - return 0; + return; /* End of ZPBTRS */ diff --git a/lapack-netlib/SRC/zpftrf.c b/lapack-netlib/SRC/zpftrf.c index a5a3348895..f2231e3318 100644 --- a/lapack-netlib/SRC/zpftrf.c +++ b/lapack-netlib/SRC/zpftrf.c @@ -727,7 +727,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zpftrf_(char *transr, char *uplo, integer *n, +/* Subroutine */ void zpftrf_(char *transr, char *uplo, integer *n, 
doublecomplex *a, integer *info) { /* System generated locals */ @@ -737,15 +737,15 @@ f"> */ integer k; logical normaltransr; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zherk_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zherk_(char *, char *, integer *, integer *, doublereal *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *); logical lower; integer n1, n2; - extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *), - xerbla_(char *, integer *, ftnlen); + doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical nisodd; extern /* Subroutine */ int zpotrf_(char *, integer *, doublecomplex *, integer *, integer *); @@ -775,13 +775,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPFTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* If N is odd, set NISODD = .TRUE. 
*/ @@ -822,7 +822,7 @@ f"> */ zpotrf_("L", &n1, a, n, info); if (*info > 0) { - return 0; + return; } ztrsm_("R", "L", "C", "N", &n2, &n1, &c_b1, a, n, &a[n1], n); zherk_("U", "N", &n2, &n1, &c_b15, &a[n1], n, &c_b16, &a[*n], @@ -840,7 +840,7 @@ f"> */ zpotrf_("L", &n1, &a[n2], n, info); if (*info > 0) { - return 0; + return; } ztrsm_("L", "L", "N", "N", &n1, &n2, &c_b1, &a[n2], n, a, n); zherk_("U", "C", &n2, &n1, &c_b15, a, n, &c_b16, &a[n1], n); @@ -863,7 +863,7 @@ f"> */ zpotrf_("U", &n1, a, &n1, info); if (*info > 0) { - return 0; + return; } ztrsm_("L", "U", "C", "N", &n1, &n2, &c_b1, a, &n1, &a[n1 * n1], &n1); @@ -882,7 +882,7 @@ f"> */ zpotrf_("U", &n1, &a[n2 * n2], &n2, info); if (*info > 0) { - return 0; + return; } ztrsm_("R", "U", "N", "N", &n2, &n1, &c_b1, &a[n2 * n2], &n2, a, &n2); @@ -914,7 +914,7 @@ f"> */ i__1 = *n + 1; zpotrf_("L", &k, &a[1], &i__1, info); if (*info > 0) { - return 0; + return; } i__1 = *n + 1; i__2 = *n + 1; @@ -939,7 +939,7 @@ f"> */ i__1 = *n + 1; zpotrf_("L", &k, &a[k + 1], &i__1, info); if (*info > 0) { - return 0; + return; } i__1 = *n + 1; i__2 = *n + 1; @@ -969,7 +969,7 @@ f"> */ zpotrf_("U", &k, &a[k], &k, info); if (*info > 0) { - return 0; + return; } ztrsm_("L", "U", "C", "N", &k, &k, &c_b1, &a[k], &n1, &a[k * ( k + 1)], &k); @@ -988,7 +988,7 @@ f"> */ zpotrf_("U", &k, &a[k * (k + 1)], &k, info); if (*info > 0) { - return 0; + return; } ztrsm_("R", "U", "N", "N", &k, &k, &c_b1, &a[k * (k + 1)], &k, a, &k); @@ -1004,7 +1004,7 @@ f"> */ } - return 0; + return; /* End of ZPFTRF */ diff --git a/lapack-netlib/SRC/zpftri.c b/lapack-netlib/SRC/zpftri.c index dc931ef70a..52ab6aff53 100644 --- a/lapack-netlib/SRC/zpftri.c +++ b/lapack-netlib/SRC/zpftri.c @@ -727,7 +727,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zpftri_(char *transr, char *uplo, integer *n, +/* Subroutine */ void zpftri_(char *transr, char *uplo, integer *n, 
doublecomplex *a, integer *info) { /* System generated locals */ @@ -737,18 +737,19 @@ f"> */ integer k; logical normaltransr; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zherk_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zherk_(char *, char *, integer *, integer *, doublereal *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *); logical lower; integer n1, n2; - extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrmm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *), - xerbla_(char *, integer *, ftnlen); + doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical nisodd; extern /* Subroutine */ int zlauum_(char *, integer *, doublecomplex *, - integer *, integer *), ztftri_(char *, char *, char *, + integer *, integer *); + extern void ztftri_(char *, char *, char *, integer *, doublecomplex *, integer *); @@ -776,20 +777,20 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPFTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Invert the triangular Cholesky factor U or L. */ ztftri_(transr, uplo, "N", n, a, info); if (*info > 0) { - return 0; + return; } /* If N is odd, set NISODD = .TRUE. 
*/ @@ -964,7 +965,7 @@ f"> */ } - return 0; + return; /* End of ZPFTRI */ diff --git a/lapack-netlib/SRC/zpftrs.c b/lapack-netlib/SRC/zpftrs.c index 8e7cc1fc08..26aaaa61cf 100644 --- a/lapack-netlib/SRC/zpftrs.c +++ b/lapack-netlib/SRC/zpftrs.c @@ -733,7 +733,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zpftrs_(char *transr, char *uplo, integer *n, integer * +/* Subroutine */ void zpftrs_(char *transr, char *uplo, integer *n, integer * nrhs, doublecomplex *a, doublecomplex *b, integer *ldb, integer *info) { /* System generated locals */ @@ -743,9 +743,10 @@ f"> */ logical normaltransr; extern logical lsame_(char *, char *); logical lower; - extern /* Subroutine */ int ztfsm_(char *, char *, char *, char *, char *, + extern /* Subroutine */ void ztfsm_(char *, char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, - doublecomplex *, integer *), xerbla_(char *, integer *, ftnlen); + doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -782,13 +783,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPFTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* start execution: there are two triangular solves */ @@ -805,7 +806,7 @@ f"> */ ldb); } - return 0; + return; /* End of ZPFTRS */ diff --git a/lapack-netlib/SRC/zpocon.c b/lapack-netlib/SRC/zpocon.c index a54c42c8a9..4eecb13160 100644 --- a/lapack-netlib/SRC/zpocon.c +++ b/lapack-netlib/SRC/zpocon.c @@ -633,7 +633,7 @@ f"> */ /* > \ingroup complex16POcomputational */ /* ===================================================================== */ -/* Subroutine */ int zpocon_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zpocon_(char *uplo, integer *n, doublecomplex *a, integer *lda, doublereal *anorm, 
doublereal *rcond, doublecomplex * work, doublereal *rwork, integer *info) { @@ -647,7 +647,7 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, doublecomplex *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *); integer ix; @@ -655,11 +655,11 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; extern integer izamax_(integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zdrscl_(integer *, doublereal *, + extern /* Subroutine */ void zdrscl_(integer *, doublereal *, doublecomplex *, integer *); char normin[1]; doublereal smlnum; - extern /* Subroutine */ int zlatrs_(char *, char *, char *, char *, + extern /* Subroutine */ void zlatrs_(char *, char *, char *, char *, integer *, doublecomplex *, integer *, doublecomplex *, doublereal *, doublereal *, integer *); @@ -697,7 +697,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPOCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -705,9 +705,9 @@ f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm == 0.) 
{ - return 0; + return; } smlnum = dlamch_("Safe minimum"); @@ -767,7 +767,7 @@ f"> */ } L20: - return 0; + return; /* End of ZPOCON */ diff --git a/lapack-netlib/SRC/zpoequ.c b/lapack-netlib/SRC/zpoequ.c index f8a6c8dcbb..e3fcc48920 100644 --- a/lapack-netlib/SRC/zpoequ.c +++ b/lapack-netlib/SRC/zpoequ.c @@ -622,7 +622,7 @@ f"> */ /* > \ingroup complex16POcomputational */ /* ===================================================================== */ -/* Subroutine */ int zpoequ_(integer *n, doublecomplex *a, integer *lda, +/* Subroutine */ void zpoequ_(integer *n, doublecomplex *a, integer *lda, doublereal *s, doublereal *scond, doublereal *amax, integer *info) { /* System generated locals */ @@ -662,7 +662,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPOEQU", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -670,7 +670,7 @@ f"> */ if (*n == 0) { *scond = 1.; *amax = 0.; - return 0; + return; } /* Find the minimum and maximum diagonal elements. */ @@ -700,7 +700,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { if (s[i__] <= 0.) 
{ *info = i__; - return 0; + return; } /* L20: */ } @@ -719,7 +719,7 @@ f"> */ *scond = sqrt(smin) / sqrt(*amax); } - return 0; + return; /* End of ZPOEQU */ diff --git a/lapack-netlib/SRC/zpoequb.c b/lapack-netlib/SRC/zpoequb.c index 0ea3bb04c0..54c36e7721 100644 --- a/lapack-netlib/SRC/zpoequb.c +++ b/lapack-netlib/SRC/zpoequb.c @@ -628,7 +628,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16POcomputational */ /* ===================================================================== */ -/* Subroutine */ int zpoequb_(integer *n, doublecomplex *a, integer *lda, +/* Subroutine */ void zpoequb_(integer *n, doublecomplex *a, integer *lda, doublereal *s, doublereal *scond, doublereal *amax, integer *info) { /* System generated locals */ @@ -672,7 +672,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZPOEQUB", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible. */ @@ -680,7 +680,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*n == 0) { *scond = 1.; *amax = 0.; - return 0; + return; } base = dlamch_("B"); tmp = -.5 / log(base); @@ -713,7 +713,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ for (i__ = 1; i__ <= i__1; ++i__) { if (s[i__] <= 0.) 
{ *info = i__; - return 0; + return; } /* L20: */ } @@ -734,7 +734,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *scond = sqrt(smin) / sqrt(*amax); } - return 0; + return; /* End of ZPOEQUB */ diff --git a/lapack-netlib/SRC/zporfs.c b/lapack-netlib/SRC/zporfs.c index 4685fe16e0..9b963c44a1 100644 --- a/lapack-netlib/SRC/zporfs.c +++ b/lapack-netlib/SRC/zporfs.c @@ -696,7 +696,7 @@ f"> */ /* > \ingroup complex16POcomputational */ /* ===================================================================== */ -/* Subroutine */ int zporfs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zporfs_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *af, integer *ldaf, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *ferr, doublereal *berr, doublecomplex *work, doublereal * @@ -715,11 +715,11 @@ f"> */ doublereal s; extern logical lsame_(char *, char *); integer isave[3], count; - extern /* Subroutine */ int zhemv_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zhemv_(char *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), zlacn2_( integer *, doublecomplex *, doublecomplex *, doublereal *, @@ -730,7 +730,7 @@ f"> */ doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal lstres; - extern /* Subroutine */ int zpotrs_(char *, integer *, integer *, + extern /* Subroutine */ void zpotrs_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); doublereal eps; @@ -785,7 +785,7 @@ f"> */ if (*info != 0) { 
i__1 = -(*info); xerbla_("ZPORFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -797,7 +797,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -1030,7 +1030,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of ZPORFS */ diff --git a/lapack-netlib/SRC/zposv.c b/lapack-netlib/SRC/zposv.c index cdd420cbc5..a577d5315c 100644 --- a/lapack-netlib/SRC/zposv.c +++ b/lapack-netlib/SRC/zposv.c @@ -639,7 +639,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16POsolve */ /* ===================================================================== */ -/* Subroutine */ int zposv_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zposv_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, integer *info) { @@ -648,9 +648,10 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zpotrf_( - char *, integer *, doublecomplex *, integer *, integer *), - zpotrs_(char *, integer *, integer *, doublecomplex *, integer *, + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int zpotrf_( + char *, integer *, doublecomplex *, integer *, integer *); + extern void zpotrs_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -689,7 +690,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZPOSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the Cholesky factorization A = U**H *U or A = L*L**H. 
*/ @@ -702,7 +703,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ zpotrs_(uplo, n, nrhs, &a[a_offset], lda, &b[b_offset], ldb, info); } - return 0; + return; /* End of ZPOSV */ diff --git a/lapack-netlib/SRC/zposvx.c b/lapack-netlib/SRC/zposvx.c index a2adfa4c82..10032d7582 100644 --- a/lapack-netlib/SRC/zposvx.c +++ b/lapack-netlib/SRC/zposvx.c @@ -813,7 +813,7 @@ f"> */ /* > \ingroup complex16POsolve */ /* ===================================================================== */ -/* Subroutine */ int zposvx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void zposvx_(char *fact, char *uplo, integer *n, integer * nrhs, doublecomplex *a, integer *lda, doublecomplex *af, integer * ldaf, char *equed, doublereal *s, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *rcond, doublereal *ferr, @@ -838,23 +838,24 @@ f"> */ doublereal bignum; extern doublereal zlanhe_(char *, char *, integer *, doublecomplex *, integer *, doublereal *); - extern /* Subroutine */ int zlaqhe_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zlaqhe_(char *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublereal *, char *); integer infequ; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zpocon_(char *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, doublereal *, integer *) ; doublereal smlnum; - extern /* Subroutine */ int zpoequ_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zpoequ_(integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublereal *, integer *), zporfs_( char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, - doublecomplex *, 
doublereal *, integer *), zpotrf_(char *, - integer *, doublecomplex *, integer *, integer *), - zpotrs_(char *, integer *, integer *, doublecomplex *, integer *, + doublecomplex *, doublereal *, integer *); + extern int zpotrf_(char *, + integer *, doublecomplex *, integer *, integer *); + extern void zpotrs_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -951,7 +952,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPOSVX", &i__1, (ftnlen)6); - return 0; + return; } if (equil) { @@ -997,7 +998,7 @@ f"> */ if (*info > 0) { *rcond = 0.; - return 0; + return; } } @@ -1052,7 +1053,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of ZPOSVX */ diff --git a/lapack-netlib/SRC/zposvxx.c b/lapack-netlib/SRC/zposvxx.c index c27adcb1b8..b6473cf983 100644 --- a/lapack-netlib/SRC/zposvxx.c +++ b/lapack-netlib/SRC/zposvxx.c @@ -999,7 +999,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16POsolve */ /* ===================================================================== */ -/* Subroutine */ int zposvxx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void zposvxx_(char *fact, char *uplo, integer *n, integer * nrhs, doublecomplex *a, integer *lda, doublecomplex *af, integer * ldaf, char *equed, doublereal *s, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *rcond, doublereal *rpvgrw, @@ -1025,13 +1025,13 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ logical nofact; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; - extern /* Subroutine */ int zlaqhe_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zlaqhe_(char *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublereal *, char *); integer infequ; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void 
zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal smlnum; - extern /* Subroutine */ int zpotrf_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zpotrf_(char *, integer *, doublecomplex *, integer *, integer *), zpotrs_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), zlascl2_(integer *, integer *, doublereal *, @@ -1150,7 +1150,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZPOSVXX", &i__1, (ftnlen)7); - return 0; + return; } if (equil) { @@ -1190,7 +1190,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rpvgrw = zla_porpvgrw_(uplo, n, &a[a_offset], lda, &af[ af_offset], ldaf, &rwork[1]); - return 0; + return; } } @@ -1219,7 +1219,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ zlascl2_(n, nrhs, &s[1], &x[x_offset], ldx); } - return 0; + return; /* End of ZPOSVXX */ diff --git a/lapack-netlib/SRC/zpotf2.c b/lapack-netlib/SRC/zpotf2.c index 1d84397cc1..594f338025 100644 --- a/lapack-netlib/SRC/zpotf2.c +++ b/lapack-netlib/SRC/zpotf2.c @@ -624,7 +624,7 @@ f"> */ /* > \ingroup complex16POcomputational */ /* ===================================================================== */ -/* Subroutine */ int zpotf2_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zpotf2_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *info) { /* System generated locals */ @@ -637,12 +637,13 @@ f"> */ extern logical lsame_(char *, char *); extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, 
doublecomplex *, integer *); logical upper; extern logical disnan_(doublereal *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *), zlacgv_( integer *, doublecomplex *, integer *); doublereal ajj; @@ -677,13 +678,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPOTF2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -780,7 +781,7 @@ f"> */ *info = j; L40: - return 0; + return; /* End of ZPOTF2 */ diff --git a/lapack-netlib/SRC/zpotrf.c b/lapack-netlib/SRC/zpotrf.c index 91590a111d..99dd13bdec 100644 --- a/lapack-netlib/SRC/zpotrf.c +++ b/lapack-netlib/SRC/zpotrf.c @@ -624,7 +624,7 @@ f"> */ /* > \ingroup complex16POcomputational */ /* ===================================================================== */ -/* Subroutine */ int zpotrf_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zpotrf_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *info) { /* System generated locals */ @@ -634,21 +634,21 @@ f"> */ /* Local variables */ integer j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zherk_(char *, char *, integer *, integer *, doublereal *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); integer jb, nb; extern /* Subroutine */ int xerbla_(char 
*, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zpotrf2_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zpotrf2_(char *, integer *, doublecomplex *, integer *, integer *); @@ -681,13 +681,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPOTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Determine the block size for this environment. */ @@ -789,7 +789,7 @@ f"> */ *info = *info + j - 1; L40: - return 0; + return; /* End of ZPOTRF */ diff --git a/lapack-netlib/SRC/zpotrf2.c b/lapack-netlib/SRC/zpotrf2.c index e0529676c9..0b773aae6d 100644 --- a/lapack-netlib/SRC/zpotrf2.c +++ b/lapack-netlib/SRC/zpotrf2.c @@ -618,7 +618,7 @@ static doublereal c_b12 = 1.; /* > \ingroup complex16POcomputational */ /* ===================================================================== */ -/* Subroutine */ int zpotrf2_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zpotrf2_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *info) { /* System generated locals */ @@ -628,12 +628,12 @@ static doublereal c_b12 = 1.; /* Local variables */ extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int zherk_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zherk_(char *, char *, integer *, integer *, doublereal *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *); logical upper; integer n1, n2; - extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); extern logical disnan_(doublereal *); @@ -670,13 +670,13 @@ static doublereal c_b12 = 1.; if (*info != 0) { i__1 = -(*info); xerbla_("ZPOTRF2", &i__1, (ftnlen)7); - 
return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* N=1 case */ @@ -689,7 +689,7 @@ static doublereal c_b12 = 1.; ajj = a[i__1].r; if (ajj <= 0. || disnan_(&ajj)) { *info = 1; - return 0; + return; } /* Factor */ @@ -709,7 +709,7 @@ static doublereal c_b12 = 1.; zpotrf2_(uplo, &n1, &a[a_dim1 + 1], lda, &iinfo); if (iinfo != 0) { *info = iinfo; - return 0; + return; } /* Compute the Cholesky factorization A = U**H*U */ @@ -728,7 +728,7 @@ static doublereal c_b12 = 1.; zpotrf2_(uplo, &n2, &a[n1 + 1 + (n1 + 1) * a_dim1], lda, &iinfo); if (iinfo != 0) { *info = iinfo + n1; - return 0; + return; } /* Compute the Cholesky factorization A = L*L**H */ @@ -747,11 +747,11 @@ static doublereal c_b12 = 1.; zpotrf2_(uplo, &n2, &a[n1 + 1 + (n1 + 1) * a_dim1], lda, &iinfo); if (iinfo != 0) { *info = iinfo + n1; - return 0; + return; } } } - return 0; + return; /* End of ZPOTRF2 */ diff --git a/lapack-netlib/SRC/zpotri.c b/lapack-netlib/SRC/zpotri.c index fdafb8368f..92b0f844bd 100644 --- a/lapack-netlib/SRC/zpotri.c +++ b/lapack-netlib/SRC/zpotri.c @@ -604,7 +604,7 @@ f"> */ /* > \ingroup complex16POcomputational */ /* ===================================================================== */ -/* Subroutine */ int zpotri_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zpotri_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *info) { /* System generated locals */ @@ -612,9 +612,10 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlauum_( - char *, integer *, doublecomplex *, integer *, integer *), - ztrtri_(char *, char *, integer *, doublecomplex *, integer *, + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int zlauum_( + char *, integer *, doublecomplex *, integer *, integer *); + extern int ztrtri_(char *, char *, integer *, doublecomplex *, integer *, integer *); @@ -646,27 +647,27 @@ f"> */ 
if (*info != 0) { i__1 = -(*info); xerbla_("ZPOTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Invert the triangular Cholesky factor U or L. */ ztrtri_(uplo, "Non-unit", n, &a[a_offset], lda, info); if (*info > 0) { - return 0; + return; } /* Form inv(U) * inv(U)**H or inv(L)**H * inv(L). */ zlauum_(uplo, n, &a[a_offset], lda, info); - return 0; + return; /* End of ZPOTRI */ diff --git a/lapack-netlib/SRC/zpotrs.c b/lapack-netlib/SRC/zpotrs.c index 8a6f598b4f..ddbe496e5a 100644 --- a/lapack-netlib/SRC/zpotrs.c +++ b/lapack-netlib/SRC/zpotrs.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup complex16POcomputational */ /* ===================================================================== */ -/* Subroutine */ int zpotrs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zpotrs_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, integer *info) { @@ -633,10 +633,10 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *), - xerbla_(char *, integer *, ftnlen); + doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -675,13 +675,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPOTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -712,7 +712,7 @@ f"> */ c_b1, &a[a_offset], lda, &b[b_offset], ldb); } - return 0; + return; /* End of ZPOTRS */ diff --git a/lapack-netlib/SRC/zppcon.c b/lapack-netlib/SRC/zppcon.c index cc1931f5ee..541c330afe 100644 --- a/lapack-netlib/SRC/zppcon.c +++ 
b/lapack-netlib/SRC/zppcon.c @@ -631,7 +631,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zppcon_(char *uplo, integer *n, doublecomplex *ap, +/* Subroutine */ void zppcon_(char *uplo, integer *n, doublecomplex *ap, doublereal *anorm, doublereal *rcond, doublecomplex *work, doublereal *rwork, integer *info) { @@ -645,7 +645,7 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, doublecomplex *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *); integer ix; @@ -653,11 +653,11 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; extern integer izamax_(integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zdrscl_(integer *, doublereal *, + extern /* Subroutine */ void zdrscl_(integer *, doublereal *, doublecomplex *, integer *); char normin[1]; doublereal smlnum; - extern /* Subroutine */ int zlatps_(char *, char *, char *, char *, + extern /* Subroutine */ void zlatps_(char *, char *, char *, char *, integer *, doublecomplex *, doublecomplex *, doublereal *, doublereal *, integer *); @@ -691,7 +691,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPPCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -699,9 +699,9 @@ f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm == 0.) 
{ - return 0; + return; } smlnum = dlamch_("Safe minimum"); @@ -761,7 +761,7 @@ f"> */ } L20: - return 0; + return; /* End of ZPPCON */ diff --git a/lapack-netlib/SRC/zppequ.c b/lapack-netlib/SRC/zppequ.c index 57b6e3189f..7b7d7be22f 100644 --- a/lapack-netlib/SRC/zppequ.c +++ b/lapack-netlib/SRC/zppequ.c @@ -626,7 +626,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zppequ_(char *uplo, integer *n, doublecomplex *ap, +/* Subroutine */ void zppequ_(char *uplo, integer *n, doublecomplex *ap, doublereal *s, doublereal *scond, doublereal *amax, integer *info) { /* System generated locals */ @@ -668,7 +668,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPPEQU", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -676,7 +676,7 @@ f"> */ if (*n == 0) { *scond = 1.; *amax = 0.; - return 0; + return; } /* Initialize SMIN and AMAX. */ @@ -734,7 +734,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { if (s[i__] <= 0.) 
{ *info = i__; - return 0; + return; } /* L30: */ } @@ -753,7 +753,7 @@ f"> */ *scond = sqrt(smin) / sqrt(*amax); } - return 0; + return; /* End of ZPPEQU */ diff --git a/lapack-netlib/SRC/zpprfs.c b/lapack-netlib/SRC/zpprfs.c index b079c22092..652d53cbc0 100644 --- a/lapack-netlib/SRC/zpprfs.c +++ b/lapack-netlib/SRC/zpprfs.c @@ -684,7 +684,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zpprfs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zpprfs_(char *uplo, integer *n, integer *nrhs, doublecomplex *ap, doublecomplex *afp, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *ferr, doublereal *berr, doublecomplex *work, doublereal *rwork, integer *info) @@ -702,7 +702,7 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3], count; logical upper; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zhpmv_(char *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zaxpy_( @@ -716,7 +716,7 @@ f"> */ doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal lstres; - extern /* Subroutine */ int zpptrs_(char *, integer *, integer *, + extern /* Subroutine */ void zpptrs_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); doublereal eps; @@ -763,7 +763,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPPRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -775,7 +775,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -1015,7 +1015,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of ZPPRFS */ diff --git 
a/lapack-netlib/SRC/zppsv.c b/lapack-netlib/SRC/zppsv.c index 4fe6b86553..4788368fcf 100644 --- a/lapack-netlib/SRC/zppsv.c +++ b/lapack-netlib/SRC/zppsv.c @@ -653,7 +653,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zppsv_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zppsv_(char *uplo, integer *n, integer *nrhs, doublecomplex *ap, doublecomplex *b, integer *ldb, integer *info) { /* System generated locals */ @@ -661,7 +661,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zpptrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zpptrf_( char *, integer *, doublecomplex *, integer *), zpptrs_( char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); @@ -698,7 +699,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZPPSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the Cholesky factorization A = U**H *U or A = L*L**H. 
*/ @@ -711,7 +712,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ zpptrs_(uplo, n, nrhs, &ap[1], &b[b_offset], ldb, info); } - return 0; + return; /* End of ZPPSV */ diff --git a/lapack-netlib/SRC/zppsvx.c b/lapack-netlib/SRC/zppsvx.c index 448854e55f..ff1cb299e8 100644 --- a/lapack-netlib/SRC/zppsvx.c +++ b/lapack-netlib/SRC/zppsvx.c @@ -823,7 +823,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zppsvx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void zppsvx_(char *fact, char *uplo, integer *n, integer * nrhs, doublecomplex *ap, doublecomplex *afp, char *equed, doublereal * s, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *rcond, doublereal *ferr, doublereal *berr, doublecomplex * @@ -840,7 +840,7 @@ f"> */ extern logical lsame_(char *, char *); doublereal scond, anorm; logical equil, rcequ; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern doublereal dlamch_(char *); logical nofact; @@ -849,14 +849,14 @@ f"> */ integer infequ; extern doublereal zlanhp_(char *, char *, integer *, doublecomplex *, doublereal *); - extern /* Subroutine */ int zlaqhp_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zlaqhp_(char *, integer *, doublecomplex *, doublereal *, doublereal *, doublereal *, char *), zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zppcon_(char *, integer *, doublecomplex *, doublereal *, doublereal *, doublecomplex *, doublereal *, integer *); doublereal smlnum; - extern /* Subroutine */ int zppequ_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zppequ_(char *, integer *, doublecomplex *, doublereal *, doublereal *, doublereal *, integer *), zpprfs_(char *, 
integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, @@ -951,7 +951,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPPSVX", &i__1, (ftnlen)6); - return 0; + return; } if (equil) { @@ -998,7 +998,7 @@ f"> */ if (*info > 0) { *rcond = 0.; - return 0; + return; } } @@ -1051,7 +1051,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of ZPPSVX */ diff --git a/lapack-netlib/SRC/zpptrf.c b/lapack-netlib/SRC/zpptrf.c index 885e0c2b18..8e099c203c 100644 --- a/lapack-netlib/SRC/zpptrf.c +++ b/lapack-netlib/SRC/zpptrf.c @@ -633,7 +633,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zpptrf_(char *uplo, integer *n, doublecomplex *ap, +/* Subroutine */ void zpptrf_(char *uplo, integer *n, doublecomplex *ap, integer *info) { /* System generated locals */ @@ -642,17 +642,18 @@ f"> */ doublecomplex z__1, z__2; /* Local variables */ - extern /* Subroutine */ int zhpr_(char *, integer *, doublereal *, + extern /* Subroutine */ void zhpr_(char *, integer *, doublereal *, doublecomplex *, integer *, doublecomplex *); integer j; extern logical lsame_(char *, char *); extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int ztpsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ztpsv_(char *, char *, char *, integer *, doublecomplex *, doublecomplex *, integer *); integer jc, jj; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); doublereal ajj; @@ -682,13 +683,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPPTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if 
(upper) { @@ -769,7 +770,7 @@ f"> */ *info = j; L40: - return 0; + return; /* End of ZPPTRF */ diff --git a/lapack-netlib/SRC/zpptri.c b/lapack-netlib/SRC/zpptri.c index b9095bc1cd..215e343793 100644 --- a/lapack-netlib/SRC/zpptri.c +++ b/lapack-netlib/SRC/zpptri.c @@ -607,7 +607,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zpptri_(char *uplo, integer *n, doublecomplex *ap, +/* Subroutine */ void zpptri_(char *uplo, integer *n, doublecomplex *ap, integer *info) { /* System generated locals */ @@ -616,17 +616,18 @@ f"> */ doublecomplex z__1; /* Local variables */ - extern /* Subroutine */ int zhpr_(char *, integer *, doublereal *, + extern /* Subroutine */ void zhpr_(char *, integer *, doublereal *, doublecomplex *, integer *, doublecomplex *); integer j; extern logical lsame_(char *, char *); extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int ztpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ztpmv_(char *, char *, char *, integer *, doublecomplex *, doublecomplex *, integer *); integer jc, jj; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *), ztptri_( char *, char *, integer *, doublecomplex *, integer *); doublereal ajj; @@ -658,20 +659,20 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPPTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Invert the triangular Cholesky factor U or L. 
*/ ztptri_(uplo, "Non-unit", n, &ap[1], info); if (*info > 0) { - return 0; + return; } if (upper) { @@ -715,7 +716,7 @@ f"> */ } } - return 0; + return; /* End of ZPPTRI */ diff --git a/lapack-netlib/SRC/zpptrs.c b/lapack-netlib/SRC/zpptrs.c index 2f1f091674..4ede1c717a 100644 --- a/lapack-netlib/SRC/zpptrs.c +++ b/lapack-netlib/SRC/zpptrs.c @@ -621,7 +621,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zpptrs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zpptrs_(char *uplo, integer *n, integer *nrhs, doublecomplex *ap, doublecomplex *b, integer *ldb, integer *info) { /* System generated locals */ @@ -631,8 +631,9 @@ f"> */ integer i__; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int ztpsv_(char *, char *, char *, integer *, - doublecomplex *, doublecomplex *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void ztpsv_(char *, char *, char *, integer *, + doublecomplex *, doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -667,13 +668,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPPTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -714,7 +715,7 @@ f"> */ } } - return 0; + return; /* End of ZPPTRS */ diff --git a/lapack-netlib/SRC/zpstf2.c b/lapack-netlib/SRC/zpstf2.c index ad3f89b2dd..9fa691324c 100644 --- a/lapack-netlib/SRC/zpstf2.c +++ b/lapack-netlib/SRC/zpstf2.c @@ -658,7 +658,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zpstf2_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zpstf2_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *piv, 
integer *rank, doublereal *tol, doublereal *work, integer *info) { @@ -673,17 +673,18 @@ f"> */ extern logical lsame_(char *, char *); doublereal dtemp; integer itemp; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); doublereal dstop; logical upper; doublecomplex ztemp; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern doublereal dlamch_(char *); extern logical disnan_(doublereal *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *), zlacgv_( integer *, doublecomplex *, integer *); doublereal ajj; @@ -721,13 +722,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPSTF2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Initialize PIV */ @@ -996,7 +997,7 @@ f"> */ *info = 1; L200: - return 0; + return; /* End of ZPSTF2 */ diff --git a/lapack-netlib/SRC/zpstrf.c b/lapack-netlib/SRC/zpstrf.c index d84e7566b6..056f93b46c 100644 --- a/lapack-netlib/SRC/zpstrf.c +++ b/lapack-netlib/SRC/zpstrf.c @@ -661,7 +661,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zpstrf_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zpstrf_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *piv, integer *rank, doublereal *tol, doublereal *work, integer *info) { @@ -676,7 +676,7 @@ f"> */ extern logical lsame_(char *, char *); doublereal dtemp; integer itemp; - extern /* Subroutine */ 
int zherk_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zherk_(char *, char *, integer *, integer *, doublereal *, doublecomplex *, integer *, doublereal *, doublecomplex *, integer *), zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, @@ -685,18 +685,18 @@ f"> */ doublereal dstop; logical upper; doublecomplex ztemp; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer jb, nb; extern doublereal dlamch_(char *); - extern /* Subroutine */ int zpstf2_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zpstf2_(char *, integer *, doublecomplex *, integer *, integer *, integer *, doublereal *, doublereal *, integer *); extern logical disnan_(doublereal *); extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zdscal_(integer *, doublereal *, + extern /* Subroutine */ void zdscal_(integer *, doublereal *, doublecomplex *, integer *), zlacgv_(integer *, doublecomplex *, integer *); doublereal ajj; @@ -734,13 +734,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPSTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get block size */ @@ -1081,7 +1081,7 @@ f"> */ *info = 1; L230: - return 0; + return; /* End of ZPSTRF */ diff --git a/lapack-netlib/SRC/zptcon.c b/lapack-netlib/SRC/zptcon.c index d764fcf2f3..45e748a6ba 100644 --- a/lapack-netlib/SRC/zptcon.c +++ b/lapack-netlib/SRC/zptcon.c @@ -632,7 +632,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zptcon_(integer *n, doublereal *d__, doublecomplex *e, +/* Subroutine */ void zptcon_(integer *n, 
doublereal *d__, doublecomplex *e, doublereal *anorm, doublereal *rcond, doublereal *rwork, integer * info) { @@ -673,7 +673,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPTCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -681,9 +681,9 @@ f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm == 0.) { - return 0; + return; } /* Check that D(1:N) is positive. */ @@ -691,7 +691,7 @@ f"> */ i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { if (d__[i__] <= 0.) { - return 0; + return; } /* L10: */ } @@ -731,7 +731,7 @@ f"> */ *rcond = 1. / ainvnm / *anorm; } - return 0; + return; /* End of ZPTCON */ diff --git a/lapack-netlib/SRC/zpteqr.c b/lapack-netlib/SRC/zpteqr.c index 9c88679bab..e557bcc7c6 100644 --- a/lapack-netlib/SRC/zpteqr.c +++ b/lapack-netlib/SRC/zpteqr.c @@ -661,7 +661,7 @@ f"> */ /* > \ingroup complex16PTcomputational */ /* ===================================================================== */ -/* Subroutine */ int zpteqr_(char *compz, integer *n, doublereal *d__, +/* Subroutine */ void zpteqr_(char *compz, integer *n, doublereal *d__, doublereal *e, doublecomplex *z__, integer *ldz, doublereal *work, integer *info) { @@ -675,7 +675,7 @@ f"> */ doublecomplex vt[1] /* was [1][1] */; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer icompz; - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *), dpttrf_(integer *, doublereal *, doublereal *, integer *) , zbdsqr_(char *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, @@ -725,13 +725,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPTEQR", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -739,7 +739,7 @@ f"> */ i__1 = z_dim1 + 1; 
z__[i__1].r = 1., z__[i__1].i = 0.; } - return 0; + return; } if (icompz == 2) { zlaset_("Full", n, n, &c_b1, &c_b2, &z__[z_offset], ldz); @@ -749,7 +749,7 @@ f"> */ dpttrf_(n, &d__[1], &e[1], info); if (*info != 0) { - return 0; + return; } i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { @@ -785,7 +785,7 @@ f"> */ *info = *n + *info; } - return 0; + return; /* End of ZPTEQR */ diff --git a/lapack-netlib/SRC/zptrfs.c b/lapack-netlib/SRC/zptrfs.c index 6f1c83b174..3bf4915774 100644 --- a/lapack-netlib/SRC/zptrfs.c +++ b/lapack-netlib/SRC/zptrfs.c @@ -696,7 +696,7 @@ f"> */ /* > \ingroup complex16PTcomputational */ /* ===================================================================== */ -/* Subroutine */ int zptrfs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zptrfs_(char *uplo, integer *n, integer *nrhs, doublereal *d__, doublecomplex *e, doublereal *df, doublecomplex *ef, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *ferr, doublereal *berr, doublecomplex *work, doublereal * @@ -716,7 +716,7 @@ f"> */ extern logical lsame_(char *, char *); integer count; logical upper; - extern /* Subroutine */ int zaxpy_(integer *, doublecomplex *, + extern /* Subroutine */ void zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex bi; extern doublereal dlamch_(char *); @@ -727,7 +727,7 @@ f"> */ doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal lstres; - extern /* Subroutine */ int zpttrs_(char *, integer *, integer *, + extern /* Subroutine */ void zpttrs_(char *, integer *, integer *, doublereal *, doublecomplex *, doublecomplex *, integer *, integer *); doublereal eps; @@ -777,7 +777,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPTRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -789,7 +789,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero 
elements in each row of A, plus 1 */ @@ -1146,7 +1146,7 @@ f"> */ /* L100: */ } - return 0; + return; /* End of ZPTRFS */ diff --git a/lapack-netlib/SRC/zptsv.c b/lapack-netlib/SRC/zptsv.c index 96447a5547..37e8534659 100644 --- a/lapack-netlib/SRC/zptsv.c +++ b/lapack-netlib/SRC/zptsv.c @@ -624,14 +624,15 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16PTsolve */ /* ===================================================================== */ -/* Subroutine */ int zptsv_(integer *n, integer *nrhs, doublereal *d__, +/* Subroutine */ void zptsv_(integer *n, integer *nrhs, doublereal *d__, doublecomplex *e, doublecomplex *b, integer *ldb, integer *info) { /* System generated locals */ integer b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zpttrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zpttrf_( integer *, doublereal *, doublecomplex *, integer *), zpttrs_( char *, integer *, integer *, doublereal *, doublecomplex *, doublecomplex *, integer *, integer *); @@ -667,7 +668,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZPTSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the L*D*L**H (or U**H*D*U) factorization of A. 
*/ @@ -679,7 +680,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ zpttrs_("Lower", n, nrhs, &d__[1], &e[1], &b[b_offset], ldb, info); } - return 0; + return; /* End of ZPTSV */ diff --git a/lapack-netlib/SRC/zptsvx.c b/lapack-netlib/SRC/zptsvx.c index 713766cceb..3ab57d2355 100644 --- a/lapack-netlib/SRC/zptsvx.c +++ b/lapack-netlib/SRC/zptsvx.c @@ -746,7 +746,7 @@ f"> */ /* > \ingroup complex16PTsolve */ /* ===================================================================== */ -/* Subroutine */ int zptsvx_(char *fact, integer *n, integer *nrhs, +/* Subroutine */ void zptsvx_(char *fact, integer *n, integer *nrhs, doublereal *d__, doublecomplex *e, doublereal *df, doublecomplex *ef, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *rcond, doublereal *ferr, doublereal *berr, doublecomplex * @@ -758,7 +758,7 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); doublereal anorm; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *), zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern doublereal dlamch_(char *); @@ -766,7 +766,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern doublereal zlanht_(char *, integer *, doublereal *, doublecomplex * ); - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zptcon_(integer *, doublereal *, doublecomplex *, doublereal *, doublereal *, doublereal *, integer *), zptrfs_(char *, integer *, @@ -823,7 +823,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPTSVX", &i__1, (ftnlen)6); - return 0; + return; } if (nofact) { @@ -841,7 +841,7 @@ f"> */ if (*info > 0) { *rcond = 0.; - return 0; + return; } } @@ -871,7 +871,7 @@ f"> */ 
*info = *n + 1; } - return 0; + return; /* End of ZPTSVX */ diff --git a/lapack-netlib/SRC/zpttrf.c b/lapack-netlib/SRC/zpttrf.c index b9d1c48aff..e2c271f904 100644 --- a/lapack-netlib/SRC/zpttrf.c +++ b/lapack-netlib/SRC/zpttrf.c @@ -601,7 +601,7 @@ f"> */ /* > \ingroup complex16PTcomputational */ /* ===================================================================== */ -/* Subroutine */ int zpttrf_(integer *n, doublereal *d__, doublecomplex *e, +/* Subroutine */ void zpttrf_(integer *n, doublereal *d__, doublecomplex *e, integer *info) { /* System generated locals */ @@ -636,13 +636,13 @@ f"> */ *info = -1; i__1 = -(*info); xerbla_("ZPTTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Compute the L*D*L**H (or U**H *D*U) factorization of A. */ @@ -749,7 +749,7 @@ f"> */ } L30: - return 0; + return; /* End of ZPTTRF */ diff --git a/lapack-netlib/SRC/zpttrs.c b/lapack-netlib/SRC/zpttrs.c index 7933361c5e..c9a0336fac 100644 --- a/lapack-netlib/SRC/zpttrs.c +++ b/lapack-netlib/SRC/zpttrs.c @@ -635,7 +635,7 @@ f"> */ /* > \ingroup complex16PTcomputational */ /* ===================================================================== */ -/* Subroutine */ int zpttrs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zpttrs_(char *uplo, integer *n, integer *nrhs, doublereal *d__, doublecomplex *e, doublecomplex *b, integer *ldb, integer *info) { @@ -646,9 +646,9 @@ f"> */ integer j, iuplo; logical upper; integer jb, nb; - extern /* Subroutine */ int zptts2_(integer *, integer *, integer *, - doublereal *, doublecomplex *, doublecomplex *, integer *), - xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void zptts2_(integer *, integer *, integer *, + doublereal *, doublecomplex *, doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); @@ -687,13 
+687,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZPTTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* Determine the number of right-hand sides to solve at a time. */ @@ -729,7 +729,7 @@ f"> */ } } - return 0; + return; /* End of ZPTTRS */ diff --git a/lapack-netlib/SRC/zptts2.c b/lapack-netlib/SRC/zptts2.c index f69b0ad2ad..9b6d2fb9f9 100644 --- a/lapack-netlib/SRC/zptts2.c +++ b/lapack-netlib/SRC/zptts2.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup complex16PTcomputational */ /* ===================================================================== */ -/* Subroutine */ int zptts2_(integer *iuplo, integer *n, integer *nrhs, +/* Subroutine */ void zptts2_(integer *iuplo, integer *n, integer *nrhs, doublereal *d__, doublecomplex *e, doublecomplex *b, integer *ldb) { /* System generated locals */ @@ -633,7 +633,7 @@ f"> */ /* Local variables */ integer i__, j; - extern /* Subroutine */ int zdscal_(integer *, doublereal *, + extern /* Subroutine */ void zdscal_(integer *, doublereal *, doublecomplex *, integer *); @@ -661,7 +661,7 @@ f"> */ d__1 = 1. 
/ d__[1]; zdscal_(nrhs, &d__1, &b[b_offset], ldb); } - return 0; + return; } if (*iuplo == 1) { @@ -860,7 +860,7 @@ f"> */ } } - return 0; + return; /* End of ZPTTS2 */ diff --git a/lapack-netlib/SRC/zrot.c b/lapack-netlib/SRC/zrot.c index 04cefe53db..da9b200374 100644 --- a/lapack-netlib/SRC/zrot.c +++ b/lapack-netlib/SRC/zrot.c @@ -613,7 +613,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zrot_(integer *n, doublecomplex *cx, integer *incx, +/* Subroutine */ void zrot_(integer *n, doublecomplex *cx, integer *incx, doublecomplex *cy, integer *incy, doublereal *c__, doublecomplex *s) { /* System generated locals */ @@ -641,7 +641,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Function Body */ if (*n <= 0) { - return 0; + return; } if (*incx == 1 && *incy == 1) { goto L20; @@ -681,7 +681,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ iy += *incy; /* L10: */ } - return 0; + return; /* Code for both increments equal to 1 */ @@ -708,6 +708,6 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ cx[i__2].r = stemp.r, cx[i__2].i = stemp.i; /* L30: */ } - return 0; + return; } /* zrot_ */ diff --git a/lapack-netlib/SRC/zspcon.c b/lapack-netlib/SRC/zspcon.c index a2cb64d810..21cc74cdbe 100644 --- a/lapack-netlib/SRC/zspcon.c +++ b/lapack-netlib/SRC/zspcon.c @@ -631,7 +631,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zspcon_(char *uplo, integer *n, doublecomplex *ap, +/* Subroutine */ void zspcon_(char *uplo, integer *n, doublecomplex *ap, integer *ipiv, doublereal *anorm, doublereal *rcond, doublecomplex * work, integer *info) { @@ -643,12 +643,12 @@ f"> */ extern logical 
lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, doublecomplex *, doublereal *, integer *, integer *); integer ip; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; - extern /* Subroutine */ int zsptrs_(char *, integer *, integer *, + extern /* Subroutine */ void zsptrs_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -681,7 +681,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSPCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -689,9 +689,9 @@ f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm <= 0.) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -704,7 +704,7 @@ f"> */ for (i__ = *n; i__ >= 1; --i__) { i__1 = ip; if (ipiv[i__] > 0 && (ap[i__1].r == 0. && ap[i__1].i == 0.)) { - return 0; + return; } ip -= i__; /* L10: */ @@ -718,7 +718,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { i__2 = ip; if (ipiv[i__] > 0 && (ap[i__2].r == 0. && ap[i__2].i == 0.)) { - return 0; + return; } ip = ip + *n - i__ + 1; /* L20: */ @@ -744,7 +744,7 @@ f"> */ *rcond = 1. 
/ ainvnm / *anorm; } - return 0; + return; /* End of ZSPCON */ diff --git a/lapack-netlib/SRC/zspmv.c b/lapack-netlib/SRC/zspmv.c index 3fe5eb7510..e0bd139b5b 100644 --- a/lapack-netlib/SRC/zspmv.c +++ b/lapack-netlib/SRC/zspmv.c @@ -661,7 +661,7 @@ rix */ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zspmv_(char *uplo, integer *n, doublecomplex *alpha, +/* Subroutine */ void zspmv_(char *uplo, integer *n, doublecomplex *alpha, doublecomplex *ap, doublecomplex *x, integer *incx, doublecomplex * beta, doublecomplex *y, integer *incy) { @@ -707,14 +707,14 @@ rix */ } if (info != 0) { xerbla_("ZSPMV ", &info, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ if (*n == 0 || alpha->r == 0. && alpha->i == 0. && (beta->r == 1. && beta->i == 0.)) { - return 0; + return; } /* Set up the start points in X and Y. */ @@ -782,7 +782,7 @@ rix */ } } if (alpha->r == 0. && alpha->i == 0.) { - return 0; + return; } kk = 1; if (lsame_(uplo, "U")) { @@ -983,7 +983,7 @@ rix */ } } - return 0; + return; /* End of ZSPMV */ diff --git a/lapack-netlib/SRC/zspr.c b/lapack-netlib/SRC/zspr.c index 2e7b5e871f..9aa4b9b039 100644 --- a/lapack-netlib/SRC/zspr.c +++ b/lapack-netlib/SRC/zspr.c @@ -641,7 +641,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16OTHERauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zspr_(char *uplo, integer *n, doublecomplex *alpha, +/* Subroutine */ void zspr_(char *uplo, integer *n, doublecomplex *alpha, doublecomplex *x, integer *incx, doublecomplex *ap) { /* System generated locals */ @@ -683,13 +683,13 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } if (info != 0) { xerbla_("ZSPR ", &info, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ if (*n == 0 || alpha->r == 0. 
&& alpha->i == 0.) { - return 0; + return; } /* Set the start point in X if the increment is not unity. */ @@ -884,7 +884,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } - return 0; + return; /* End of ZSPR */ diff --git a/lapack-netlib/SRC/zsprfs.c b/lapack-netlib/SRC/zsprfs.c index d3372722f5..2a8b995824 100644 --- a/lapack-netlib/SRC/zsprfs.c +++ b/lapack-netlib/SRC/zsprfs.c @@ -693,7 +693,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zsprfs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zsprfs_(char *uplo, integer *n, integer *nrhs, doublecomplex *ap, doublecomplex *afp, integer *ipiv, doublecomplex * b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *ferr, doublereal *berr, doublecomplex *work, doublereal *rwork, integer * @@ -712,7 +712,7 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3], count; logical upper; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), zspmv_( char *, integer *, doublecomplex *, doublecomplex *, @@ -726,7 +726,7 @@ f"> */ doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal lstres; - extern /* Subroutine */ int zsptrs_(char *, integer *, integer *, + extern /* Subroutine */ void zsptrs_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); doublereal eps; @@ -774,7 +774,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSPRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -786,7 +786,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, 
plus 1 */ @@ -1026,7 +1026,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of ZSPRFS */ diff --git a/lapack-netlib/SRC/zspsv.c b/lapack-netlib/SRC/zspsv.c index 4798081b28..31c89538e5 100644 --- a/lapack-netlib/SRC/zspsv.c +++ b/lapack-netlib/SRC/zspsv.c @@ -671,7 +671,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zspsv_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zspsv_(char *uplo, integer *n, integer *nrhs, doublecomplex *ap, integer *ipiv, doublecomplex *b, integer *ldb, integer *info) { @@ -680,7 +680,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zsptrf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zsptrf_( char *, integer *, doublecomplex *, integer *, integer *), zsptrs_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -718,7 +719,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZSPSV ", &i__1, (ftnlen)6); - return 0; + return; } /* Compute the factorization A = U*D*U**T or A = L*D*L**T. 
*/ @@ -731,7 +732,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ zsptrs_(uplo, n, nrhs, &ap[1], &ipiv[1], &b[b_offset], ldb, info); } - return 0; + return; /* End of ZSPSV */ diff --git a/lapack-netlib/SRC/zspsvx.c b/lapack-netlib/SRC/zspsvx.c index ceb5b014d7..f51ca179eb 100644 --- a/lapack-netlib/SRC/zspsvx.c +++ b/lapack-netlib/SRC/zspsvx.c @@ -789,7 +789,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zspsvx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void zspsvx_(char *fact, char *uplo, integer *n, integer * nrhs, doublecomplex *ap, doublecomplex *afp, integer *ipiv, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *rcond, doublereal *ferr, doublereal *berr, doublecomplex * @@ -801,16 +801,17 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); doublereal anorm; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern doublereal dlamch_(char *); logical nofact; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlacpy_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlacpy_( char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern doublereal zlansp_(char *, char *, integer *, doublecomplex *, doublereal *); - extern /* Subroutine */ int zspcon_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zspcon_(char *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, integer *), zsprfs_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, @@ -866,7 +867,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); 
xerbla_("ZSPSVX", &i__1, (ftnlen)6); - return 0; + return; } if (nofact) { @@ -881,7 +882,7 @@ f"> */ if (*info > 0) { *rcond = 0.; - return 0; + return; } } @@ -910,7 +911,7 @@ f"> */ *info = *n + 1; } - return 0; + return; /* End of ZSPSVX */ diff --git a/lapack-netlib/SRC/zsptrf.c b/lapack-netlib/SRC/zsptrf.c index d3623c7fd8..e423f831e8 100644 --- a/lapack-netlib/SRC/zsptrf.c +++ b/lapack-netlib/SRC/zsptrf.c @@ -672,7 +672,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zsptrf_(char *uplo, integer *n, doublecomplex *ap, +/* Subroutine */ void zsptrf_(char *uplo, integer *n, doublecomplex *ap, integer *ipiv, integer *info) { /* System generated locals */ @@ -682,18 +682,18 @@ f"> */ /* Local variables */ integer imax, jmax; - extern /* Subroutine */ int zspr_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zspr_(char *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); integer i__, j, k; doublecomplex t; doublereal alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); integer kstep; logical upper; doublecomplex r1; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex d11, d12, d21, d22; integer kc, kk, kp; @@ -734,7 +734,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSPTRF", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. 
*/ @@ -1296,7 +1296,7 @@ f"> */ } L110: - return 0; + return; /* End of ZSPTRF */ diff --git a/lapack-netlib/SRC/zsptri.c b/lapack-netlib/SRC/zsptri.c index 1c537f14bc..7a3d976313 100644 --- a/lapack-netlib/SRC/zsptri.c +++ b/lapack-netlib/SRC/zsptri.c @@ -624,7 +624,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zsptri_(char *uplo, integer *n, doublecomplex *ap, +/* Subroutine */ void zsptri_(char *uplo, integer *n, doublecomplex *ap, integer *ipiv, doublecomplex *work, integer *info) { /* System generated locals */ @@ -638,11 +638,11 @@ f"> */ extern logical lsame_(char *, char *); integer kstep; logical upper; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern /* Double Complex */ VOID zdotu_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zspmv_(char *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); @@ -680,13 +680,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSPTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -699,7 +699,7 @@ f"> */ for (*info = *n; *info >= 1; --(*info)) { i__1 = kp; if (ipiv[*info] > 0 && (ap[i__1].r == 0. && ap[i__1].i == 0.)) { - return 0; + return; } kp -= *info; /* L10: */ @@ -713,7 +713,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = kp; if (ipiv[*info] > 0 && (ap[i__2].r == 0. 
&& ap[i__2].i == 0.)) { - return 0; + return; } kp = kp + *n - *info + 1; /* L20: */ @@ -1047,7 +1047,7 @@ f"> */ ; } - return 0; + return; /* End of ZSPTRI */ diff --git a/lapack-netlib/SRC/zsptrs.c b/lapack-netlib/SRC/zsptrs.c index e6e76a7936..45cb7a79af 100644 --- a/lapack-netlib/SRC/zsptrs.c +++ b/lapack-netlib/SRC/zsptrs.c @@ -629,7 +629,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zsptrs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zsptrs_(char *uplo, integer *n, integer *nrhs, doublecomplex *ap, integer *ipiv, doublecomplex *b, integer *ldb, integer *info) { @@ -642,12 +642,12 @@ f"> */ integer j, k; extern logical lsame_(char *, char *); doublecomplex denom; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int zgeru_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeru_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); @@ -688,13 +688,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSPTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -1049,7 +1049,7 @@ f"> */ ; } - return 0; + return; /* End of ZSPTRS */ diff --git a/lapack-netlib/SRC/zstedc.c b/lapack-netlib/SRC/zstedc.c index 4cfc418403..b757471331 100644 --- a/lapack-netlib/SRC/zstedc.c +++ b/lapack-netlib/SRC/zstedc.c @@ -728,7 +728,7 @@ f"> */ /* > at Berkeley, USA */ /* 
===================================================================== */ -/* Subroutine */ int zstedc_(char *compz, integer *n, doublereal *d__, +/* Subroutine */ void zstedc_(char *compz, integer *n, doublereal *d__, doublereal *e, doublecomplex *z__, integer *ldz, doublecomplex *work, integer *lwork, doublereal *rwork, integer *lrwork, integer *iwork, integer *liwork, integer *info) @@ -743,36 +743,37 @@ f"> */ doublereal p; extern logical lsame_(char *, char *); integer lwmin, start; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlaed0_(integer *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, integer *, integer *); integer ii, ll; extern doublereal dlamch_(char *); - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *), dstedc_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *, integer *, integer *, integer *), dlaset_( char *, integer *, integer *, doublereal *, doublereal *, - doublereal *, integer *), xerbla_(char *, integer *, ftnlen); + doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer finish; extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *); - extern /* Subroutine */ int dsterf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dsterf_(integer *, doublereal *, doublereal *, integer *), zlacrm_(integer *, integer *, doublecomplex *, integer *, doublereal *, integer *, doublecomplex *, integer *, doublereal *); integer liwmin, icompz; - extern /* Subroutine */ int 
dsteqr_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsteqr_(char *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, integer *), zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal orgnrm; integer lrwmin; logical lquery; integer smlsiz; - extern /* Subroutine */ int zsteqr_(char *, integer *, doublereal *, + extern /* Subroutine */ void zsteqr_(char *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublereal *, integer *); integer lgn; doublereal eps; @@ -836,10 +837,10 @@ f"> */ lrwmin = *n - 1 << 1; } else if (icompz == 1) { lgn = (integer) (log((doublereal) (*n)) / log(2.)); - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } lwmin = *n * *n; @@ -870,22 +871,22 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSTEDC", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { if (icompz != 0) { i__1 = z_dim1 + 1; z__[i__1].r = 1., z__[i__1].i = 0.; } - return 0; + return; } /* If the following conditional clause is removed, then the routine */ @@ -1050,7 +1051,7 @@ f"> */ rwork[1] = (doublereal) lrwmin; iwork[1] = liwmin; - return 0; + return; /* End of ZSTEDC */ diff --git a/lapack-netlib/SRC/zstegr.c b/lapack-netlib/SRC/zstegr.c index 6d5961e613..d685404cdc 100644 --- a/lapack-netlib/SRC/zstegr.c +++ b/lapack-netlib/SRC/zstegr.c @@ -772,7 +772,7 @@ f"> */ /* > Christof Voemel, LBNL/NERSC, USA \n */ /* ===================================================================== */ -/* Subroutine */ int zstegr_(char *jobz, char *range, integer *n, doublereal * +/* Subroutine */ void zstegr_(char *jobz, char *range, integer *n, doublereal * d__, doublereal *e, doublereal *vl, doublereal *vu, integer *il, integer *iu, doublereal *abstol, 
integer *m, doublereal *w, doublecomplex *z__, integer *ldz, integer *isuppz, doublereal *work, @@ -783,7 +783,7 @@ f"> */ /* Local variables */ logical tryrac; - extern /* Subroutine */ int zstemr_(char *, char *, integer *, doublereal + extern /* Subroutine */ void zstemr_(char *, char *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, integer *, doublereal *, doublecomplex *, integer *, integer *, integer *, logical *, doublereal *, integer *, integer *, integer @@ -818,6 +818,6 @@ f"> */ /* End of ZSTEGR */ - return 0; + return; } /* zstegr_ */ diff --git a/lapack-netlib/SRC/zstein.c b/lapack-netlib/SRC/zstein.c index 4f47b75d8d..bbe439c485 100644 --- a/lapack-netlib/SRC/zstein.c +++ b/lapack-netlib/SRC/zstein.c @@ -696,7 +696,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zstein_(integer *n, doublereal *d__, doublereal *e, +/* Subroutine */ void zstein_(integer *n, doublereal *d__, doublereal *e, integer *m, doublereal *w, integer *iblock, integer *isplit, doublecomplex *z__, integer *ldz, doublereal *work, integer *iwork, integer *ifail, integer *info) @@ -710,26 +710,27 @@ f"> */ integer jblk, nblk, jmax; extern doublereal dnrm2_(integer *, doublereal *, integer *); integer i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); integer iseed[4], gpind, iinfo; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer b1, j1; doublereal ortol; integer indrv1, indrv2, indrv3, indrv4, indrv5, bn; extern doublereal dlamch_(char *); integer jr; - extern /* Subroutine */ int dlagtf_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlagtf_(integer *, doublereal 
*, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer * , integer *); doublereal xj; extern integer idamax_(integer *, doublereal *, integer *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), dlagts_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dlagts_( integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer nrmchk; - extern /* Subroutine */ int dlarnv_(integer *, integer *, integer *, + extern /* Subroutine */ void dlarnv_(integer *, integer *, integer *, doublereal *); integer blksiz; doublereal onenrm, dtpcrt, pertol, scl, eps, sep, nrm, tol; @@ -795,17 +796,17 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSTEIN", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *m == 0) { - return 0; + return; } else if (*n == 1) { i__1 = z_dim1 + 1; z__[i__1].r = 1., z__[i__1].i = 0.; - return 0; + return; } /* Get machine constants. 
*/ @@ -1033,7 +1034,7 @@ f"> */ ; } - return 0; + return; /* End of ZSTEIN */ diff --git a/lapack-netlib/SRC/zstemr.c b/lapack-netlib/SRC/zstemr.c index dc594da6d1..28af7cff56 100644 --- a/lapack-netlib/SRC/zstemr.c +++ b/lapack-netlib/SRC/zstemr.c @@ -850,7 +850,7 @@ f"> */ /* > Christof Voemel, University of California, Berkeley, USA \n */ /* ===================================================================== */ -/* Subroutine */ int zstemr_(char *jobz, char *range, integer *n, doublereal * +/* Subroutine */ void zstemr_(char *jobz, char *range, integer *n, doublereal * d__, doublereal *e, doublereal *vl, doublereal *vu, integer *il, integer *iu, integer *m, doublereal *w, doublecomplex *z__, integer * ldz, integer *nzc, integer *isuppz, logical *tryrac, doublereal *work, @@ -865,23 +865,23 @@ f"> */ doublereal rmin, rmax; integer itmp; doublereal tnrm; - extern /* Subroutine */ int dlae2_(doublereal *, doublereal *, doublereal + extern /* Subroutine */ void dlae2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); integer inde2, itmp2; doublereal rtol1, rtol2; integer i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); doublereal scale; integer indgp; extern logical lsame_(char *, char *); integer iinfo, iindw, ilast; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer lwmin; logical wantz; doublereal r1, r2; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), dlaev2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); @@ -893,7 +893,7 @@ f"> */ integer ibegin, iindbl; doublereal sn, wl; logical valeig; - extern /* 
Subroutine */ int dlarrc_(char *, integer *, doublereal *, + extern /* Subroutine */ void dlarrc_(char *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, integer *, integer *), dlarre_(char *, integer *, doublereal *, doublereal *, integer *, integer *, @@ -903,21 +903,22 @@ f"> */ doublereal *, doublereal *, doublereal *, integer *, integer *); integer wbegin; doublereal safmin, wu; - extern /* Subroutine */ int dlarrj_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlarrj_(integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, integer *, doublereal *, doublereal *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal bignum; integer inderr, iindwk, indgrs, offset; extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *); - extern /* Subroutine */ int dlarrr_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dlarrr_(integer *, doublereal *, doublereal *, integer *), dlasrt_(char *, integer *, doublereal *, integer *); doublereal thresh; integer iinspl, indwrk, ifirst, liwmin, nzcmin; doublereal pivmin; integer nsplit; doublereal smlnum; - extern /* Subroutine */ int zlarrv_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void zlarrv_(integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, integer *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, integer *, integer *, @@ -1045,16 +1046,16 @@ f"> */ i__1 = -(*info); xerbla_("ZSTEMR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery || zquery) { - return 0; + return; } /* Handle N = 0, 1, and 2 cases immediately */ *m = 0; if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -1073,7 +1074,7 @@ f"> */ isuppz[1] = 1; isuppz[2] = 1; } - return 
0; + return; } if (*n == 2) { @@ -1226,7 +1227,7 @@ f"> */ work[indgrs], &pivmin, &work[indwrk], &iwork[iindwk], &iinfo); if (iinfo != 0) { *info = abs(iinfo) + 10; - return 0; + return; } /* Note that if RANGE .NE. 'V', DLARRE computes bounds on the desired */ /* part of the spectrum. All desired eigenvalues are contained in */ @@ -1243,7 +1244,7 @@ f"> */ iindwk], &iinfo); if (iinfo != 0) { *info = abs(iinfo) + 20; - return 0; + return; } } else { /* DLARRE computes eigenvalues of the (shifted) root representation */ @@ -1312,7 +1313,7 @@ f"> */ dlasrt_("I", m, &w[1], &iinfo); if (iinfo != 0) { *info = 3; - return 0; + return; } } else { i__1 = *m - 1; @@ -1349,7 +1350,7 @@ f"> */ work[1] = (doublereal) lwmin; iwork[1] = liwmin; - return 0; + return; /* End of ZSTEMR */ diff --git a/lapack-netlib/SRC/zsteqr.c b/lapack-netlib/SRC/zsteqr.c index 486596f914..42782baab0 100644 --- a/lapack-netlib/SRC/zsteqr.c +++ b/lapack-netlib/SRC/zsteqr.c @@ -650,7 +650,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zsteqr_(char *compz, integer *n, doublereal *d__, +/* Subroutine */ void zsteqr_(char *compz, integer *n, doublereal *d__, doublereal *e, doublecomplex *z__, integer *ldz, doublereal *work, integer *info) { @@ -660,17 +660,17 @@ f"> */ /* Local variables */ integer lend, jtot; - extern /* Subroutine */ int dlae2_(doublereal *, doublereal *, doublereal + extern /* Subroutine */ void dlae2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal b, c__, f, g; integer i__, j, k, l, m; doublereal p, r__, s; extern logical lsame_(char *, char *); doublereal anorm; - extern /* Subroutine */ int zlasr_(char *, char *, char *, integer *, + extern /* Subroutine */ void zlasr_(char *, char *, char *, integer *, integer *, doublereal *, doublereal *, doublecomplex *, integer *); integer l1; - extern /* Subroutine */ int zswap_(integer *, 
doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), dlaev2_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); @@ -679,22 +679,22 @@ f"> */ integer ii; extern doublereal dlamch_(char *); integer mm, iscale; - extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + extern /* Subroutine */ void dlascl_(char *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *, integer *); doublereal safmin; - extern /* Subroutine */ int dlartg_(doublereal *, doublereal *, + extern /* Subroutine */ void dlartg_(doublereal *, doublereal *, doublereal *, doublereal *, doublereal *); doublereal safmax; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern doublereal dlanst_(char *, integer *, doublereal *, doublereal *); - extern /* Subroutine */ int dlasrt_(char *, integer *, doublereal *, + extern /* Subroutine */ void dlasrt_(char *, integer *, doublereal *, integer *); integer lendsv; doublereal ssfmin; integer nmaxit, icompz; doublereal ssfmax; - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); integer lm1, mm1, nm1; doublereal rt1, rt2, eps; @@ -743,13 +743,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSTEQR", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -757,7 +757,7 @@ f"> */ i__1 = z_dim1 + 1; z__[i__1].r = 1., z__[i__1].i = 0.; } - return 0; + return; } /* Determine the unit roundoff and over/underflow thresholds. 
*/ @@ -1130,7 +1130,7 @@ f"> */ } /* L150: */ } - return 0; + return; } goto L10; @@ -1169,7 +1169,7 @@ f"> */ /* L180: */ } } - return 0; + return; /* End of ZSTEQR */ diff --git a/lapack-netlib/SRC/zsycon.c b/lapack-netlib/SRC/zsycon.c index 0fa46d844b..2842546af9 100644 --- a/lapack-netlib/SRC/zsycon.c +++ b/lapack-netlib/SRC/zsycon.c @@ -637,7 +637,7 @@ f"> */ /* > \ingroup complex16SYcomputational */ /* ===================================================================== */ -/* Subroutine */ int zsycon_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zsycon_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, doublereal *anorm, doublereal *rcond, doublecomplex *work, integer *info) { @@ -649,11 +649,11 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, - doublecomplex *, doublereal *, integer *, integer *), xerbla_( - char *, integer *, ftnlen); + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, + doublecomplex *, doublereal *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; - extern /* Subroutine */ int zsytrs_(char *, integer *, integer *, + extern /* Subroutine */ void zsytrs_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); @@ -691,7 +691,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -699,9 +699,9 @@ f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm <= 0.) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -713,7 +713,7 @@ f"> */ for (i__ = *n; i__ >= 1; --i__) { i__1 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__1].r == 0. 
&& a[i__1].i == 0.)) { - return 0; + return; } /* L10: */ } @@ -725,7 +725,7 @@ f"> */ for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__2].r == 0. && a[i__2].i == 0.)) { - return 0; + return; } /* L20: */ } @@ -751,7 +751,7 @@ f"> */ *rcond = 1. / ainvnm / *anorm; } - return 0; + return; /* End of ZSYCON */ diff --git a/lapack-netlib/SRC/zsycon_3.c b/lapack-netlib/SRC/zsycon_3.c index 41454b37bf..cd0bf7f96c 100644 --- a/lapack-netlib/SRC/zsycon_3.c +++ b/lapack-netlib/SRC/zsycon_3.c @@ -678,7 +678,7 @@ static integer c__1 = 1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zsycon_3_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zsycon_3_(char *uplo, integer *n, doublecomplex *a, integer *lda, doublecomplex *e, integer *ipiv, doublereal *anorm, doublereal *rcond, doublecomplex *work, integer *info) { @@ -687,16 +687,16 @@ static integer c__1 = 1; /* Local variables */ integer kase; - extern /* Subroutine */ int zsytrs_3_(char *, integer *, integer *, + extern /* Subroutine */ void zsytrs_3_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); integer i__; extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, - doublecomplex *, doublereal *, integer *, integer *), xerbla_( - char *, integer *, ftnlen); + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, + doublecomplex *, doublereal *, integer *, integer *); + extern int xerbla_( char *, integer *, ftnlen); doublereal ainvnm; @@ -734,7 +734,7 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("ZSYCON_3", &i__1, (ftnlen)8); - return 0; + return; } /* Quick return if possible */ @@ -742,9 +742,9 @@ static integer c__1 = 1; *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if 
(*anorm <= 0.) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -756,7 +756,7 @@ static integer c__1 = 1; for (i__ = *n; i__ >= 1; --i__) { i__1 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__1].r == 0. && a[i__1].i == 0.)) { - return 0; + return; } } } else { @@ -767,7 +767,7 @@ static integer c__1 = 1; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__2].r == 0. && a[i__2].i == 0.)) { - return 0; + return; } } } @@ -792,7 +792,7 @@ static integer c__1 = 1; *rcond = 1. / ainvnm / *anorm; } - return 0; + return; /* End of ZSYCON_3 */ diff --git a/lapack-netlib/SRC/zsycon_rook.c b/lapack-netlib/SRC/zsycon_rook.c index 2c2b6aea91..0fe88cbdb7 100644 --- a/lapack-netlib/SRC/zsycon_rook.c +++ b/lapack-netlib/SRC/zsycon_rook.c @@ -651,7 +651,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zsycon_rook_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zsycon_rook_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, doublereal *anorm, doublereal *rcond, doublecomplex *work, integer *info) { @@ -660,16 +660,16 @@ rook.f"> */ /* Local variables */ integer kase; - extern /* Subroutine */ int zsytrs_rook_(char *, integer *, integer *, + extern /* Subroutine */ void zsytrs_rook_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); integer i__; extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, - doublecomplex *, doublereal *, integer *, integer *), xerbla_( - char *, integer *, ftnlen); + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, + doublecomplex *, doublereal *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal ainvnm; @@ -706,7 +706,7 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); 
xerbla_("ZSYCON_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Quick return if possible */ @@ -714,9 +714,9 @@ rook.f"> */ *rcond = 0.; if (*n == 0) { *rcond = 1.; - return 0; + return; } else if (*anorm <= 0.) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -728,7 +728,7 @@ rook.f"> */ for (i__ = *n; i__ >= 1; --i__) { i__1 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__1].r == 0. && a[i__1].i == 0.)) { - return 0; + return; } /* L10: */ } @@ -740,7 +740,7 @@ rook.f"> */ for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__ + i__ * a_dim1; if (ipiv[i__] > 0 && (a[i__2].r == 0. && a[i__2].i == 0.)) { - return 0; + return; } /* L20: */ } @@ -766,7 +766,7 @@ rook.f"> */ *rcond = 1. / ainvnm / *anorm; } - return 0; + return; /* End of ZSYCON_ROOK */ diff --git a/lapack-netlib/SRC/zsyconv.c b/lapack-netlib/SRC/zsyconv.c index c8afcec2d8..b0acec29a5 100644 --- a/lapack-netlib/SRC/zsyconv.c +++ b/lapack-netlib/SRC/zsyconv.c @@ -623,7 +623,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16SYcomputational */ /* ===================================================================== */ -/* Subroutine */ int zsyconv_(char *uplo, char *way, integer *n, +/* Subroutine */ void zsyconv_(char *uplo, char *way, integer *n, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *e, integer *info) { @@ -673,13 +673,13 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYCONV", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -930,7 +930,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } - return 0; + return; /* End of ZSYCONV */ diff --git a/lapack-netlib/SRC/zsyconvf.c b/lapack-netlib/SRC/zsyconvf.c index 927199525a..1313bb4e65 100644 --- a/lapack-netlib/SRC/zsyconvf.c +++ 
b/lapack-netlib/SRC/zsyconvf.c @@ -718,7 +718,7 @@ f.f"> */ /* > */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zsyconvf_(char *uplo, char *way, integer *n, +/* Subroutine */ void zsyconvf_(char *uplo, char *way, integer *n, doublecomplex *a, integer *lda, doublecomplex *e, integer *ipiv, integer *info) { @@ -729,7 +729,7 @@ f.f"> */ integer i__; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer ip; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -769,13 +769,13 @@ f.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYCONVF", &i__1, (ftnlen)8); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -1091,7 +1091,7 @@ f.f"> */ /* End A is LOWER */ } - return 0; + return; /* End of ZSYCONVF */ diff --git a/lapack-netlib/SRC/zsyconvf_rook.c b/lapack-netlib/SRC/zsyconvf_rook.c index 3362b91e0b..05cefa81a2 100644 --- a/lapack-netlib/SRC/zsyconvf_rook.c +++ b/lapack-netlib/SRC/zsyconvf_rook.c @@ -709,7 +709,7 @@ f_rook.f"> */ /* > */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zsyconvf_rook_(char *uplo, char *way, integer *n, +/* Subroutine */ void zsyconvf_rook_(char *uplo, char *way, integer *n, doublecomplex *a, integer *lda, doublecomplex *e, integer *ipiv, integer *info) { @@ -720,7 +720,7 @@ f_rook.f"> */ integer i__; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer ip; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -761,13 +761,13 @@ 
f_rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYCONVF_ROOK", &i__1, (ftnlen)13); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -1081,7 +1081,7 @@ f_rook.f"> */ /* End A is LOWER */ } - return 0; + return; /* End of ZSYCONVF_ROOK */ diff --git a/lapack-netlib/SRC/zsyequb.c b/lapack-netlib/SRC/zsyequb.c index c047b320e9..9f740710f6 100644 --- a/lapack-netlib/SRC/zsyequb.c +++ b/lapack-netlib/SRC/zsyequb.c @@ -645,7 +645,7 @@ static integer c__1 = 1; /* > Tech report version: http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.3.1679 */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zsyequb_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zsyequb_(char *uplo, integer *n, doublecomplex *a, integer *lda, doublereal *s, doublereal *scond, doublereal *amax, doublecomplex *work, integer *info) { @@ -667,7 +667,7 @@ static integer c__1 = 1; logical up; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum, smlnum; - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); doublereal avg, std, tol; @@ -702,7 +702,7 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("ZSYEQUB", &i__1, (ftnlen)7); - return 0; + return; } up = lsame_(uplo, "U"); *amax = 0.; @@ -711,7 +711,7 @@ static integer c__1 = 1; if (*n == 0) { *scond = 1.; - return 0; + return; } i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { @@ -906,7 +906,7 @@ static integer c__1 = 1; d__ = c1 * c1 - c0 * 4 * c2; if (d__ <= 0.) 
{ *info = -1; - return 0; + return; } si = c0 * -2 / (c1 + sqrt(d__)); d__ = si - s[i__]; @@ -993,6 +993,6 @@ static integer c__1 = 1; } *scond = f2cmax(smin,smlnum) / f2cmin(smax,bignum); - return 0; + return; } /* zsyequb_ */ diff --git a/lapack-netlib/SRC/zsymv.c b/lapack-netlib/SRC/zsymv.c index a14c33cd25..7729d3f439 100644 --- a/lapack-netlib/SRC/zsymv.c +++ b/lapack-netlib/SRC/zsymv.c @@ -666,7 +666,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16SYauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zsymv_(char *uplo, integer *n, doublecomplex *alpha, +/* Subroutine */ void zsymv_(char *uplo, integer *n, doublecomplex *alpha, doublecomplex *a, integer *lda, doublecomplex *x, integer *incx, doublecomplex *beta, doublecomplex *y, integer *incy) { @@ -716,14 +716,14 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } if (info != 0) { xerbla_("ZSYMV ", &info, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ if (*n == 0 || alpha->r == 0. && alpha->i == 0. && (beta->r == 1. && beta->i == 0.)) { - return 0; + return; } /* Set up the start points in X and Y. */ @@ -792,7 +792,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } if (alpha->r == 0. && alpha->i == 0.) 
{ - return 0; + return; } if (lsame_(uplo, "U")) { @@ -984,7 +984,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } - return 0; + return; /* End of ZSYMV */ diff --git a/lapack-netlib/SRC/zsyr.c b/lapack-netlib/SRC/zsyr.c index 6a2d5f8ba5..bacc1cf549 100644 --- a/lapack-netlib/SRC/zsyr.c +++ b/lapack-netlib/SRC/zsyr.c @@ -644,7 +644,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16SYauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zsyr_(char *uplo, integer *n, doublecomplex *alpha, +/* Subroutine */ void zsyr_(char *uplo, integer *n, doublecomplex *alpha, doublecomplex *x, integer *incx, doublecomplex *a, integer *lda) { /* System generated locals */ @@ -690,13 +690,13 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } if (info != 0) { xerbla_("ZSYR ", &info, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ if (*n == 0 || alpha->r == 0. && alpha->i == 0.) { - return 0; + return; } /* Set the start point in X if the increment is not unity. 
*/ @@ -835,7 +835,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } } - return 0; + return; /* End of ZSYR */ diff --git a/lapack-netlib/SRC/zsyrfs.c b/lapack-netlib/SRC/zsyrfs.c index c445301ac3..31a54c1209 100644 --- a/lapack-netlib/SRC/zsyrfs.c +++ b/lapack-netlib/SRC/zsyrfs.c @@ -705,7 +705,7 @@ f"> */ /* > \ingroup complex16SYcomputational */ /* ===================================================================== */ -/* Subroutine */ int zsyrfs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zsyrfs_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *af, integer *ldaf, integer *ipiv, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *ferr, doublereal *berr, doublecomplex *work, @@ -725,7 +725,7 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3], count; logical upper; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), zsymv_( char *, integer *, doublecomplex *, doublecomplex *, integer *, @@ -738,7 +738,7 @@ f"> */ doublereal safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal lstres; - extern /* Subroutine */ int zsytrs_(char *, integer *, integer *, + extern /* Subroutine */ void zsytrs_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); doublereal eps; @@ -795,7 +795,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -807,7 +807,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } /* NZ = maximum number of nonzero elements in each row of A, plus 1 */ @@ -1043,7 +1043,7 @@ f"> */ /* L140: */ } - return 0; + return; /* End of 
ZSYRFS */ diff --git a/lapack-netlib/SRC/zsysv.c b/lapack-netlib/SRC/zsysv.c index b42cb23c5f..ffa5a09f9f 100644 --- a/lapack-netlib/SRC/zsysv.c +++ b/lapack-netlib/SRC/zsysv.c @@ -683,7 +683,7 @@ static integer c_n1 = -1; /* > \ingroup complex16SYsolve */ /* ===================================================================== */ -/* Subroutine */ int zsysv_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zsysv_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *b, integer *ldb, doublecomplex *work, integer *lwork, integer *info) { @@ -695,7 +695,7 @@ static integer c_n1 = -1; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer lwkopt; logical lquery; - extern /* Subroutine */ int zsytrf_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zsytrf_(char *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *), zsytrs_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *), zsytrs2_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, @@ -754,9 +754,9 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("ZSYSV ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = U*D*U**T or A = L*D*L**T. 
*/ @@ -786,7 +786,7 @@ static integer c_n1 = -1; work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZSYSV */ diff --git a/lapack-netlib/SRC/zsysv.f b/lapack-netlib/SRC/zsysv.f index ed173dadca..44f1e25b14 100644 --- a/lapack-netlib/SRC/zsysv.f +++ b/lapack-netlib/SRC/zsysv.f @@ -223,7 +223,7 @@ SUBROUTINE ZSYSV( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, LWKOPT = 1 ELSE CALL ZSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = DBLE( WORK(1) ) + LWKOPT = INT( DBLE( WORK( 1 ) ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/zsysv_aa.c b/lapack-netlib/SRC/zsysv_aa.c index 2b07605705..2b226200ea 100644 --- a/lapack-netlib/SRC/zsysv_aa.c +++ b/lapack-netlib/SRC/zsysv_aa.c @@ -674,7 +674,7 @@ a.f"> */ /* > \ingroup complex16SYsolve */ /* ===================================================================== */ -/* Subroutine */ int zsysv_aa_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zsysv_aa_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *b, integer *ldb, doublecomplex *work, integer *lwork, integer *info) { @@ -684,10 +684,11 @@ a.f"> */ /* Local variables */ extern logical lsame_(char *, char *); integer lwkopt_sytrf__, lwkopt_sytrs__; - extern /* Subroutine */ int zsytrf_aa_(char *, integer *, doublecomplex * + extern /* Subroutine */ void zsytrf_aa_(char *, integer *, doublecomplex * , integer *, integer *, doublecomplex *, integer *, integer *), zsytrs_aa_(char *, integer *, integer *, doublecomplex * , integer *, integer *, doublecomplex *, integer *, doublecomplex - *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer lwkopt; logical lquery; @@ -748,9 +749,9 @@ a.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYSV_AA ", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = 
U**T*T*U or A = L*T*L**T. */ @@ -767,7 +768,7 @@ a.f"> */ work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZSYSV_AA */ diff --git a/lapack-netlib/SRC/zsysv_aa_2stage.c b/lapack-netlib/SRC/zsysv_aa_2stage.c index 542cd702c5..d2ed5e3c8f 100644 --- a/lapack-netlib/SRC/zsysv_aa_2stage.c +++ b/lapack-netlib/SRC/zsysv_aa_2stage.c @@ -698,7 +698,7 @@ asen_2stage.f"> */ /* > \ingroup complex16SYcomputational */ /* ===================================================================== */ -/* Subroutine */ int zsysv_aa_2stage_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zsysv_aa_2stage_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *tb, integer *ltb, integer *ipiv, integer *ipiv2, doublecomplex *b, integer *ldb, doublecomplex *work, integer *lwork, integer *info) @@ -707,7 +707,7 @@ asen_2stage.f"> */ integer a_dim1, a_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int zsytrf_aa_2stage_(char *, integer *, + extern /* Subroutine */ void zsytrf_aa_2stage_(char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, doublecomplex *, integer *, integer *), zsytrs_aa_2stage_(char *, integer *, integer *, doublecomplex *, @@ -774,9 +774,9 @@ asen_2stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYSV_AA_2STAGE", &i__1, (ftnlen)15); - return 0; + return; } else if (wquery || tquery) { - return 0; + return; } @@ -797,6 +797,6 @@ asen_2stage.f"> */ /* End of ZSYSV_AA_2STAGE */ - return 0; + return; } /* zsysv_aa_2stage__ */ diff --git a/lapack-netlib/SRC/zsysv_rk.c b/lapack-netlib/SRC/zsysv_rk.c index 82eef5a43c..01a75aaec8 100644 --- a/lapack-netlib/SRC/zsysv_rk.c +++ b/lapack-netlib/SRC/zsysv_rk.c @@ -740,7 +740,7 @@ k.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zsysv_rk_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void 
zsysv_rk_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *e, integer *ipiv, doublecomplex *b, integer *ldb, doublecomplex *work, integer *lwork, integer *info) @@ -749,13 +749,14 @@ k.f"> */ integer a_dim1, a_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int zsytrs_3_(char *, integer *, integer *, + extern /* Subroutine */ void zsytrs_3_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int zsytrf_rk_(char *, integer *, doublecomplex * + extern /* Subroutine */ void zsytrf_rk_(char *, integer *, doublecomplex * , integer *, doublecomplex *, integer *, doublecomplex *, integer - *, integer *), xerbla_(char *, integer *, ftnlen); + *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer lwkopt; logical lquery; @@ -813,9 +814,9 @@ k.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYSV_RK ", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = P*U*D*(U**T)*(P**T) or */ @@ -835,7 +836,7 @@ k.f"> */ work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZSYSV_RK */ diff --git a/lapack-netlib/SRC/zsysv_rk.f b/lapack-netlib/SRC/zsysv_rk.f index df828ee337..8d9fb82c87 100644 --- a/lapack-netlib/SRC/zsysv_rk.f +++ b/lapack-netlib/SRC/zsysv_rk.f @@ -280,7 +280,7 @@ SUBROUTINE ZSYSV_RK( UPLO, N, NRHS, A, LDA, E, IPIV, B, LDB, WORK, LWKOPT = 1 ELSE CALL ZSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = DBLE( WORK(1) ) + LWKOPT = INT( DBLE( WORK( 1 ) ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/zsysv_rook.c b/lapack-netlib/SRC/zsysv_rook.c index 8ddf820e4a..c0ea7bb1f7 100644 --- a/lapack-netlib/SRC/zsysv_rook.c +++ b/lapack-netlib/SRC/zsysv_rook.c @@ -717,7 +717,7 @@ ook.f"> */ /* > \endverbatim */ /* 
===================================================================== */ -/* Subroutine */ int zsysv_rook_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zsysv_rook_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *b, integer *ldb, doublecomplex *work, integer *lwork, integer *info) { @@ -725,7 +725,7 @@ ook.f"> */ integer a_dim1, a_offset, b_dim1, b_offset, i__1; /* Local variables */ - extern /* Subroutine */ int zsytrf_rook_(char *, integer *, + extern /* Subroutine */ void zsytrf_rook_(char *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *), zsytrs_rook_(char *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, @@ -788,9 +788,9 @@ ook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYSV_ROOK ", &i__1, (ftnlen)11); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Compute the factorization A = U*D*U**T or A = L*D*L**T. 
*/ @@ -809,7 +809,7 @@ ook.f"> */ work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZSYSV_ROOK */ diff --git a/lapack-netlib/SRC/zsysv_rook.f b/lapack-netlib/SRC/zsysv_rook.f index 7c9fb4bf64..7453395122 100644 --- a/lapack-netlib/SRC/zsysv_rook.f +++ b/lapack-netlib/SRC/zsysv_rook.f @@ -256,7 +256,7 @@ SUBROUTINE ZSYSV_ROOK( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK, LWKOPT = 1 ELSE CALL ZSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = DBLE( WORK(1) ) + LWKOPT = INT( DBLE( WORK( 1 ) ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/zsysvx.c b/lapack-netlib/SRC/zsysvx.c index 2d0b3ec97f..29302ff767 100644 --- a/lapack-netlib/SRC/zsysvx.c +++ b/lapack-netlib/SRC/zsysvx.c @@ -797,7 +797,7 @@ f"> */ /* > \ingroup complex16SYsolve */ /* ===================================================================== */ -/* Subroutine */ int zsysvx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void zsysvx_(char *fact, char *uplo, integer *n, integer * nrhs, doublecomplex *a, integer *lda, doublecomplex *af, integer * ldaf, integer *ipiv, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *rcond, doublereal *ferr, doublereal *berr, @@ -816,13 +816,13 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer lwkopt; logical lquery; extern doublereal zlansy_(char *, char *, integer *, doublecomplex *, integer *, doublereal *); - extern /* Subroutine */ int zsycon_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zsycon_(char *, integer *, doublecomplex *, integer *, integer *, doublereal *, doublereal *, doublecomplex *, integer *), 
zsyrfs_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, @@ -909,9 +909,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYSVX", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } if (nofact) { @@ -926,7 +926,7 @@ f"> */ if (*info > 0) { *rcond = 0.; - return 0; + return; } } @@ -960,7 +960,7 @@ f"> */ work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZSYSVX */ diff --git a/lapack-netlib/SRC/zsysvxx.c b/lapack-netlib/SRC/zsysvxx.c index 2968791eeb..ff18cc709c 100644 --- a/lapack-netlib/SRC/zsysvxx.c +++ b/lapack-netlib/SRC/zsysvxx.c @@ -1012,7 +1012,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16SYsolve */ /* ===================================================================== */ -/* Subroutine */ int zsysvxx_(char *fact, char *uplo, integer *n, integer * +/* Subroutine */ void zsysvxx_(char *fact, char *uplo, integer *n, integer * nrhs, doublecomplex *a, integer *lda, doublecomplex *af, integer * ldaf, integer *ipiv, char *equed, doublereal *s, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *rcond, @@ -1028,7 +1028,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ doublereal d__1, d__2; /* Local variables */ - extern /* Subroutine */ int zsyrfsx_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zsyrfsx_(char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer * , doublereal *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, integer *, doublereal *, @@ -1047,10 +1047,10 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; integer infequ; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + 
extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal smlnum; - extern /* Subroutine */ int zlaqsy_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zlaqsy_(char *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublereal *, char *), zsytrf_(char *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *), zlascl2_(integer *, integer *, doublereal *, doublecomplex *, integer *), zsytrs_(char *, integer *, integer *, @@ -1166,7 +1166,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYSVXX", &i__1, (ftnlen)7); - return 0; + return; } if (equil) { @@ -1211,7 +1211,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ *rpvgrw = zla_syrpvgrw_(uplo, n, info, &a[a_offset], lda, & af[af_offset], ldaf, &ipiv[1], &rwork[1]); } - return 0; + return; } } @@ -1243,7 +1243,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ zlascl2_(n, nrhs, &s[1], &x[x_offset], ldx); } - return 0; + return; /* End of ZSYSVXX */ diff --git a/lapack-netlib/SRC/zsyswapr.c b/lapack-netlib/SRC/zsyswapr.c index bc80be4eef..86f925d1a5 100644 --- a/lapack-netlib/SRC/zsyswapr.c +++ b/lapack-netlib/SRC/zsyswapr.c @@ -616,7 +616,7 @@ r.f"> */ /* > \ingroup complex16SYauxiliary */ /* ===================================================================== */ -/* Subroutine */ int zsyswapr_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zsyswapr_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *i1, integer *i2) { /* System generated locals */ @@ -626,7 +626,7 @@ r.f"> */ integer i__; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, 
integer *, doublecomplex *, integer *); doublecomplex tmp; @@ -736,6 +736,6 @@ r.f"> */ } } - return 0; + return; } /* zsyswapr_ */ diff --git a/lapack-netlib/SRC/zsyswapr.f b/lapack-netlib/SRC/zsyswapr.f index 1f1a878574..eb3c98c34b 100644 --- a/lapack-netlib/SRC/zsyswapr.f +++ b/lapack-netlib/SRC/zsyswapr.f @@ -57,16 +57,14 @@ *> *> \param[in,out] A *> \verbatim -*> A is COMPLEX*16 array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers -*> used to obtain the factor U or L as computed by ZSYTRF. -*> -*> On exit, if INFO = 0, the (symmetric) inverse of the original -*> matrix. If UPLO = 'U', the upper triangular part of the -*> inverse is formed and the part of A below the diagonal is not -*> referenced; if UPLO = 'L' the lower triangular part of the -*> inverse is formed and the part of A above the diagonal is -*> not referenced. +*> A is COMPLEX*16 array, dimension (LDA,*) +*> On entry, the N-by-N matrix A. On exit, the permuted matrix +*> where the rows I1 and I2 and columns I1 and I2 are interchanged. +*> If UPLO = 'U', the interchanges are applied to the upper +*> triangular part and the strictly lower triangular part of A is +*> not referenced; if UPLO = 'L', the interchanges are applied to +*> the lower triangular part and the part of A above the diagonal +*> is not referenced. *> \endverbatim *> *> \param[in] LDA @@ -109,14 +107,13 @@ SUBROUTINE ZSYSWAPR( UPLO, N, A, LDA, I1, I2) INTEGER I1, I2, LDA, N * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, N ) + COMPLEX*16 A( LDA, * ) * * ===================================================================== * * .. * .. Local Scalars .. LOGICAL UPPER - INTEGER I COMPLEX*16 TMP * * .. External Functions .. 
@@ -143,19 +140,12 @@ SUBROUTINE ZSYSWAPR( UPLO, N, A, LDA, I1, I2) A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1,I1+I) - A(I1,I1+I)=A(I1+I,I2) - A(I1+I,I2)=TMP - END DO + CALL ZSWAP( I2-I1-1, A(I1,I1+1), LDA, A(I1+1,I2), 1 ) * * third swap * - swap row I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I1,I) - A(I1,I)=A(I2,I) - A(I2,I)=TMP - END DO + IF ( I2.LT.N ) + $ CALL ZSWAP( N-I2, A(I1,I2+1), LDA, A(I2,I2+1), LDA ) * ELSE * @@ -171,19 +161,12 @@ SUBROUTINE ZSYSWAPR( UPLO, N, A, LDA, I1, I2) A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1+I,I1) - A(I1+I,I1)=A(I2,I1+I) - A(I2,I1+I)=TMP - END DO + CALL ZSWAP( I2-I1-1, A(I1+1,I1), 1, A(I2,I1+1), LDA ) * * third swap * - swap col I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I,I1) - A(I,I1)=A(I,I2) - A(I,I2)=TMP - END DO + IF ( I2.LT.N ) + $ CALL ZSWAP( N-I2, A(I2+1,I1), 1, A(I2+1,I2), 1 ) * ENDIF END SUBROUTINE ZSYSWAPR diff --git a/lapack-netlib/SRC/zsytf2.c b/lapack-netlib/SRC/zsytf2.c index ffc53186c8..7d84395dfa 100644 --- a/lapack-netlib/SRC/zsytf2.c +++ b/lapack-netlib/SRC/zsytf2.c @@ -706,7 +706,7 @@ f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zsytf2_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zsytf2_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -716,18 +716,18 @@ f"> */ /* Local variables */ integer imax, jmax; - extern /* Subroutine */ int zsyr_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zsyr_(char *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); integer i__, j, k; doublecomplex t; doublereal alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); integer kstep; logical upper; doublecomplex 
r1; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex d11, d12, d21, d22; integer kk, kp; @@ -771,7 +771,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYTF2", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. */ @@ -1286,7 +1286,7 @@ f"> */ } L70: - return 0; + return; /* End of ZSYTF2 */ diff --git a/lapack-netlib/SRC/zsytf2_rk.c b/lapack-netlib/SRC/zsytf2_rk.c index 7be53b3c51..61596a9962 100644 --- a/lapack-netlib/SRC/zsytf2_rk.c +++ b/lapack-netlib/SRC/zsytf2_rk.c @@ -756,7 +756,7 @@ rk.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zsytf2_rk_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zsytf2_rk_(char *uplo, integer *n, doublecomplex *a, integer *lda, doublecomplex *e, integer *ipiv, integer *info) { /* System generated locals */ @@ -767,7 +767,7 @@ rk.f"> */ /* Local variables */ logical done; integer imax, jmax; - extern /* Subroutine */ int zsyr_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zsyr_(char *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); integer i__, j, k, p; doublecomplex t; @@ -775,11 +775,11 @@ rk.f"> */ extern logical lsame_(char *, char *); doublereal dtemp, sfmin; integer itemp; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); integer kstep; logical upper; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex d11, d12, d21, d22; integer ii, kk; @@ -825,7 +825,7 @@ rk.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYTF2_RK", 
&i__1, (ftnlen)9); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. */ @@ -1683,7 +1683,7 @@ rk.f"> */ ; } - return 0; + return; /* End of ZSYTF2_RK */ diff --git a/lapack-netlib/SRC/zsytf2_rook.c b/lapack-netlib/SRC/zsytf2_rook.c index 0087987e83..0ffd6dd784 100644 --- a/lapack-netlib/SRC/zsytf2_rook.c +++ b/lapack-netlib/SRC/zsytf2_rook.c @@ -709,7 +709,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zsytf2_rook_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zsytf2_rook_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, integer *info) { /* System generated locals */ @@ -720,18 +720,18 @@ rook.f"> */ /* Local variables */ logical done; integer imax, jmax; - extern /* Subroutine */ int zsyr_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zsyr_(char *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); integer i__, j, k, p; doublecomplex t; doublereal alpha; extern logical lsame_(char *, char *); doublereal dtemp, sfmin; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); integer itemp, kstep; logical upper; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex d11, d12, d21, d22; integer ii, kk; @@ -776,7 +776,7 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYTF2_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Initialize ALPHA for use in choosing pivot block size. 
*/ @@ -1525,7 +1525,7 @@ rook.f"> */ L70: - return 0; + return; /* End of ZSYTF2_ROOK */ diff --git a/lapack-netlib/SRC/zsytrf.c b/lapack-netlib/SRC/zsytrf.c index 465eff88b8..9aac589ad5 100644 --- a/lapack-netlib/SRC/zsytrf.c +++ b/lapack-netlib/SRC/zsytrf.c @@ -697,7 +697,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zsytrf_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zsytrf_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *work, integer *lwork, integer *info) { @@ -710,13 +710,13 @@ f"> */ integer nbmin, iinfo; logical upper; integer kb, nb; - extern /* Subroutine */ int zsytf2_(char *, integer *, doublecomplex *, - integer *, integer *, integer *), xerbla_(char *, integer - *, ftnlen); + extern /* Subroutine */ void zsytf2_(char *, integer *, doublecomplex *, + integer *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer ldwork; - extern /* Subroutine */ int zlasyf_(char *, integer *, integer *, integer + extern /* Subroutine */ void zlasyf_(char *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); integer lwkopt; @@ -769,9 +769,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYTRF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } nbmin = 2; @@ -898,7 +898,7 @@ f"> */ L40: work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZSYTRF */ diff --git a/lapack-netlib/SRC/zsytrf_aa.c b/lapack-netlib/SRC/zsytrf_aa.c index 7a506a940f..29a3dcb7c4 100644 --- a/lapack-netlib/SRC/zsytrf_aa.c +++ b/lapack-netlib/SRC/zsytrf_aa.c @@ -648,7 +648,7 @@ aa.f"> */ /* > \ingroup complex16SYcomputational */ /* 
===================================================================== */ -/* Subroutine */ int zsytrf_aa_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zsytrf_aa_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *work, integer *lwork, integer *info) { @@ -659,7 +659,7 @@ aa.f"> */ integer j; doublecomplex alpha; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, @@ -671,7 +671,7 @@ aa.f"> */ doublecomplex *, integer *); logical upper; integer k1, k2, j1, j2, j3; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer jb, nb, mj, nj; @@ -732,19 +732,19 @@ aa.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYTRF_AA", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return */ if (*n == 0) { - return 0; + return; } ipiv[1] = 1; if (*n == 1) { - return 0; + return; } /* Adjust block size based on the workspace size */ @@ -1038,7 +1038,7 @@ aa.f"> */ } L20: - return 0; + return; /* End of ZSYTRF_AA */ diff --git a/lapack-netlib/SRC/zsytrf_aa_2stage.c b/lapack-netlib/SRC/zsytrf_aa_2stage.c index 328fdb5eb1..a3d6a39b6a 100644 --- a/lapack-netlib/SRC/zsytrf_aa_2stage.c +++ b/lapack-netlib/SRC/zsytrf_aa_2stage.c @@ -675,7 +675,7 @@ aa_2stage.f"> */ /* > \ingroup complex16SYcomputational */ /* ===================================================================== */ -/* Subroutine */ int zsytrf_aa_2stage_(char *uplo, integer *n, doublecomplex +/* Subroutine */ 
void zsytrf_aa_2stage_(char *uplo, integer *n, doublecomplex *a, integer *lda, doublecomplex *tb, integer *ltb, integer *ipiv, integer *ipiv2, doublecomplex *work, integer *lwork, integer *info) { @@ -687,14 +687,14 @@ aa_2stage.f"> */ integer ldtb, i__, j, k; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer i1; logical upper; integer i2; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, @@ -703,10 +703,11 @@ aa_2stage.f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zgbtrf_(integer *, integer *, integer *, - integer *, doublecomplex *, integer *, integer *, integer *), - zgetrf_(integer *, integer *, doublecomplex *, integer *, integer - *, integer *), zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zgbtrf_(integer *, integer *, integer *, + integer *, doublecomplex *, integer *, integer *, integer *); + extern int zgetrf_(integer *, integer *, doublecomplex *, integer *, integer + *, integer *); + extern void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); @@ -755,7 +756,7 @@ aa_2stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYTRF_AA_2STAGE", &i__1, 
(ftnlen)16); - return 0; + return; } /* Answer the query */ @@ -773,13 +774,13 @@ aa_2stage.f"> */ } } if (tquery || wquery) { - return 0; + return; } /* Quick return */ if (*n == 0) { - return 0; + return; } /* Determine the number of the block size */ @@ -1280,7 +1281,7 @@ aa_2stage.f"> */ /* Factor the band matrix */ zgbtrf_(n, n, &nb, &nb, &tb[1], &ldtb, &ipiv2[1], info); - return 0; + return; /* End of ZSYTRF_AA_2STAGE */ diff --git a/lapack-netlib/SRC/zsytrf_rk.c b/lapack-netlib/SRC/zsytrf_rk.c index 232cab0bb3..a87821837c 100644 --- a/lapack-netlib/SRC/zsytrf_rk.c +++ b/lapack-netlib/SRC/zsytrf_rk.c @@ -774,7 +774,7 @@ rk.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zsytrf_rk_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zsytrf_rk_(char *uplo, integer *n, doublecomplex *a, integer *lda, doublecomplex *e, integer *ipiv, doublecomplex *work, integer *lwork, integer *info) { @@ -785,10 +785,10 @@ rk.f"> */ integer i__, k; extern logical lsame_(char *, char *); integer nbmin, iinfo; - extern /* Subroutine */ int zsytf2_rk_(char *, integer *, doublecomplex * + extern /* Subroutine */ void zsytf2_rk_(char *, integer *, doublecomplex * , integer *, doublecomplex *, integer *, integer *); logical upper; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlasyf_rk_(char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -847,9 +847,9 @@ rk.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYTRF_RK", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { - return 0; + return; } nbmin = 2; @@ -1035,7 +1035,7 @@ rk.f"> */ } work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZSYTRF_RK */ diff --git 
a/lapack-netlib/SRC/zsytrf_rook.c b/lapack-netlib/SRC/zsytrf_rook.c index 28a5cb7e33..6c02245a2d 100644 --- a/lapack-netlib/SRC/zsytrf_rook.c +++ b/lapack-netlib/SRC/zsytrf_rook.c @@ -723,7 +723,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zsytrf_rook_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zsytrf_rook_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *work, integer *lwork, integer *info) { @@ -731,7 +731,7 @@ rook.f"> */ integer a_dim1, a_offset, i__1, i__2; /* Local variables */ - extern /* Subroutine */ int zlasyf_rook_(char *, integer *, integer *, + extern /* Subroutine */ void zlasyf_rook_(char *, integer *, integer *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); integer j, k; @@ -745,7 +745,7 @@ rook.f"> */ integer ldwork, lwkopt; logical lquery; integer iws; - extern /* Subroutine */ int zsytf2_rook_(char *, integer *, + extern /* Subroutine */ void zsytf2_rook_(char *, integer *, doublecomplex *, integer *, integer *, integer *); @@ -796,9 +796,9 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYTRF_ROOK", &i__1, (ftnlen)11); - return 0; + return; } else if (lquery) { - return 0; + return; } nbmin = 2; @@ -928,7 +928,7 @@ rook.f"> */ L40: work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZSYTRF_ROOK */ diff --git a/lapack-netlib/SRC/zsytri.c b/lapack-netlib/SRC/zsytri.c index 78df5e9a17..cf9a702518 100644 --- a/lapack-netlib/SRC/zsytri.c +++ b/lapack-netlib/SRC/zsytri.c @@ -629,7 +629,7 @@ f"> */ /* > \ingroup complex16SYcomputational */ /* ===================================================================== */ -/* Subroutine */ int zsytri_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zsytri_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *work, integer *info) { 
/* System generated locals */ @@ -643,11 +643,11 @@ f"> */ extern logical lsame_(char *, char *); integer kstep; logical upper; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern /* Double Complex */ VOID zdotu_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zsymv_(char *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); @@ -688,13 +688,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check that the diagonal matrix D is nonsingular. */ @@ -706,7 +706,7 @@ f"> */ for (*info = *n; *info >= 1; --(*info)) { i__1 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__1].r == 0. && a[i__1].i == 0.)) { - return 0; + return; } /* L10: */ } @@ -718,7 +718,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__2].r == 0. 
&& a[i__2].i == 0.)) { - return 0; + return; } /* L20: */ } @@ -1031,7 +1031,7 @@ f"> */ ; } - return 0; + return; /* End of ZSYTRI */ diff --git a/lapack-netlib/SRC/zsytri2.c b/lapack-netlib/SRC/zsytri2.c index d0cdd02b10..31f65bdaca 100644 --- a/lapack-netlib/SRC/zsytri2.c +++ b/lapack-netlib/SRC/zsytri2.c @@ -641,7 +641,7 @@ static integer c_n1 = -1; /* > \ingroup complex16SYcomputational */ /* ===================================================================== */ -/* Subroutine */ int zsytri2_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zsytri2_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *work, integer *lwork, integer *info) { @@ -649,7 +649,7 @@ static integer c_n1 = -1; integer a_dim1, a_offset, i__1; /* Local variables */ - extern /* Subroutine */ int zsytri2x_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zsytri2x_(char *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *, integer *); extern logical lsame_(char *, char *); integer nbmax; @@ -658,7 +658,7 @@ static integer c_n1 = -1; extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); logical lquery; - extern /* Subroutine */ int zsytri_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zsytri_(char *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *); integer minsize; @@ -710,13 +710,13 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("ZSYTRI2", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { work[1].r = (doublereal) minsize, work[1].i = 0.; - return 0; + return; } if (*n == 0) { - return 0; + return; } if (nbmax >= *n) { zsytri_(uplo, n, &a[a_offset], lda, &ipiv[1], &work[1], info); @@ -724,7 +724,7 @@ static integer c_n1 = -1; zsytri2x_(uplo, n, &a[a_offset], lda, &ipiv[1], &work[1], &nbmax, info); } - return 0; + return; /* End of ZSYTRI2 */ diff --git 
a/lapack-netlib/SRC/zsytri2x.c b/lapack-netlib/SRC/zsytri2x.c index f4a1d3b58c..cacd30379c 100644 --- a/lapack-netlib/SRC/zsytri2x.c +++ b/lapack-netlib/SRC/zsytri2x.c @@ -634,7 +634,7 @@ x.f"> */ /* > \ingroup complex16SYcomputational */ /* ===================================================================== */ -/* Subroutine */ int zsytri2x_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zsytri2x_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *work, integer *nb, integer *info) { @@ -647,29 +647,30 @@ x.f"> */ integer invd; doublecomplex akkp1, d__; integer i__, j, k; - extern /* Subroutine */ int zsyswapr_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zsyswapr_(char *, integer *, doublecomplex *, integer *, integer *, integer *); doublecomplex t; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer count; logical upper; - extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrmm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex ak, u01_i_j__; integer u11; doublecomplex u11_i_j__; integer ip; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), ztrtri_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern int ztrtri_( char *, char *, integer *, doublecomplex *, integer *, integer *); integer nnb, cut; doublecomplex akp1, u01_ip1_j__, u11_ip1_j__; - extern /* Subroutine */ int zsyconv_(char *, char *, integer *, + extern /* Subroutine */ void zsyconv_(char *, char *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, 
integer *); @@ -710,10 +711,10 @@ x.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYTRI2X", &i__1, (ftnlen)8); - return 0; + return; } if (*n == 0) { - return 0; + return; } /* Convert A */ @@ -731,7 +732,7 @@ x.f"> */ for (*info = *n; *info >= 1; --(*info)) { i__1 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__1].r == 0. && a[i__1].i == 0.)) { - return 0; + return; } } } else { @@ -742,7 +743,7 @@ x.f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__2].r == 0. && a[i__2].i == 0.)) { - return 0; + return; } } } @@ -1373,7 +1374,7 @@ x.f"> */ } } - return 0; + return; /* End of ZSYTRI2X */ diff --git a/lapack-netlib/SRC/zsytri_3.c b/lapack-netlib/SRC/zsytri_3.c index 655a2ae73b..9a1fe7b6d4 100644 --- a/lapack-netlib/SRC/zsytri_3.c +++ b/lapack-netlib/SRC/zsytri_3.c @@ -683,7 +683,7 @@ static integer c_n1 = -1; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zsytri_3_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zsytri_3_(char *uplo, integer *n, doublecomplex *a, integer *lda, doublecomplex *e, integer *ipiv, doublecomplex *work, integer *lwork, integer *info) { @@ -693,7 +693,7 @@ static integer c_n1 = -1; /* Local variables */ extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int zsytri_3x_(char *, integer *, doublecomplex * + extern /* Subroutine */ void zsytri_3x_(char *, integer *, doublecomplex * , integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); integer nb; @@ -749,16 +749,16 @@ static integer c_n1 = -1; if (*info != 0) { i__1 = -(*info); xerbla_("ZSYTRI_3", &i__1, (ftnlen)8); - return 0; + return; } else if (lquery) { work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } zsytri_3x_(uplo, n, &a[a_offset], lda, &e[1], &ipiv[1], &work[1], &nb, @@ -766,7 
+766,7 @@ static integer c_n1 = -1; work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZSYTRI_3 */ diff --git a/lapack-netlib/SRC/zsytri_3x.c b/lapack-netlib/SRC/zsytri_3x.c index f5647906fd..e5575d2937 100644 --- a/lapack-netlib/SRC/zsytri_3x.c +++ b/lapack-netlib/SRC/zsytri_3x.c @@ -673,7 +673,7 @@ static doublecomplex c_b2 = {0.,0.}; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zsytri_3x_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zsytri_3x_(char *uplo, integer *n, doublecomplex *a, integer *lda, doublecomplex *e, integer *ipiv, doublecomplex *work, integer *nb, integer *info) { @@ -686,16 +686,16 @@ static doublecomplex c_b2 = {0.,0.}; integer invd; doublecomplex akkp1, d__; integer i__, j, k; - extern /* Subroutine */ int zsyswapr_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zsyswapr_(char *, integer *, doublecomplex *, integer *, integer *, integer *); doublecomplex t; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrmm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); doublecomplex ak, u01_i_j__; @@ -747,10 +747,10 @@ static doublecomplex c_b2 = {0.,0.}; if (*info != 0) { i__1 = -(*info); xerbla_("ZSYTRI_3X", &i__1, (ftnlen)9); - return 0; + return; } if (*n == 0) { - return 0; + return; } /* Workspace got Non-diag elements of D */ @@ -771,7 +771,7 @@ static doublecomplex c_b2 = {0.,0.}; for (*info = *n; *info >= 1; --(*info)) { i__1 = *info + *info * 
a_dim1; if (ipiv[*info] > 0 && (a[i__1].r == 0. && a[i__1].i == 0.)) { - return 0; + return; } } } else { @@ -782,7 +782,7 @@ static doublecomplex c_b2 = {0.,0.}; for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__2].r == 0. && a[i__2].i == 0.)) { - return 0; + return; } } } @@ -1422,7 +1422,7 @@ static doublecomplex c_b2 = {0.,0.}; } - return 0; + return; /* End of ZSYTRI_3X */ diff --git a/lapack-netlib/SRC/zsytri_rook.c b/lapack-netlib/SRC/zsytri_rook.c index b4e38bfb89..2899d12f0e 100644 --- a/lapack-netlib/SRC/zsytri_rook.c +++ b/lapack-netlib/SRC/zsytri_rook.c @@ -644,7 +644,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zsytri_rook_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zsytri_rook_(char *uplo, integer *n, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *work, integer *info) { /* System generated locals */ @@ -658,11 +658,11 @@ rook.f"> */ extern logical lsame_(char *, char *); integer kstep; logical upper; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); extern /* Double Complex */ VOID zdotu_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zsymv_(char *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); @@ -703,13 +703,13 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYTRI_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check that the diagonal matrix D is 
nonsingular. */ @@ -721,7 +721,7 @@ rook.f"> */ for (*info = *n; *info >= 1; --(*info)) { i__1 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__1].r == 0. && a[i__1].i == 0.)) { - return 0; + return; } /* L10: */ } @@ -733,7 +733,7 @@ rook.f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info + *info * a_dim1; if (ipiv[*info] > 0 && (a[i__2].r == 0. && a[i__2].i == 0.)) { - return 0; + return; } /* L20: */ } @@ -1136,7 +1136,7 @@ rook.f"> */ ; } - return 0; + return; /* End of ZSYTRI_ROOK */ diff --git a/lapack-netlib/SRC/zsytrs.c b/lapack-netlib/SRC/zsytrs.c index 2c65b83f7f..a31b3e9f24 100644 --- a/lapack-netlib/SRC/zsytrs.c +++ b/lapack-netlib/SRC/zsytrs.c @@ -634,7 +634,7 @@ f"> */ /* > \ingroup complex16SYcomputational */ /* ===================================================================== */ -/* Subroutine */ int zsytrs_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zsytrs_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *b, integer *ldb, integer *info) { @@ -647,12 +647,12 @@ f"> */ integer j, k; extern logical lsame_(char *, char *); doublecomplex denom; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int zgeru_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeru_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); @@ -697,13 +697,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - 
return 0; + return; } if (upper) { @@ -1051,7 +1051,7 @@ f"> */ ; } - return 0; + return; /* End of ZSYTRS */ diff --git a/lapack-netlib/SRC/zsytrs2.c b/lapack-netlib/SRC/zsytrs2.c index 6a458f3d61..b377b7f5da 100644 --- a/lapack-netlib/SRC/zsytrs2.c +++ b/lapack-netlib/SRC/zsytrs2.c @@ -644,7 +644,7 @@ static doublecomplex c_b1 = {1.,0.}; /* > \ingroup complex16SYcomputational */ /* ===================================================================== */ -/* Subroutine */ int zsytrs2_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zsytrs2_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *b, integer *ldb, doublecomplex *work, integer *info) { @@ -658,10 +658,10 @@ static doublecomplex c_b1 = {1.,0.}; extern logical lsame_(char *, char *); doublecomplex denom; integer iinfo; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), ztrsm_(char *, char *, char *, char * , integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); @@ -669,7 +669,7 @@ static doublecomplex c_b1 = {1.,0.}; integer kp; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublecomplex akm1, bkm1; - extern /* Subroutine */ int zsyconv_(char *, char *, integer *, + extern /* Subroutine */ void zsyconv_(char *, char *, integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *); @@ -709,13 +709,13 @@ static doublecomplex c_b1 = {1.,0.}; if (*info != 0) { i__1 = -(*info); xerbla_("ZSYTRS2", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* Convert A */ @@ -936,7 +936,7 @@ static 
doublecomplex c_b1 = {1.,0.}; zsyconv_(uplo, "R", n, &a[a_offset], lda, &ipiv[1], &work[1], &iinfo); - return 0; + return; /* End of ZSYTRS2 */ diff --git a/lapack-netlib/SRC/zsytrs_3.c b/lapack-netlib/SRC/zsytrs_3.c index 388a28c8cb..7d1763c041 100644 --- a/lapack-netlib/SRC/zsytrs_3.c +++ b/lapack-netlib/SRC/zsytrs_3.c @@ -677,7 +677,7 @@ static doublecomplex c_b1 = {1.,0.}; /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zsytrs_3_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zsytrs_3_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *e, integer *ipiv, doublecomplex *b, integer *ldb, integer *info) { @@ -690,10 +690,10 @@ static doublecomplex c_b1 = {1.,0.}; integer i__, j, k; extern logical lsame_(char *, char *); doublecomplex denom; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), ztrsm_(char *, char *, char *, char * , integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); @@ -739,13 +739,13 @@ static doublecomplex c_b1 = {1.,0.}; if (*info != 0) { i__1 = -(*info); xerbla_("ZSYTRS_3", &i__1, (ftnlen)8); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -933,7 +933,7 @@ static doublecomplex c_b1 = {1.,0.}; } - return 0; + return; /* End of ZSYTRS_3 */ diff --git a/lapack-netlib/SRC/zsytrs_aa.c b/lapack-netlib/SRC/zsytrs_aa.c index 66aca9b2d3..7983cd087b 100644 --- a/lapack-netlib/SRC/zsytrs_aa.c +++ b/lapack-netlib/SRC/zsytrs_aa.c @@ -644,7 +644,7 @@ aa.f"> */ /* > \ingroup complex16SYcomputational */ /* 
===================================================================== */ -/* Subroutine */ int zsytrs_aa_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zsytrs_aa_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, integer *ipiv, doublecomplex *b, integer *ldb, doublecomplex *work, integer *lwork, integer *info) { @@ -655,14 +655,15 @@ aa.f"> */ integer k; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zgtsv_(integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex * , integer *, integer *), ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); integer kp; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlacpy_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlacpy_( char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer lwkopt; @@ -713,17 +714,17 @@ aa.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYTRS_AA", &i__1, (ftnlen)9); - return 0; + return; } else if (lquery) { lwkopt = *n * 3 - 2; work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -855,7 +856,7 @@ aa.f"> */ } - return 0; + return; /* End of ZSYTRS_AA */ diff --git a/lapack-netlib/SRC/zsytrs_aa_2stage.c b/lapack-netlib/SRC/zsytrs_aa_2stage.c index 025cdacc69..0bb91f9406 100644 --- a/lapack-netlib/SRC/zsytrs_aa_2stage.c +++ b/lapack-netlib/SRC/zsytrs_aa_2stage.c @@ -653,7 +653,7 @@ aa_2stage.f"> */ /* > \ingroup complex16SYcomputational */ /* ===================================================================== */ -/* Subroutine */ int zsytrs_aa_2stage_(char 
*uplo, integer *n, integer *nrhs, +/* Subroutine */ void zsytrs_aa_2stage_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *tb, integer *ltb, integer *ipiv, integer *ipiv2, doublecomplex *b, integer *ldb, integer *info) @@ -665,13 +665,15 @@ aa_2stage.f"> */ integer ldtb; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); integer nb; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zgbtrs_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zgbtrs_( char *, integer *, integer *, integer *, integer *, doublecomplex - *, integer *, integer *, doublecomplex *, integer *, integer *), zlaswp_(integer *, doublecomplex *, integer *, integer *, + *, integer *, integer *, doublecomplex *, integer *, integer *); + extern int zlaswp_(integer *, doublecomplex *, integer *, integer *, integer *, integer *, integer *); @@ -715,13 +717,13 @@ aa_2stage.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYTRS_AA_2STAGE", &i__1, (ftnlen)16); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } /* Read NB and compute LDTB */ @@ -806,7 +808,7 @@ aa_2stage.f"> */ } } - return 0; + return; /* End of ZSYTRS_AA_2STAGE */ diff --git a/lapack-netlib/SRC/zsytrs_rook.c b/lapack-netlib/SRC/zsytrs_rook.c index 2e2f44b6d1..2a01c9461a 100644 --- a/lapack-netlib/SRC/zsytrs_rook.c +++ b/lapack-netlib/SRC/zsytrs_rook.c @@ -649,7 +649,7 @@ rook.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zsytrs_rook_(char *uplo, integer *n, integer *nrhs, +/* Subroutine */ void zsytrs_rook_(char *uplo, integer *n, integer *nrhs, doublecomplex *a, integer *lda, 
integer *ipiv, doublecomplex *b, integer *ldb, integer *info) { @@ -662,12 +662,12 @@ rook.f"> */ integer j, k; extern logical lsame_(char *, char *); doublecomplex denom; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int zgeru_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgeru_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zswap_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); @@ -712,13 +712,13 @@ rook.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZSYTRS_ROOK", &i__1, (ftnlen)11); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *nrhs == 0) { - return 0; + return; } if (upper) { @@ -1093,7 +1093,7 @@ rook.f"> */ ; } - return 0; + return; /* End of ZSYTRS_ROOK */ diff --git a/lapack-netlib/SRC/ztbcon.c b/lapack-netlib/SRC/ztbcon.c index 556fa23a5d..5fcd4e0b2a 100644 --- a/lapack-netlib/SRC/ztbcon.c +++ b/lapack-netlib/SRC/ztbcon.c @@ -655,7 +655,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ztbcon_(char *norm, char *uplo, char *diag, integer *n, +/* Subroutine */ void ztbcon_(char *norm, char *uplo, char *diag, integer *n, integer *kd, doublecomplex *ab, integer *ldab, doublereal *rcond, doublecomplex *work, doublereal *rwork, integer *info) { @@ -671,7 +671,7 @@ f"> */ doublereal anorm; logical upper; doublereal xnorm; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, doublecomplex *, doublereal *, integer *, integer *); 
extern doublereal dlamch_(char *); integer ix; @@ -681,7 +681,7 @@ f"> */ extern doublereal zlantb_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublereal *); logical onenrm; - extern /* Subroutine */ int zlatbs_(char *, char *, char *, char *, + extern /* Subroutine */ void zlatbs_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublereal *, doublereal *, integer *), zdrscl_(integer *, doublereal *, doublecomplex *, integer *); @@ -730,14 +730,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTBCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *rcond = 1.; - return 0; + return; } *rcond = 0.; @@ -802,7 +802,7 @@ f"> */ } L20: - return 0; + return; /* End of ZTBCON */ diff --git a/lapack-netlib/SRC/ztbrfs.c b/lapack-netlib/SRC/ztbrfs.c index 80b96ad0cc..a34d2fae84 100644 --- a/lapack-netlib/SRC/ztbrfs.c +++ b/lapack-netlib/SRC/ztbrfs.c @@ -700,7 +700,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ztbrfs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void ztbrfs_(char *uplo, char *trans, char *diag, integer *n, integer *kd, integer *nrhs, doublecomplex *ab, integer *ldab, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *ferr, doublereal *berr, doublecomplex *work, doublereal * @@ -720,7 +720,7 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int ztbmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ztbmv_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), ztbsv_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, @@ -794,7 +794,7 @@ f"> */ if (*info != 0) { 
i__1 = -(*info); xerbla_("ZTBRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -806,7 +806,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } if (notran) { @@ -1151,7 +1151,7 @@ f"> */ /* L250: */ } - return 0; + return; /* End of ZTBRFS */ diff --git a/lapack-netlib/SRC/ztbtrs.c b/lapack-netlib/SRC/ztbtrs.c index 17de520b12..25c365744d 100644 --- a/lapack-netlib/SRC/ztbtrs.c +++ b/lapack-netlib/SRC/ztbtrs.c @@ -658,7 +658,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ztbtrs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void ztbtrs_(char *uplo, char *trans, char *diag, integer *n, integer *kd, integer *nrhs, doublecomplex *ab, integer *ldab, doublecomplex *b, integer *ldb, integer *info) { @@ -669,8 +669,9 @@ f"> */ integer j; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int ztbsv_(char *, char *, char *, integer *, - integer *, doublecomplex *, integer *, doublecomplex *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void ztbsv_(char *, char *, char *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical nounit; @@ -718,13 +719,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTBTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check for singularity. */ @@ -735,7 +736,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *kd + 1 + *info * ab_dim1; if (ab[i__2].r == 0. && ab[i__2].i == 0.) { - return 0; + return; } /* L10: */ } @@ -744,7 +745,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info * ab_dim1 + 1; if (ab[i__2].r == 0. && ab[i__2].i == 0.) 
{ - return 0; + return; } /* L20: */ } @@ -761,7 +762,7 @@ f"> */ /* L30: */ } - return 0; + return; /* End of ZTBTRS */ diff --git a/lapack-netlib/SRC/ztfsm.c b/lapack-netlib/SRC/ztfsm.c index 542c4df87a..f96df7d01e 100644 --- a/lapack-netlib/SRC/ztfsm.c +++ b/lapack-netlib/SRC/ztfsm.c @@ -810,7 +810,7 @@ static doublecomplex c_b1 = {1.,0.}; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztfsm_(char *transr, char *side, char *uplo, char *trans, +/* Subroutine */ void ztfsm_(char *transr, char *side, char *uplo, char *trans, char *diag, integer *m, integer *n, doublecomplex *alpha, doublecomplex *a, doublecomplex *b, integer *ldb) { @@ -822,16 +822,16 @@ static doublecomplex c_b1 = {1.,0.}; integer info, i__, j, k; logical normaltransr, lside; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical lower; integer m1, m2, n1, n2; - extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *), - xerbla_(char *, integer *, ftnlen); + doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical misodd, nisodd, notrans; @@ -877,13 +877,13 @@ static doublecomplex c_b1 = {1.,0.}; if (info != 0) { i__1 = -info; xerbla_("ZTFSM ", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return when ( (N.EQ.0).OR.(M.EQ.0) ) */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Quick return when ALPHA.EQ.(0D+0,0D+0) */ @@ -899,7 +899,7 @@ static doublecomplex c_b1 = {1.,0.}; } /* L20: */ } - return 0; + return; } if (lside) { @@ -1580,7 
+1580,7 @@ static doublecomplex c_b1 = {1.,0.}; } } - return 0; + return; /* End of ZTFSM */ diff --git a/lapack-netlib/SRC/ztftri.c b/lapack-netlib/SRC/ztftri.c index ccea628e42..8a69fb3136 100644 --- a/lapack-netlib/SRC/ztftri.c +++ b/lapack-netlib/SRC/ztftri.c @@ -734,7 +734,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztftri_(char *transr, char *uplo, char *diag, integer *n, +/* Subroutine */ void ztftri_(char *transr, char *uplo, char *diag, integer *n, doublecomplex *a, integer *info) { /* System generated locals */ @@ -747,10 +747,10 @@ f"> */ extern logical lsame_(char *, char *); logical lower; integer n1, n2; - extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrmm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *), - xerbla_(char *, integer *, ftnlen); + doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical nisodd; extern /* Subroutine */ int ztrtri_(char *, char *, integer *, doublecomplex *, integer *, integer *); @@ -783,13 +783,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTFTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* If N is odd, set NISODD = .TRUE. 
*/ @@ -831,7 +831,7 @@ f"> */ ztrtri_("L", diag, &n1, a, n, info); if (*info > 0) { - return 0; + return; } z__1.r = -1., z__1.i = 0.; ztrmm_("R", "L", "N", diag, &n2, &n1, &z__1, a, n, &a[n1], n); @@ -841,7 +841,7 @@ f"> */ *info += n1; } if (*info > 0) { - return 0; + return; } ztrmm_("L", "U", "C", diag, &n2, &n1, &c_b1, &a[*n], n, &a[n1] , n); @@ -855,7 +855,7 @@ f"> */ ztrtri_("L", diag, &n1, &a[n2], n, info) ; if (*info > 0) { - return 0; + return; } z__1.r = -1., z__1.i = 0.; ztrmm_("L", "L", "C", diag, &n1, &n2, &z__1, &a[n2], n, a, n); @@ -865,7 +865,7 @@ f"> */ *info += n1; } if (*info > 0) { - return 0; + return; } ztrmm_("R", "U", "N", diag, &n1, &n2, &c_b1, &a[n1], n, a, n); @@ -882,7 +882,7 @@ f"> */ ztrtri_("U", diag, &n1, a, &n1, info); if (*info > 0) { - return 0; + return; } z__1.r = -1., z__1.i = 0.; ztrmm_("L", "U", "N", diag, &n1, &n2, &z__1, a, &n1, &a[n1 * @@ -892,7 +892,7 @@ f"> */ *info += n1; } if (*info > 0) { - return 0; + return; } ztrmm_("R", "L", "C", diag, &n1, &n2, &c_b1, &a[1], &n1, &a[ n1 * n1], &n1); @@ -904,7 +904,7 @@ f"> */ ztrtri_("U", diag, &n1, &a[n2 * n2], &n2, info); if (*info > 0) { - return 0; + return; } z__1.r = -1., z__1.i = 0.; ztrmm_("R", "U", "C", diag, &n2, &n1, &z__1, &a[n2 * n2], &n2, @@ -914,7 +914,7 @@ f"> */ *info += n1; } if (*info > 0) { - return 0; + return; } ztrmm_("L", "L", "N", diag, &n2, &n1, &c_b1, &a[n1 * n2], &n2, a, &n2); @@ -939,7 +939,7 @@ f"> */ i__1 = *n + 1; ztrtri_("L", diag, &k, &a[1], &i__1, info); if (*info > 0) { - return 0; + return; } z__1.r = -1., z__1.i = 0.; i__1 = *n + 1; @@ -952,7 +952,7 @@ f"> */ *info += k; } if (*info > 0) { - return 0; + return; } i__1 = *n + 1; i__2 = *n + 1; @@ -968,7 +968,7 @@ f"> */ i__1 = *n + 1; ztrtri_("L", diag, &k, &a[k + 1], &i__1, info); if (*info > 0) { - return 0; + return; } z__1.r = -1., z__1.i = 0.; i__1 = *n + 1; @@ -981,7 +981,7 @@ f"> */ *info += k; } if (*info > 0) { - return 0; + return; } i__1 = *n + 1; i__2 = *n + 1; @@ -1000,7 +1000,7 
@@ f"> */ ztrtri_("U", diag, &k, &a[k], &k, info); if (*info > 0) { - return 0; + return; } z__1.r = -1., z__1.i = 0.; ztrmm_("L", "U", "N", diag, &k, &k, &z__1, &a[k], &k, &a[k * ( @@ -1010,7 +1010,7 @@ f"> */ *info += k; } if (*info > 0) { - return 0; + return; } ztrmm_("R", "L", "C", diag, &k, &k, &c_b1, a, &k, &a[k * (k + 1)], &k); @@ -1022,7 +1022,7 @@ f"> */ ztrtri_("U", diag, &k, &a[k * (k + 1)], &k, info); if (*info > 0) { - return 0; + return; } z__1.r = -1., z__1.i = 0.; ztrmm_("R", "U", "C", diag, &k, &k, &z__1, &a[k * (k + 1)], & @@ -1032,7 +1032,7 @@ f"> */ *info += k; } if (*info > 0) { - return 0; + return; } ztrmm_("L", "L", "N", diag, &k, &k, &c_b1, &a[k * k], &k, a, & k); @@ -1040,7 +1040,7 @@ f"> */ } } - return 0; + return; /* End of ZTFTRI */ diff --git a/lapack-netlib/SRC/ztfttp.c b/lapack-netlib/SRC/ztfttp.c index 98d2566af0..4b5372f9e5 100644 --- a/lapack-netlib/SRC/ztfttp.c +++ b/lapack-netlib/SRC/ztfttp.c @@ -718,7 +718,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztfttp_(char *transr, char *uplo, integer *n, +/* Subroutine */ void ztfttp_(char *transr, char *uplo, integer *n, doublecomplex *arf, doublecomplex *ap, integer *info) { /* System generated locals */ @@ -760,13 +760,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTFTTP", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -776,7 +776,7 @@ f"> */ d_cnjg(&z__1, arf); ap[0].r = z__1.r, ap[0].i = z__1.i; } - return 0; + return; } /* Size of array ARF(0:NT-1) */ @@ -1112,7 +1112,7 @@ f"> */ } - return 0; + return; /* End of ZTFTTP */ diff --git a/lapack-netlib/SRC/ztfttr.c b/lapack-netlib/SRC/ztfttr.c index bc2b168b1a..fd4be1edc3 100644 --- a/lapack-netlib/SRC/ztfttr.c +++ b/lapack-netlib/SRC/ztfttr.c @@ -726,7 +726,7 @@ f"> */ /* > \endverbatim */ /* > */ /* 
===================================================================== */ -/* Subroutine */ int ztfttr_(char *transr, char *uplo, integer *n, +/* Subroutine */ void ztfttr_(char *transr, char *uplo, integer *n, doublecomplex *arf, doublecomplex *a, integer *lda, integer *info) { /* System generated locals */ @@ -776,7 +776,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTFTTR", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -790,7 +790,7 @@ f"> */ a[0].r = z__1.r, a[0].i = z__1.i; } } - return 0; + return; } /* Size of array ARF(1:2,0:nt-1) */ @@ -1122,7 +1122,7 @@ f"> */ } - return 0; + return; /* End of ZTFTTR */ diff --git a/lapack-netlib/SRC/ztgevc.c b/lapack-netlib/SRC/ztgevc.c index 9f2c3af387..b9af414f8b 100644 --- a/lapack-netlib/SRC/ztgevc.c +++ b/lapack-netlib/SRC/ztgevc.c @@ -733,7 +733,7 @@ f"> */ /* > \ingroup complex16GEcomputational */ /* ===================================================================== */ -/* Subroutine */ int ztgevc_(char *side, char *howmny, logical *select, +/* Subroutine */ void ztgevc_(char *side, char *howmny, logical *select, integer *n, doublecomplex *s, integer *lds, doublecomplex *p, integer *ldp, doublecomplex *vl, integer *ldvl, doublecomplex *vr, integer * ldvr, integer *mm, integer *m, doublecomplex *work, doublereal *rwork, @@ -763,11 +763,11 @@ f"> */ logical compl; doublereal anorm, bnorm; logical compr; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); doublecomplex ca, cb; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); logical ilbbad; doublereal acoefa; integer je; @@ -869,7 +869,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTGEVC", &i__1, (ftnlen)6); - return 0; + return; } 
/* Count the number of eigenvectors */ @@ -910,14 +910,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTGEVC", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *m = im; if (*n == 0) { - return 0; + return; } /* Machine Constants */ @@ -1546,7 +1546,7 @@ f"> */ } } - return 0; + return; /* End of ZTGEVC */ diff --git a/lapack-netlib/SRC/ztgex2.c b/lapack-netlib/SRC/ztgex2.c index 8e8f1ea747..c11096b232 100644 --- a/lapack-netlib/SRC/ztgex2.c +++ b/lapack-netlib/SRC/ztgex2.c @@ -704,7 +704,7 @@ f"> */ /* > Numerical Algorithms, 1996. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztgex2_(logical *wantq, logical *wantz, integer *n, +/* Subroutine */ void ztgex2_(logical *wantq, logical *wantz, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *q, integer *ldq, doublecomplex *z__, integer *ldz, integer *j1, integer *info) @@ -718,7 +718,7 @@ f"> */ /* Local variables */ logical weak; doublecomplex cdum, work[8]; - extern /* Subroutine */ int zrot_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zrot_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *); doublecomplex f, g; integer i__, m; @@ -731,12 +731,12 @@ f"> */ doublecomplex sz; logical dtrong; doublereal thresh; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlartg_(doublecomplex *, doublecomplex *, doublereal *, doublecomplex *, doublecomplex *); doublereal smlnum; - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); doublereal eps, sum; @@ -770,7 +770,7 @@ f"> */ /* Quick return if possible */ if (*n <= 1) { - return 0; + return; } m = 
2; @@ -941,13 +941,13 @@ f"> */ /* Exit with INFO = 0 if swap was successfully performed. */ - return 0; + return; /* Exit with INFO = 1 if swap was rejected. */ L20: *info = 1; - return 0; + return; /* End of ZTGEX2 */ diff --git a/lapack-netlib/SRC/ztgexc.c b/lapack-netlib/SRC/ztgexc.c index dadf100577..9b0a524a80 100644 --- a/lapack-netlib/SRC/ztgexc.c +++ b/lapack-netlib/SRC/ztgexc.c @@ -708,7 +708,7 @@ f"> */ /* > 1996. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztgexc_(logical *wantq, logical *wantz, integer *n, +/* Subroutine */ void ztgexc_(logical *wantq, logical *wantz, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *q, integer *ldq, doublecomplex *z__, integer *ldz, integer *ifst, integer *ilst, integer *info) @@ -719,10 +719,11 @@ f"> */ /* Local variables */ integer here; - extern /* Subroutine */ int ztgex2_(logical *, logical *, integer *, + extern /* Subroutine */ void ztgex2_(logical *, logical *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.1) -- */ @@ -769,16 +770,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTGEXC", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 1) { - return 0; + return; } if (*ifst == *ilst) { - return 0; + return; } if (*ifst < *ilst) { @@ -793,7 +794,7 @@ f"> */ q_offset], ldq, &z__[z_offset], ldz, &here, info); if (*info != 0) { *ilst = here; - return 0; + return; } ++here; if (here < *ilst) { @@ -811,7 +812,7 @@ f"> */ q_offset], ldq, &z__[z_offset], ldz, &here, info); if (*info != 0) { *ilst = here; - return 0; + return; } --here; if (here >= *ilst) { @@ -820,7 +821,7 @@ f"> */ ++here; } *ilst = here; - return 0; + 
return; /* End of ZTGEXC */ diff --git a/lapack-netlib/SRC/ztgsen.c b/lapack-netlib/SRC/ztgsen.c index 9123421291..8484a391b4 100644 --- a/lapack-netlib/SRC/ztgsen.c +++ b/lapack-netlib/SRC/ztgsen.c @@ -944,7 +944,7 @@ f"> */ /* > 1996. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztgsen_(integer *ijob, logical *wantq, logical *wantz, +/* Subroutine */ void ztgsen_(integer *ijob, logical *wantq, logical *wantz, logical *select, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *alpha, doublecomplex * beta, doublecomplex *q, integer *ldq, doublecomplex *z__, integer * @@ -963,13 +963,13 @@ f"> */ logical swap; doublecomplex temp1, temp2; integer i__, k, isave[3]; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); logical wantd; integer lwmin; logical wantp; integer n1, n2; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, doublecomplex *, doublereal *, integer *, integer *); logical wantd1, wantd2; extern doublereal dlamch_(char *); @@ -978,16 +978,16 @@ f"> */ doublereal rdscal, safmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer liwmin; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), ztgexc_(logical *, logical *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, integer *); integer mn2; - extern /* Subroutine */ int zlassq_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); logical lquery; - extern /* Subroutine */ 
int ztgsyl_(char *, integer *, integer *, integer + extern /* Subroutine */ void ztgsyl_(char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, @@ -1048,7 +1048,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTGSEN", &i__1, (ftnlen)6); - return 0; + return; } ierr = 0; @@ -1116,9 +1116,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTGSEN", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible. */ @@ -1373,7 +1373,7 @@ f"> */ work[1].r = (doublereal) lwmin, work[1].i = 0.; iwork[1] = liwmin; - return 0; + return; /* End of ZTGSEN */ diff --git a/lapack-netlib/SRC/ztgsja.c b/lapack-netlib/SRC/ztgsja.c index a3f02bfa80..e2253749f3 100644 --- a/lapack-netlib/SRC/ztgsja.c +++ b/lapack-netlib/SRC/ztgsja.c @@ -895,7 +895,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztgsja_(char *jobu, char *jobv, char *jobq, integer *m, +/* Subroutine */ void ztgsja_(char *jobu, char *jobv, char *jobq, integer *m, integer *p, integer *n, integer *k, integer *l, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublereal *tola, doublereal *tolb, doublereal *alpha, doublereal *beta, doublecomplex * @@ -910,7 +910,7 @@ f"> */ doublecomplex z__1; /* Local variables */ - extern /* Subroutine */ int zrot_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zrot_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *); integer kcallmycycle, i__, j; doublereal gamma; @@ -922,18 +922,20 @@ f"> */ logical wantu, wantv; doublereal ssmin; doublecomplex a2, b2; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, 
integer *, doublecomplex *, integer *), zlags2_(logical *, doublereal *, doublecomplex *, doublereal *, doublereal *, doublecomplex *, doublereal *, doublereal *, doublecomplex *, doublereal *, doublecomplex *, doublereal *, doublecomplex *), dlartg_( doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *), xerbla_(char *, integer *, ftnlen), zdscal_( + doublereal *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *), zlapll_( integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *); // extern integer myhuge_(doublereal *); - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); doublereal csq, csu, csv; doublecomplex snq; @@ -1012,7 +1014,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTGSJA", &i__1, (ftnlen)6); - return 0; + return; } /* Initialize U, V and Q, if necessary */ @@ -1289,7 +1291,7 @@ f"> */ L100: *ncallmycycle = kcallmycycle; - return 0; + return; /* End of ZTGSJA */ diff --git a/lapack-netlib/SRC/ztgsna.c b/lapack-netlib/SRC/ztgsna.c index 56755d2098..797943f449 100644 --- a/lapack-netlib/SRC/ztgsna.c +++ b/lapack-netlib/SRC/ztgsna.c @@ -826,7 +826,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztgsna_(char *job, char *howmny, logical *select, +/* Subroutine */ void ztgsna_(char *job, char *howmny, logical *select, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *vl, integer *ldvl, doublecomplex *vr, integer * ldvr, doublereal *s, doublereal *dif, integer *mm, integer *m, @@ -851,14 +851,14 @@ f"> */ extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer lwmin; - extern /* 
Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical wants; doublecomplex dummy[1]; integer n1, n2; extern doublereal dlapy2_(doublereal *, doublereal *); - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); doublecomplex dummy1[1]; extern doublereal dznrm2_(integer *, doublecomplex *, integer *), dlamch_( char *); @@ -866,14 +866,14 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal bignum; logical wantbh, wantdf, somcon; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), ztgexc_(logical *, logical *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, integer *); doublereal smlnum; logical lquery; - extern /* Subroutine */ int ztgsyl_(char *, integer *, integer *, integer + extern /* Subroutine */ void ztgsyl_(char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, @@ -974,15 +974,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTGSNA", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Get machine constants */ @@ -1082,7 +1082,7 @@ f"> */ ; } work[1].r = (doublereal) lwmin, work[1].i = 0.; - return 0; + return; /* End of ZTGSNA */ diff --git a/lapack-netlib/SRC/ztgsy2.c b/lapack-netlib/SRC/ztgsy2.c index 459bbdbac8..0dab499914 100644 --- 
a/lapack-netlib/SRC/ztgsy2.c +++ b/lapack-netlib/SRC/ztgsy2.c @@ -771,7 +771,7 @@ f"> */ /* > Umea University, S-901 87 Umea, Sweden. */ /* ===================================================================== */ -/* Subroutine */ int ztgsy2_(char *trans, integer *ijob, integer *m, integer * +/* Subroutine */ void ztgsy2_(char *trans, integer *ijob, integer *m, integer * n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *c__, integer *ldc, doublecomplex *d__, integer *ldd, doublecomplex *e, integer *lde, doublecomplex *f, integer *ldf, @@ -788,14 +788,15 @@ f"> */ integer ierr, ipiv[2], jpiv[2], i__, j, k; doublecomplex alpha, z__[4] /* was [2][2] */; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), zgesc2_( integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, doublereal *), zgetc2_(integer *, doublecomplex *, integer *, integer *, integer *, integer *); doublereal scaloc; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlatdf_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlatdf_( integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublereal *, doublereal *, integer *, integer *); logical notran; @@ -866,7 +867,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTGSY2", &i__1, (ftnlen)6); - return 0; + return; } if (notran) { @@ -1057,7 +1058,7 @@ f"> */ /* L80: */ } } - return 0; + return; /* End of ZTGSY2 */ diff --git a/lapack-netlib/SRC/ztgsyl.c b/lapack-netlib/SRC/ztgsyl.c index d7d6d2484e..3b11a13acb 100644 --- a/lapack-netlib/SRC/ztgsyl.c +++ b/lapack-netlib/SRC/ztgsyl.c @@ -812,7 +812,7 @@ f"> */ /* > July 1989, pp 745-751. 
*/ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztgsyl_(char *trans, integer *ijob, integer *m, integer * +/* Subroutine */ void ztgsyl_(char *trans, integer *ijob, integer *m, integer * n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *c__, integer *ldc, doublecomplex *d__, integer *ldd, doublecomplex *e, integer *lde, doublecomplex *f, integer *ldf, @@ -830,7 +830,7 @@ f"> */ integer i__, j, k, p, q; extern logical lsame_(char *, char *); integer ifunc, linfo; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, @@ -840,7 +840,7 @@ f"> */ integer ie, je, mb, nb; doublereal dscale; integer is, js, pq; - extern /* Subroutine */ int ztgsy2_(char *, integer *, integer *, integer + extern /* Subroutine */ void ztgsy2_(char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, @@ -852,7 +852,7 @@ f"> */ integer iround; logical notran; integer isolve; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); @@ -948,9 +948,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTGSYL", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -962,7 +962,7 @@ f"> */ *dif = 0.; } } - return 0; + return; } /* Determine optimal block sizes MB and NB */ @@ -1024,7 +1024,7 @@ 
f"> */ /* L30: */ } - return 0; + return; } @@ -1294,7 +1294,7 @@ f"> */ work[1].r = (doublereal) lwmin, work[1].i = 0.; - return 0; + return; /* End of ZTGSYL */ diff --git a/lapack-netlib/SRC/ztpcon.c b/lapack-netlib/SRC/ztpcon.c index bc5753f4c6..2224c2cef7 100644 --- a/lapack-netlib/SRC/ztpcon.c +++ b/lapack-netlib/SRC/ztpcon.c @@ -642,7 +642,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ztpcon_(char *norm, char *uplo, char *diag, integer *n, +/* Subroutine */ void ztpcon_(char *norm, char *uplo, char *diag, integer *n, doublecomplex *ap, doublereal *rcond, doublecomplex *work, doublereal *rwork, integer *info) { @@ -658,7 +658,7 @@ f"> */ doublereal anorm; logical upper; doublereal xnorm; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, doublecomplex *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *); integer ix; @@ -666,14 +666,14 @@ f"> */ doublereal ainvnm; extern integer izamax_(integer *, doublecomplex *, integer *); logical onenrm; - extern /* Subroutine */ int zdrscl_(integer *, doublereal *, + extern /* Subroutine */ void zdrscl_(integer *, doublereal *, doublecomplex *, integer *); char normin[1]; extern doublereal zlantp_(char *, char *, char *, integer *, doublecomplex *, doublereal *); doublereal smlnum; logical nounit; - extern /* Subroutine */ int zlatps_(char *, char *, char *, char *, + extern /* Subroutine */ void zlatps_(char *, char *, char *, char *, integer *, doublecomplex *, doublecomplex *, doublereal *, doublereal *, integer *); @@ -712,14 +712,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTPCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *rcond = 1.; - return 0; + return; } *rcond = 0.; @@ -784,7 +784,7 @@ f"> */ } L20: - return 0; + return; /* End of ZTPCON */ 
diff --git a/lapack-netlib/SRC/ztplqt.c b/lapack-netlib/SRC/ztplqt.c index dbcada52a4..c21618cfee 100644 --- a/lapack-netlib/SRC/ztplqt.c +++ b/lapack-netlib/SRC/ztplqt.c @@ -697,7 +697,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztplqt_(integer *m, integer *n, integer *l, integer *mb, +/* Subroutine */ void ztplqt_(integer *m, integer *n, integer *l, integer *mb, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *t, integer *ldt, doublecomplex *work, integer *info) { @@ -707,7 +707,8 @@ f"> */ /* Local variables */ integer i__, iinfo, ib, lb, nb; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), ztprfb_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void ztprfb_( char *, char *, char *, char *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, @@ -760,13 +761,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTPLQT", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } i__1 = *m; @@ -800,7 +801,7 @@ f"> */ a_dim1], lda, &b[i__ + ib + b_dim1], ldb, &work[1], &i__4); } } - return 0; + return; /* End of ZTPLQT */ diff --git a/lapack-netlib/SRC/ztplqt2.c b/lapack-netlib/SRC/ztplqt2.c index 9190db5f78..ac9a4a1009 100644 --- a/lapack-netlib/SRC/ztplqt2.c +++ b/lapack-netlib/SRC/ztplqt2.c @@ -693,7 +693,7 @@ is composed of a triangular block and a pentagonal block, using the compact WY r /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztplqt2_(integer *m, integer *n, integer *l, +/* Subroutine */ void ztplqt2_(integer *m, integer *n, integer *l, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *t, integer *ldt, integer 
*info) { @@ -705,7 +705,7 @@ is composed of a triangular block and a pentagonal block, using the compact WY r /* Local variables */ integer i__, j, p; doublecomplex alpha; - extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgerc_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, @@ -713,7 +713,8 @@ is composed of a triangular block and a pentagonal block, using the compact WY r ztrmv_(char *, char *, char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer mp, np; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlarfg_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlarfg_( integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); @@ -758,13 +759,13 @@ is composed of a triangular block and a pentagonal block, using the compact WY r if (*info != 0) { i__1 = -(*info); xerbla_("ZTPLQT2", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *m == 0) { - return 0; + return; } i__1 = *m; @@ -928,6 +929,6 @@ is composed of a triangular block and a pentagonal block, using the compact WY r /* End of ZTPLQT2 */ - return 0; + return; } /* ztplqt2_ */ diff --git a/lapack-netlib/SRC/ztpmlqt.c b/lapack-netlib/SRC/ztpmlqt.c index 5f8dd74a4c..10adc30cca 100644 --- a/lapack-netlib/SRC/ztpmlqt.c +++ b/lapack-netlib/SRC/ztpmlqt.c @@ -724,7 +724,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztpmlqt_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void ztpmlqt_(char *side, char *trans, integer *m, integer *n, integer *k, integer *l, integer *mb, 
doublecomplex *v, integer *ldv, doublecomplex *t, integer *ldt, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *work, integer *info) @@ -742,7 +742,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ integer ib, lb, nb, kf; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical notran; - extern /* Subroutine */ int ztprfb_(char *, char *, char *, char *, + extern /* Subroutine */ void ztprfb_(char *, char *, char *, char *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); @@ -812,12 +812,12 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZTPMLQT", &i__1, (ftnlen)7); - return 0; + return; } if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && notran) { @@ -906,7 +906,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of ZTPMLQT */ diff --git a/lapack-netlib/SRC/ztpmqrt.c b/lapack-netlib/SRC/ztpmqrt.c index 777fccae44..7118487bd3 100644 --- a/lapack-netlib/SRC/ztpmqrt.c +++ b/lapack-netlib/SRC/ztpmqrt.c @@ -724,7 +724,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztpmqrt_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void ztpmqrt_(char *side, char *trans, integer *m, integer *n, integer *k, integer *l, integer *nb, doublecomplex *v, integer *ldv, doublecomplex *t, integer *ldt, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *work, integer *info) @@ -742,7 +742,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ integer ib, lb, mb, kf; extern 
/* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical notran; - extern /* Subroutine */ int ztprfb_(char *, char *, char *, char *, + extern /* Subroutine */ void ztprfb_(char *, char *, char *, char *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); @@ -814,12 +814,12 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZTPMQRT", &i__1, (ftnlen)7); - return 0; + return; } if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && tran) { @@ -908,7 +908,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of ZTPMQRT */ diff --git a/lapack-netlib/SRC/ztpqrt.c b/lapack-netlib/SRC/ztpqrt.c index f53e081af0..ce4688f95f 100644 --- a/lapack-netlib/SRC/ztpqrt.c +++ b/lapack-netlib/SRC/ztpqrt.c @@ -697,7 +697,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztpqrt_(integer *m, integer *n, integer *l, integer *nb, +/* Subroutine */ void ztpqrt_(integer *m, integer *n, integer *l, integer *nb, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *t, integer *ldt, doublecomplex *work, integer *info) { @@ -707,7 +707,8 @@ f"> */ /* Local variables */ integer i__, iinfo, ib, lb, mb; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), ztprfb_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void ztprfb_( char *, char *, char *, char *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, @@ -760,13 +761,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTPQRT", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if 
possible */ if (*m == 0 || *n == 0) { - return 0; + return; } i__1 = *n; @@ -800,7 +801,7 @@ f"> */ , &ib); } } - return 0; + return; /* End of ZTPQRT */ diff --git a/lapack-netlib/SRC/ztpqrt2.c b/lapack-netlib/SRC/ztpqrt2.c index 36e082d585..659937549a 100644 --- a/lapack-netlib/SRC/ztpqrt2.c +++ b/lapack-netlib/SRC/ztpqrt2.c @@ -690,7 +690,7 @@ is composed of a triangular block and a pentagonal block, using the compact WY r /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztpqrt2_(integer *m, integer *n, integer *l, +/* Subroutine */ void ztpqrt2_(integer *m, integer *n, integer *l, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *t, integer *ldt, integer *info) { @@ -702,7 +702,7 @@ is composed of a triangular block and a pentagonal block, using the compact WY r /* Local variables */ integer i__, j, p; doublecomplex alpha; - extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgerc_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, @@ -710,7 +710,8 @@ is composed of a triangular block and a pentagonal block, using the compact WY r ztrmv_(char *, char *, char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer mp, np; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlarfg_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlarfg_( integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); @@ -755,13 +756,13 @@ is composed of a triangular block and a pentagonal block, using the compact WY r if (*info != 0) { i__1 = -(*info); xerbla_("ZTPQRT2", &i__1, (ftnlen)7); - return 0; + return; } /* Quick return if possible */ if (*n == 0 || *m == 0) { - 
return 0; + return; } i__1 = *n; @@ -875,6 +876,6 @@ is composed of a triangular block and a pentagonal block, using the compact WY r /* End of ZTPQRT2 */ - return 0; + return; } /* ztpqrt2_ */ diff --git a/lapack-netlib/SRC/ztprfb.c b/lapack-netlib/SRC/ztprfb.c index eebe038baf..558b6e0d14 100644 --- a/lapack-netlib/SRC/ztprfb.c +++ b/lapack-netlib/SRC/ztprfb.c @@ -766,7 +766,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztprfb_(char *side, char *trans, char *direct, char * +/* Subroutine */ void ztprfb_(char *side, char *trans, char *direct, char * storev, integer *m, integer *n, integer *k, integer *l, doublecomplex *v, integer *ldv, doublecomplex *t, integer *ldt, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *work, @@ -782,7 +782,7 @@ f"> */ integer i__, j; extern logical lsame_(char *, char *); logical right; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), ztrmm_(char *, char *, char *, char *, @@ -822,7 +822,7 @@ f"> */ /* Function Body */ if (*m <= 0 || *n <= 0 || *k <= 0 || *l < 0) { - return 0; + return; } if (lsame_(storev, "C")) { @@ -1609,7 +1609,7 @@ f"> */ } - return 0; + return; /* End of ZTPRFB */ diff --git a/lapack-netlib/SRC/ztprfb.f b/lapack-netlib/SRC/ztprfb.f index 2edbd05666..7b1bc17a08 100644 --- a/lapack-netlib/SRC/ztprfb.f +++ b/lapack-netlib/SRC/ztprfb.f @@ -1,4 +1,4 @@ -*> \brief \b ZTPRFB applies a real or complex "triangular-pentagonal" blocked reflector to a real or complex matrix, which is composed of two blocks. +*> \brief \b ZTPRFB applies a complex "triangular-pentagonal" block reflector to a complex matrix, which is composed of two blocks. 
* * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/ztprfs.c b/lapack-netlib/SRC/ztprfs.c index 884935f8d7..a979964b1d 100644 --- a/lapack-netlib/SRC/ztprfs.c +++ b/lapack-netlib/SRC/ztprfs.c @@ -686,7 +686,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ztprfs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void ztprfs_(char *uplo, char *trans, char *diag, integer *n, integer *nrhs, doublecomplex *ap, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *ferr, doublereal *berr, doublecomplex *work, doublereal *rwork, integer *info) @@ -704,7 +704,7 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), ztpmv_( char *, char *, char *, integer *, doublecomplex *, doublecomplex @@ -772,7 +772,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTPRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -784,7 +784,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } if (notran) { @@ -1118,7 +1118,7 @@ f"> */ /* L250: */ } - return 0; + return; /* End of ZTPRFS */ diff --git a/lapack-netlib/SRC/ztptri.c b/lapack-netlib/SRC/ztptri.c index 950162caf2..1778f4fdd4 100644 --- a/lapack-netlib/SRC/ztptri.c +++ b/lapack-netlib/SRC/ztptri.c @@ -631,7 +631,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztptri_(char *uplo, char *diag, integer *n, +/* Subroutine */ void ztptri_(char *uplo, char *diag, integer *n, doublecomplex *ap, integer *info) { /* System generated locals */ @@ 
-641,10 +641,10 @@ f"> */ /* Local variables */ integer j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int ztpmv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ztpmv_(char *, char *, char *, integer *, doublecomplex *, doublecomplex *, integer *); integer jc, jj; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); @@ -681,7 +681,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTPTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Check for singularity if non-unit. */ @@ -694,7 +694,7 @@ f"> */ jj += *info; i__2 = jj; if (ap[i__2].r == 0. && ap[i__2].i == 0.) { - return 0; + return; } /* L10: */ } @@ -704,7 +704,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = jj; if (ap[i__2].r == 0. && ap[i__2].i == 0.) { - return 0; + return; } jj = jj + *n - *info + 1; /* L20: */ @@ -776,7 +776,7 @@ f"> */ } } - return 0; + return; /* End of ZTPTRI */ diff --git a/lapack-netlib/SRC/ztptrs.c b/lapack-netlib/SRC/ztptrs.c index 0bda751312..072b8dcd26 100644 --- a/lapack-netlib/SRC/ztptrs.c +++ b/lapack-netlib/SRC/ztptrs.c @@ -643,7 +643,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ztptrs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void ztptrs_(char *uplo, char *trans, char *diag, integer *n, integer *nrhs, doublecomplex *ap, doublecomplex *b, integer *ldb, integer *info) { @@ -654,7 +654,7 @@ f"> */ integer j; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int ztpsv_(char *, char *, char *, integer *, + extern /* Subroutine */ void ztpsv_(char *, char *, char *, integer *, doublecomplex *, doublecomplex *, integer *); integer jc; extern /* Subroutine */ int xerbla_(char *, 
integer *, ftnlen); @@ -699,13 +699,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTPTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check for singularity. */ @@ -717,7 +717,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = jc + *info - 1; if (ap[i__2].r == 0. && ap[i__2].i == 0.) { - return 0; + return; } jc += *info; /* L10: */ @@ -728,7 +728,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = jc; if (ap[i__2].r == 0. && ap[i__2].i == 0.) { - return 0; + return; } jc = jc + *n - *info + 1; /* L20: */ @@ -745,7 +745,7 @@ f"> */ /* L30: */ } - return 0; + return; /* End of ZTPTRS */ diff --git a/lapack-netlib/SRC/ztpttf.c b/lapack-netlib/SRC/ztpttf.c index d7da0a712d..420fa0275f 100644 --- a/lapack-netlib/SRC/ztpttf.c +++ b/lapack-netlib/SRC/ztpttf.c @@ -718,7 +718,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztpttf_(char *transr, char *uplo, integer *n, +/* Subroutine */ void ztpttf_(char *transr, char *uplo, integer *n, doublecomplex *ap, doublecomplex *arf, integer *info) { /* System generated locals */ @@ -760,13 +760,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTPTTF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { @@ -776,7 +776,7 @@ f"> */ d_cnjg(&z__1, ap); arf[0].r = z__1.r, arf[0].i = z__1.i; } - return 0; + return; } /* Size of array ARF(0:NT-1) */ @@ -1112,7 +1112,7 @@ f"> */ } - return 0; + return; /* End of ZTPTTF */ diff --git a/lapack-netlib/SRC/ztpttr.c b/lapack-netlib/SRC/ztpttr.c index 62c5dbbe98..0f12558545 100644 --- a/lapack-netlib/SRC/ztpttr.c +++ b/lapack-netlib/SRC/ztpttr.c @@ -614,7 +614,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int 
ztpttr_(char *uplo, integer *n, doublecomplex *ap, +/* Subroutine */ void ztpttr_(char *uplo, integer *n, doublecomplex *ap, doublecomplex *a, integer *lda, integer *info) { /* System generated locals */ @@ -657,7 +657,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTPTTR", &i__1, (ftnlen)6); - return 0; + return; } if (lower) { @@ -687,7 +687,7 @@ f"> */ } - return 0; + return; /* End of ZTPTTR */ diff --git a/lapack-netlib/SRC/ztrcon.c b/lapack-netlib/SRC/ztrcon.c index 8824cbf5b4..1f5e58fe4b 100644 --- a/lapack-netlib/SRC/ztrcon.c +++ b/lapack-netlib/SRC/ztrcon.c @@ -649,7 +649,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ztrcon_(char *norm, char *uplo, char *diag, integer *n, +/* Subroutine */ void ztrcon_(char *norm, char *uplo, char *diag, integer *n, doublecomplex *a, integer *lda, doublereal *rcond, doublecomplex * work, doublereal *rwork, integer *info) { @@ -665,7 +665,7 @@ f"> */ doublereal anorm; logical upper; doublereal xnorm; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, doublecomplex *, doublereal *, integer *, integer *); extern doublereal dlamch_(char *); integer ix; @@ -673,14 +673,14 @@ f"> */ doublereal ainvnm; extern integer izamax_(integer *, doublecomplex *, integer *); logical onenrm; - extern /* Subroutine */ int zdrscl_(integer *, doublereal *, + extern /* Subroutine */ void zdrscl_(integer *, doublereal *, doublecomplex *, integer *); char normin[1]; extern doublereal zlantr_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublereal *); doublereal smlnum; logical nounit; - extern /* Subroutine */ int zlatrs_(char *, char *, char *, char *, + extern /* Subroutine */ void zlatrs_(char *, char *, char *, char *, integer *, doublecomplex *, integer *, doublecomplex *, doublereal *, doublereal *, integer *); @@ 
-723,14 +723,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTRCON", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { *rcond = 1.; - return 0; + return; } *rcond = 0.; @@ -795,7 +795,7 @@ f"> */ } L20: - return 0; + return; /* End of ZTRCON */ diff --git a/lapack-netlib/SRC/ztrevc.c b/lapack-netlib/SRC/ztrevc.c index b530b6adc2..3f1b5a96e2 100644 --- a/lapack-netlib/SRC/ztrevc.c +++ b/lapack-netlib/SRC/ztrevc.c @@ -731,7 +731,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztrevc_(char *side, char *howmny, logical *select, +/* Subroutine */ void ztrevc_(char *side, char *howmny, logical *select, integer *n, doublecomplex *t, integer *ldt, doublecomplex *vl, integer *ldvl, doublecomplex *vr, integer *ldvr, integer *mm, integer *m, doublecomplex *work, doublereal *rwork, integer *info) @@ -751,22 +751,23 @@ f"> */ extern logical lsame_(char *, char *); doublereal remax; logical leftv, bothv; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical somev; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), dlabad_(doublereal *, doublereal *); integer ii, ki; extern doublereal dlamch_(char *); integer is; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zdscal_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zdscal_( integer *, doublereal *, doublecomplex *, integer *); extern integer izamax_(integer *, doublecomplex *, integer *); logical rightv; extern doublereal dzasum_(integer *, doublecomplex *, integer *); doublereal smlnum; - extern /* Subroutine 
*/ int zlatrs_(char *, char *, char *, char *, + extern /* Subroutine */ void zlatrs_(char *, char *, char *, char *, integer *, doublecomplex *, integer *, doublecomplex *, doublereal *, doublereal *, integer *); doublereal ulp; @@ -841,13 +842,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTREVC", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible. */ if (*n == 0) { - return 0; + return; } /* Set the constants to control overflow. */ @@ -1101,7 +1102,7 @@ f"> */ } } - return 0; + return; /* End of ZTREVC */ diff --git a/lapack-netlib/SRC/ztrevc3.c b/lapack-netlib/SRC/ztrevc3.c index 8a1906f2b0..8a4e723f8f 100644 --- a/lapack-netlib/SRC/ztrevc3.c +++ b/lapack-netlib/SRC/ztrevc3.c @@ -762,7 +762,7 @@ static integer c__2 = 2; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztrevc3_(char *side, char *howmny, logical *select, +/* Subroutine */ void ztrevc3_(char *side, char *howmny, logical *select, integer *n, doublecomplex *t, integer *ldt, doublecomplex *vl, integer *ldvl, doublecomplex *vr, integer *ldvr, integer *mm, integer *m, doublecomplex *work, integer *lwork, doublereal *rwork, integer * @@ -784,16 +784,16 @@ static integer c__2 = 2; doublereal scale; extern logical lsame_(char *, char *); doublereal remax; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical leftv, bothv; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical somev; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + 
extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), dlabad_(doublereal *, doublereal *); integer nb, ii, ki; extern doublereal dlamch_(char *); @@ -801,18 +801,18 @@ static integer c__2 = 2; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zdscal_(integer *, doublereal *, + extern /* Subroutine */ void zdscal_(integer *, doublereal *, doublecomplex *, integer *); extern integer izamax_(integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); logical rightv; extern doublereal dzasum_(integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer maxwrk; doublereal smlnum; - extern /* Subroutine */ int zlatrs_(char *, char *, char *, char *, + extern /* Subroutine */ void zlatrs_(char *, char *, char *, char *, integer *, doublecomplex *, integer *, doublecomplex *, doublereal *, doublereal *, integer *); logical lquery; @@ -906,15 +906,15 @@ static integer c__2 = 2; if (*info != 0) { i__1 = -(*info); xerbla_("ZTREVC3", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible. */ if (*n == 0) { - return 0; + return; } /* Use blocked version of back-transformation if sufficient workspace. 
*/ @@ -1272,7 +1272,7 @@ static integer c__2 = 2; } } - return 0; + return; /* End of ZTREVC3 */ diff --git a/lapack-netlib/SRC/ztrexc.c b/lapack-netlib/SRC/ztrexc.c index ed8fd1dd23..aac01362c3 100644 --- a/lapack-netlib/SRC/ztrexc.c +++ b/lapack-netlib/SRC/ztrexc.c @@ -639,7 +639,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ztrexc_(char *compq, integer *n, doublecomplex *t, +/* Subroutine */ void ztrexc_(char *compq, integer *n, doublecomplex *t, integer *ldt, doublecomplex *q, integer *ldq, integer *ifst, integer * ilst, integer *info) { @@ -649,7 +649,7 @@ f"> */ /* Local variables */ doublecomplex temp; - extern /* Subroutine */ int zrot_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zrot_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublecomplex *); integer k; extern logical lsame_(char *, char *); @@ -657,7 +657,8 @@ f"> */ integer m1, m2, m3; doublereal cs; doublecomplex t11, t22, sn; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlartg_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlartg_( doublecomplex *, doublecomplex *, doublereal *, doublecomplex *, doublecomplex *); @@ -700,13 +701,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTREXC", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 1 || *ifst == *ilst) { - return 0; + return; } if (*ifst < *ilst) { @@ -770,7 +771,7 @@ f"> */ /* L10: */ } - return 0; + return; /* End of ZTREXC */ diff --git a/lapack-netlib/SRC/ztrrfs.c b/lapack-netlib/SRC/ztrrfs.c index 21a5a23d86..e7b8020deb 100644 --- a/lapack-netlib/SRC/ztrrfs.c +++ b/lapack-netlib/SRC/ztrrfs.c @@ -694,7 +694,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ztrrfs_(char 
*uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void ztrrfs_(char *uplo, char *trans, char *diag, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *x, integer *ldx, doublereal *ferr, doublereal *berr, doublecomplex *work, doublereal *rwork, integer * @@ -714,7 +714,7 @@ f"> */ extern logical lsame_(char *, char *); integer isave[3]; logical upper; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), ztrmv_( char *, char *, char *, integer *, doublecomplex *, integer *, @@ -787,7 +787,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTRRFS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -799,7 +799,7 @@ f"> */ berr[j] = 0.; /* L10: */ } - return 0; + return; } if (notran) { @@ -1127,7 +1127,7 @@ f"> */ /* L250: */ } - return 0; + return; /* End of ZTRRFS */ diff --git a/lapack-netlib/SRC/ztrsen.c b/lapack-netlib/SRC/ztrsen.c index 0881026ae7..117e562496 100644 --- a/lapack-netlib/SRC/ztrsen.c +++ b/lapack-netlib/SRC/ztrsen.c @@ -776,7 +776,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztrsen_(char *job, char *compq, logical *select, integer +/* Subroutine */ void ztrsen_(char *job, char *compq, logical *select, integer *n, doublecomplex *t, integer *ldt, doublecomplex *q, integer *ldq, doublecomplex *w, integer *m, doublereal *s, doublereal *sep, doublecomplex *work, integer *lwork, integer *info) @@ -793,21 +793,21 @@ f"> */ doublereal rnorm; integer n1, n2; doublereal rwork[1]; - extern /* Subroutine */ int zlacn2_(integer *, doublecomplex *, + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, doublecomplex *, doublereal *, integer *, 
integer *); integer nn, ks; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); logical wantbh; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); logical wantsp; - extern /* Subroutine */ int ztrexc_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void ztrexc_(char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, integer *); logical lquery; - extern /* Subroutine */ int ztrsyl_(char *, char *, integer *, integer *, + extern /* Subroutine */ void ztrsyl_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, integer *); doublereal est; @@ -890,9 +890,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTRSEN", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ @@ -990,7 +990,7 @@ f"> */ work[1].r = (doublereal) lwmin, work[1].i = 0.; - return 0; + return; /* End of ZTRSEN */ diff --git a/lapack-netlib/SRC/ztrsna.c b/lapack-netlib/SRC/ztrsna.c index 92e6c39164..26b8bdc0cd 100644 --- a/lapack-netlib/SRC/ztrsna.c +++ b/lapack-netlib/SRC/ztrsna.c @@ -760,7 +760,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztrsna_(char *job, char *howmny, logical *select, +/* Subroutine */ void ztrsna_(char *job, char *howmny, logical *select, integer *n, doublecomplex *t, integer *ldt, doublecomplex *vl, integer *ldvl, doublecomplex *vr, integer *ldvr, doublereal *s, doublereal *sep, integer *mm, integer *m, doublecomplex *work, @@ -785,7 +785,7 @@ f"> */ doublecomplex dummy[1]; logical wants; doublereal xnorm; - extern /* 
Subroutine */ int zlacn2_(integer *, doublecomplex *, + extern /* Subroutine */ void zlacn2_(integer *, doublecomplex *, doublecomplex *, doublereal *, integer *, integer *), dlabad_( doublereal *, doublereal *); extern doublereal dznrm2_(integer *, doublecomplex *, integer *), dlamch_( @@ -796,14 +796,14 @@ f"> */ logical wantbh; extern integer izamax_(integer *, doublecomplex *, integer *); logical somcon; - extern /* Subroutine */ int zdrscl_(integer *, doublereal *, + extern /* Subroutine */ void zdrscl_(integer *, doublereal *, doublecomplex *, integer *); char normin[1]; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); doublereal smlnum; logical wantsp; - extern /* Subroutine */ int zlatrs_(char *, char *, char *, char *, + extern /* Subroutine */ void zlatrs_(char *, char *, char *, char *, integer *, doublecomplex *, integer *, doublecomplex *, doublereal *, doublereal *, integer *), ztrexc_(char *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *, integer *, integer *); @@ -883,19 +883,19 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTRSNA", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (*n == 1) { if (somcon) { if (! 
select[1]) { - return 0; + return; } } if (wants) { @@ -904,7 +904,7 @@ f"> */ if (wantsp) { sep[1] = z_abs(&t[t_dim1 + 1]); } - return 0; + return; } /* Get machine constants */ @@ -1021,7 +1021,7 @@ f"> */ L50: ; } - return 0; + return; /* End of ZTRSNA */ diff --git a/lapack-netlib/SRC/ztrsyl.c b/lapack-netlib/SRC/ztrsyl.c index eff366e3b9..af8f9e761d 100644 --- a/lapack-netlib/SRC/ztrsyl.c +++ b/lapack-netlib/SRC/ztrsyl.c @@ -669,7 +669,7 @@ f"> */ /* > \ingroup complex16SYcomputational */ /* ===================================================================== */ -/* Subroutine */ int ztrsyl_(char *trana, char *tranb, integer *isgn, integer +/* Subroutine */ void ztrsyl_(char *trana, char *tranb, integer *isgn, integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *c__, integer *ldc, doublereal *scale, integer *info) @@ -691,7 +691,7 @@ f"> */ doublecomplex *, integer *); doublecomplex a11; doublereal db; - extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + extern /* Subroutine */ void dlabad_(doublereal *, doublereal *); extern doublereal dlamch_(char *); doublecomplex x11; doublereal scaloc; @@ -699,7 +699,7 @@ f"> */ extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); doublereal bignum; - extern /* Subroutine */ int zdscal_(integer *, doublereal *, + extern /* Subroutine */ void zdscal_(integer *, doublereal *, doublecomplex *, integer *); extern /* Double Complex */ VOID zladiv_(doublecomplex *, doublecomplex *, doublecomplex *); @@ -756,14 +756,14 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTRSYL", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ *scale = 1.; if (*m == 0 || *n == 0) { - return 0; + return; } /* Set constants to control overflow */ @@ -1103,7 +1103,7 @@ f"> */ } - return 0; + return; /* End of ZTRSYL */ diff --git a/lapack-netlib/SRC/ztrsyl3.c b/lapack-netlib/SRC/ztrsyl3.c new file mode 100644 
index 0000000000..09719e1d95 --- /dev/null +++ b/lapack-netlib/SRC/ztrsyl3.c @@ -0,0 +1,2027 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} 
cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} + + +/* procedure parameter types for -A and -C++ */ + +#define 
F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 
|| x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b ZTRSYL3 */ + +/* Definition: */ +/* =========== */ + + +/* > \par Purpose */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > ZTRSYL3 solves the complex Sylvester matrix equation: */ +/* > */ +/* > op(A)*X + X*op(B) = scale*C or */ +/* > op(A)*X - X*op(B) = scale*C, */ +/* > */ +/* > where op(A) = A or A**H, and A and B are both upper triangular. A is */ +/* > M-by-M and B is N-by-N; the right hand side C and the solution X are */ +/* > M-by-N; and scale is an output scale factor, set <= 1 to avoid */ +/* > overflow in X. */ +/* > */ +/* > This is the block version of the algorithm. 
*/ +/* > \endverbatim */ + +/* Arguments */ +/* ========= */ + +/* > \param[in] TRANA */ +/* > \verbatim */ +/* > TRANA is CHARACTER*1 */ +/* > Specifies the option op(A): */ +/* > = 'N': op(A) = A (No transpose) */ +/* > = 'C': op(A) = A**H (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANB */ +/* > \verbatim */ +/* > TRANB is CHARACTER*1 */ +/* > Specifies the option op(B): */ +/* > = 'N': op(B) = B (No transpose) */ +/* > = 'C': op(B) = B**H (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] ISGN */ +/* > \verbatim */ +/* > ISGN is INTEGER */ +/* > Specifies the sign in the equation: */ +/* > = +1: solve op(A)*X + X*op(B) = scale*C */ +/* > = -1: solve op(A)*X - X*op(B) = scale*C */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The order of the matrix A, and the number of rows in the */ +/* > matrices X and C. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix B, and the number of columns in the */ +/* > matrices X and C. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX*16 array, dimension (LDA,M) */ +/* > The upper triangular matrix A. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in] B */ +/* > \verbatim */ +/* > B is COMPLEX*16 array, dimension (LDB,N) */ +/* > The upper triangular matrix B. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= f2cmax(1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] C */ +/* > \verbatim */ +/* > C is COMPLEX*16 array, dimension (LDC,N) */ +/* > On entry, the M-by-N right hand side matrix C. 
*/ +/* > On exit, C is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDC */ +/* > \verbatim */ +/* > LDC is INTEGER */ +/* > The leading dimension of the array C. LDC >= f2cmax(1,M) */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is DOUBLE PRECISION */ +/* > The scale factor, scale, set <= 1 to avoid overflow in X. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SWORK */ +/* > \verbatim */ +/* > SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), */ +/* > MAX(1,COLS)). */ +/* > On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS */ +/* > and SWORK(2) returns the optimal COLS. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDSWORK */ +/* > \verbatim */ +/* > LDSWORK is INTEGER */ +/* > LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) */ +/* > and NB is the optimal block size. */ +/* > */ +/* > If LDSWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the SWORK matrix, */ +/* > returns these values as the first and second entry of the SWORK */ +/* > matrix, and no error message related LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > = 1: A and B have common or very close eigenvalues; perturbed */ +/* > values were used to solve the equation (but the matrices */ +/* > A and B are unchanged). */ +/* > \endverbatim */ + +/* > \ingroup complex16SYcomputational */ + +/* ===================================================================== */ +/* References: */ +/* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of */ +/* algorithms: The triangular Sylvester equation, ACM Transactions */ +/* on Mathematical Software (TOMS), volume 29, pages 218--243. */ + +/* A. Schwarz and C. C. 
Kjelgaard Mikkelsen (2020). Robust Task-Parallel */ +/* Solution of the Triangular Sylvester Equation. Lecture Notes in */ +/* Computer Science, vol 12043, pages 82--92, Springer. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. */ + +/* ===================================================================== */ +/* Subroutine */ void ztrsyl3_(char *trana, char *tranb, integer *isgn, + integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex + *b, integer *ldb, doublecomplex *c__, integer *ldc, doublereal *scale, + doublereal *swork, integer *ldswork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, swork_dim1, + swork_offset, i__1, i__2, i__3, i__4, i__5, i__6; + doublereal d__1, d__2, d__3, d__4; + doublecomplex z__1; + + /* Local variables */ + doublereal scal; + doublecomplex csgn; + doublereal anrm, bnrm, cnrm; + integer awrk, bwrk; + doublereal *wnrm, xnrm; + integer i__, j, k, l; + extern logical lsame_(char *, char *); + integer iinfo; + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *); + integer i1, i2, j1, j2, k1, k2, l1, l2; +// extern integer myexp_(doublereal *); + integer nb, jj, ll; + extern doublereal dlamch_(char *); + doublereal scaloc, scamin; + extern doublereal dlarmm_(doublereal *, doublereal *, doublereal *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen ); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, + integer *, doublereal *); + doublereal bignum; + extern /* Subroutine */ void zdscal_(integer *, doublereal *, + doublecomplex *, integer *), zlascl_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, integer 
*, doublecomplex * + , integer *, integer *); + logical notrna, notrnb; + doublereal smlnum; + logical lquery; + extern /* Subroutine */ void ztrsyl_(char *, char *, integer *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *, doublereal *, integer *); + integer nba, nbb; + doublereal buf, sgn; + + + +/* Decode and Test input parameters */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + swork_dim1 = *ldswork; + swork_offset = 1 + swork_dim1 * 1; + swork -= swork_offset; + + /* Function Body */ + notrna = lsame_(trana, "N"); + notrnb = lsame_(tranb, "N"); + +/* Use the same block size for all matrices. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "ZTRSYL", "", m, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + +/* Compute number of blocks in A and B */ + +/* Computing MAX */ + i__1 = 1, i__2 = (*m + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nbb = f2cmax(i__1,i__2); + +/* Compute workspace */ + + *info = 0; + lquery = *ldswork == -1; + if (lquery) { + *ldswork = 2; + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + } + +/* Test the input arguments */ + + if (! notrna && ! lsame_(trana, "C")) { + *info = -1; + } else if (! notrnb && ! 
lsame_(tranb, "C")) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < f2cmax(1,*m)) { + *info = -7; + } else if (*ldb < f2cmax(1,*n)) { + *info = -9; + } else if (*ldc < f2cmax(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZTRSYL3", &i__1, 7); + return; + } else if (lquery) { + return; + } + +/* Quick return if possible */ + + *scale = 1.; + if (*m == 0 || *n == 0) { + return; + } + + wnrm = (doublereal*)malloc(f2cmax(*m,*n)*sizeof(doublereal)); +/* Use unblocked code for small problems or if insufficient */ +/* workspace is provided */ + + if (f2cmin(nba,nbb) == 1 || *ldswork < f2cmax(nba,nbb)) { + ztrsyl_(trana, tranb, isgn, m, n, &a[a_offset], lda, &b[b_offset], + ldb, &c__[c_offset], ldc, scale, info); + return; + } + +/* Set constants to control overflow */ + + smlnum = dlamch_("S"); + bignum = 1. / smlnum; + +/* Set local scaling factors. */ + + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + i__2 = nba; + for (k = 1; k <= i__2; ++k) { + swork[k + l * swork_dim1] = 1.; + } + } + +/* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. */ +/* This scaling is to ensure compatibility with TRSYL and may get flushed. 
*/ + + buf = 1.; + +/* Compute upper bounds of blocks of A and B */ + + awrk = nbb; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nba; + for (l = k; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*m) + 1; + if (notrna) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (awrk + l) * swork_dim1] = zlange_("I", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (awrk + k) * swork_dim1] = zlange_("1", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } + } + } + bwrk = nbb + nba; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*n) + 1; + i__2 = nbb; + for (l = k; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + if (notrnb) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (bwrk + l) * swork_dim1] = zlange_("I", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (bwrk + k) * swork_dim1] = zlange_("1", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } + } + } + + sgn = (doublereal) (*isgn); + z__1.r = sgn, z__1.i = 0.; + csgn.r = z__1.r, csgn.i = z__1.i; + + if (notrna && notrnb) { + +/* Solve A*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-left corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* M L-1 */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. 
*/ +/* I=K+1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__1 = k * nb; + k2 = f2cmin(i__1,*m) + 1; + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__2 = l * nb; + l2 = f2cmin(i__2,*n) + 1; + + i__2 = k2 - k1; + i__3 = l2 - l1; + ztrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = zlange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + for (i__ = k - 1; i__ >= 1; --i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__2 = i__ * nb; + i2 = f2cmin(i__2,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = zlange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + zdscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + zdscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + z__1.r = -1., z__1.i = 0.; + zgemm_("N", "N", &i__2, &i__3, &i__4, &z__1, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + + } + + i__2 = nbb; + for (j = l + 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__3 = j * nb; + j2 = f2cmin(i__3,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = zlange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + z__1.r = -csgn.r, z__1.i = -csgn.i; + zgemm_("N", "N", &i__3, &i__4, &i__5, &z__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (! notrna && notrnb) { + +/* Solve A**H *X + ISGN*X*B = scale*C. 
*/ + +/* The (K,L)th block of X is determined starting from */ +/* upper-left corner column by column by */ + +/* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 L-1 */ +/* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] */ +/* I=1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + + i__3 = k2 - k1; + i__4 = l2 - l1; + ztrsyl_(trana, tranb, isgn, &i__3, &i__4, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__3); + } + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__3 = k2 - k1; + i__4 = l2 - l1; + xnrm = zlange_("I", &i__3, &i__4, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__3 = nba; + for (i__ = k + 1; i__ <= i__3; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__4 = i__ * nb; + i2 = f2cmin(i__4,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = i2 - i1; + i__5 = l2 - l1; + cnrm = zlange_("I", &i__4, &i__5, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + zdscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = i2 - i1; + zdscal_(&i__5, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__4 = i2 - i1; + i__5 = l2 - l1; + i__6 = k2 - k1; + z__1.r = -1., z__1.i = 0.; + zgemm_("C", "N", &i__4, &i__5, &i__6, &z__1, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + } + + i__3 = nbb; + for (j = l + 1; j <= i__3; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = k2 - k1; + i__5 = j2 - j1; + cnrm = zlange_("I", &i__4, &i__5, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__4 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + zdscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = j2 - 1; + for (jj = j1; jj <= i__4; ++jj) { + i__5 = k2 - k1; + zdscal_(&i__5, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__4 = k2 - k1; + i__5 = j2 - j1; + i__6 = l2 - l1; + z__1.r = -csgn.r, z__1.i = -csgn.i; + zgemm_("N", "N", &i__4, &i__5, &i__6, &z__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (! notrna && ! notrnb) { + +/* Solve A**H *X + ISGN*X*B**H = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* top-right corner column by column by */ + +/* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 N */ +/* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. 
*/ +/* I=1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__2 = l * nb; + l2 = f2cmin(i__2,*n) + 1; + + i__2 = k2 - k1; + i__3 = l2 - l1; + ztrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = zlange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__2 = nba; + for (i__ = k + 1; i__ <= i__2; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = i2 - i1; + i__4 = l2 - l1; + cnrm = zlange_("I", &i__3, &i__4, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = i2 - i1; + zdscal_(&i__4, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__3 = i2 - i1; + i__4 = l2 - l1; + i__5 = k2 - k1; + z__1.r = -1., z__1.i = 0.; + zgemm_("C", "N", &i__3, &i__4, &i__5, &z__1, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + } + + i__2 = l - 1; + for (j = 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__3 = j * nb; + j2 = f2cmin(i__3,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = zlange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + z__1.r = -csgn.r, z__1.i = -csgn.i; + zgemm_("N", "C", &i__3, &i__4, &i__5, &z__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (notrna && ! notrnb) { + +/* Solve A*X + ISGN*X*B**H = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-right corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) */ + +/* Where */ +/* M N */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. 
*/ +/* I=K+1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__1 = k * nb; + k2 = f2cmin(i__1,*m) + 1; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__1 = l * nb; + l2 = f2cmin(i__1,*n) + 1; + + i__1 = k2 - k1; + i__2 = l2 - l1; + ztrsyl_(trana, tranb, isgn, &i__1, &i__2, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__1 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__1); + } + i__1 = nbb; + for (jj = 1; jj <= i__1; ++jj) { + i__2 = nba; + for (ll = 1; ll <= i__2; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__3 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b18, &i__3); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__1 = k2 - k1; + i__2 = l2 - l1; + xnrm = zlange_("I", &i__1, &i__2, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__1 = k - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__2 = i__ * nb; + i2 = f2cmin(i__2,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = zlange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = k2 - k1; + zdscal_(&i__3, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + zdscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + z__1.r = -1., z__1.i = 0.; + zgemm_("N", "N", &i__2, &i__3, &i__4, &z__1, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + + } + + i__1 = l - 1; + for (j = 1; j <= i__1; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = k2 - k1; + i__3 = j2 - j1; + cnrm = zlange_("I", &i__2, &i__3, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + zdscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = j2 - 1; + for (jj = j1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + zdscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__2 = k2 - k1; + i__3 = j2 - j1; + i__4 = l2 - l1; + z__1.r = -csgn.r, z__1.i = -csgn.i; + zgemm_("N", "C", &i__2, &i__3, &i__4, &z__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + + } + + free(wnrm); + +/* Reduce local scaling factors */ + + *scale = swork[swork_dim1 + 1]; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { +/* Computing MIN */ + d__1 = *scale, d__2 = swork[k + l * swork_dim1]; + *scale = f2cmin(d__1,d__2); + } + } + if (*scale == 0.) { + +/* The magnitude of the largest entry of the solution is larger */ +/* than the product of BIGNUM**2 and cannot be represented in the */ +/* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to */ +/* zero and give up. 
*/ + + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + return; + } + +/* Realize consistent scaling */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + scal = *scale / swork[k + l * swork_dim1]; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], &c__1); + } + } + } + } + + if (buf != 1. && buf > 0.) { + +/* Decrease SCALE as much as possible. */ + +/* Computing MIN */ + d__1 = *scale / smlnum, d__2 = 1. / buf; + scaloc = f2cmin(d__1,d__2); + buf *= scaloc; + *scale /= scaloc; + } + + if (buf != 1. && buf > 0.) { + +/* In case of overly aggressive scaling during the computation, */ +/* flushing of the global scale factor may be prevented by */ +/* undoing some of the scaling. This step is to ensure that */ +/* this routine flushes only scale factors that TRSYL also */ +/* flushes and be usable as a drop-in replacement. */ + +/* How much can the normwise largest entry be upscaled? */ + +/* Computing MAX */ + i__1 = c_dim1 + 1; + d__3 = (d__1 = c__[i__1].r, abs(d__1)), d__4 = (d__2 = d_imag(&c__[ + c_dim1 + 1]), abs(d__2)); + scal = f2cmax(d__3,d__4); + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = *n; + for (l = 1; l <= i__2; ++l) { +/* Computing MAX */ + i__3 = k + l * c_dim1; + d__3 = scal, d__4 = (d__1 = c__[i__3].r, abs(d__1)), d__3 = + f2cmax(d__3,d__4), d__4 = (d__2 = d_imag(&c__[k + l * + c_dim1]), abs(d__2)); + scal = f2cmax(d__3,d__4); + } + } + +/* Increase BUF as close to 1 as possible and apply scaling. */ + +/* Computing MIN */ + d__1 = bignum / scal, d__2 = 1. 
/ buf; + scaloc = f2cmin(d__1,d__2); + buf *= scaloc; + zlascl_("G", &c_n1, &c_n1, &c_b106, &scaloc, m, n, &c__[c_offset], + ldc, &iinfo); + } + +/* Combine with buffer scaling factor. SCALE will be flushed if */ +/* BUF is less than one here. */ + + *scale *= buf; + +/* Restore workspace dimensions */ + + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + + return; + +/* End of ZTRSYL3 */ + +} /* ztrsyl3_ */ + diff --git a/lapack-netlib/SRC/ztrsyl3.f b/lapack-netlib/SRC/ztrsyl3.f new file mode 100644 index 0000000000..b5a058da4e --- /dev/null +++ b/lapack-netlib/SRC/ztrsyl3.f @@ -0,0 +1,1142 @@ +*> \brief \b ZTRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> ZTRSYL3 solves the complex Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**H, and A and B are both upper triangular. A is +*> M-by-M and B is N-by-N; the right hand side C and the solution X are +*> M-by-N; and scale is an output scale factor, set <= 1 to avoid +*> overflow in X. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'C': op(A) = A**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'C': op(B) = B**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. 
+*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,M) +*> The upper triangular matrix A. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (LDB,N) +*> The upper triangular matrix B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX*16 array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), +*> MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. +*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. 
+*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +*> \ingroup complex16SYcomputational +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE ZTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, SWORK, LDSWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, LDSWORK, M, N + DOUBLE PRECISION SCALE +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) + DOUBLE PRECISION SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D0, 0.0D0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB + DOUBLE PRECISION ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM + COMPLEX*16 CSGN +* .. +* .. Local Arrays .. + DOUBLE PRECISION WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. 
+ LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLARMM, ZLANGE + EXTERNAL DLAMCH, DLARMM, ILAENV, LSAME, ZLANGE +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZDSCAL, ZGEMM, ZLASCL, ZTRSYL +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, DIMAG, EXPONENT, MAX, MIN +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX( 8, ILAENV( 1, 'ZTRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LDSWORK.EQ.-1 ) + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT. LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT. LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZTRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + SCALE = ONE + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems or if insufficient +* workspace is provided +* + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + CALL ZTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = DLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Set local scaling factors. 
+* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = K, NBA + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, M ) + 1 + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = ZLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = ZLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, N ) + 1 + DO L = K, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = ZLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = ZLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = DBLE( ISGN ) + CSGN = DCMPLX( SGN, ZERO ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. 
+* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. 
+* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. 
+ BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B = scale*C. 
+* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. 
+ SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. 
.NOT.NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. 
+ SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. 
.NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. 
+ SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to +* zero and give up. +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF +* + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? 
+* + SCAL = MAX( ABS( DBLE( C( 1, 1 ) ) ), + $ ABS( DIMAG( C ( 1, 1 ) ) ) ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( DBLE ( C( K, L ) ) ), + $ ABS( DIMAG ( C( K, L ) ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. +* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL ZLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IINFO ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of ZTRSYL3 +* + END diff --git a/lapack-netlib/SRC/ztrti2.c b/lapack-netlib/SRC/ztrti2.c index a710e1d7be..dbd3db68ee 100644 --- a/lapack-netlib/SRC/ztrti2.c +++ b/lapack-netlib/SRC/ztrti2.c @@ -624,7 +624,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ztrti2_(char *uplo, char *diag, integer *n, +/* Subroutine */ void ztrti2_(char *uplo, char *diag, integer *n, doublecomplex *a, integer *lda, integer *info) { /* System generated locals */ @@ -634,11 +634,12 @@ f"> */ /* Local variables */ integer j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); logical upper; - extern /* Subroutine */ int ztrmv_(char *, char *, char *, integer *, - doublecomplex *, integer *, doublecomplex *, integer *), xerbla_(char *, integer *, ftnlen); + extern /* Subroutine */ void ztrmv_(char *, char *, char *, integer *, + doublecomplex *, integer *, doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical nounit; doublecomplex ajj; @@ -675,7 +676,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTRTI2", &i__1, (ftnlen)6); - return 0; + return; } if (upper) { @@ 
-735,7 +736,7 @@ f"> */ } } - return 0; + return; /* End of ZTRTI2 */ diff --git a/lapack-netlib/SRC/ztrtri.c b/lapack-netlib/SRC/ztrtri.c index 2c93106348..be2f6acbb4 100644 --- a/lapack-netlib/SRC/ztrtri.c +++ b/lapack-netlib/SRC/ztrtri.c @@ -625,7 +625,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ztrtri_(char *uplo, char *diag, integer *n, +/* Subroutine */ void ztrtri_(char *uplo, char *diag, integer *n, doublecomplex *a, integer *lda, integer *info) { /* System generated locals */ @@ -638,16 +638,16 @@ f"> */ integer j; extern logical lsame_(char *, char *); logical upper; - extern /* Subroutine */ int ztrmm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrmm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *), ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); integer jb, nb, nn; - extern /* Subroutine */ int ztrti2_(char *, char *, integer *, - doublecomplex *, integer *, integer *), xerbla_( - char *, integer *, ftnlen); + extern /* Subroutine */ void ztrti2_(char *, char *, integer *, + doublecomplex *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); logical nounit; @@ -685,13 +685,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTRTRI", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check for singularity if non-unit. */ @@ -701,7 +701,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info + *info * a_dim1; if (a[i__2].r == 0. && a[i__2].i == 0.) 
{ - return 0; + return; } /* L10: */ } @@ -785,7 +785,7 @@ f"> */ } } - return 0; + return; /* End of ZTRTRI */ diff --git a/lapack-netlib/SRC/ztrtrs.c b/lapack-netlib/SRC/ztrtrs.c index 19058a3ea8..6d1f71a043 100644 --- a/lapack-netlib/SRC/ztrtrs.c +++ b/lapack-netlib/SRC/ztrtrs.c @@ -652,7 +652,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ztrtrs_(char *uplo, char *trans, char *diag, integer *n, +/* Subroutine */ void ztrtrs_(char *uplo, char *trans, char *diag, integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, integer *info) { @@ -661,10 +661,10 @@ f"> */ /* Local variables */ extern logical lsame_(char *, char *); - extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, + extern /* Subroutine */ void ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, - doublecomplex *, integer *), - xerbla_(char *, integer *, ftnlen); + doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical nounit; @@ -709,13 +709,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTRTRS", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Check for singularity. */ @@ -725,7 +725,7 @@ f"> */ for (*info = 1; *info <= i__1; ++(*info)) { i__2 = *info + *info * a_dim1; if (a[i__2].r == 0. && a[i__2].i == 0.) 
{ - return 0; + return; } /* L10: */ } @@ -737,7 +737,7 @@ f"> */ ztrsm_("Left", uplo, trans, diag, n, nrhs, &c_b2, &a[a_offset], lda, &b[ b_offset], ldb); - return 0; + return; /* End of ZTRTRS */ diff --git a/lapack-netlib/SRC/ztrttf.c b/lapack-netlib/SRC/ztrttf.c index 453fe099dc..692e5e8450 100644 --- a/lapack-netlib/SRC/ztrttf.c +++ b/lapack-netlib/SRC/ztrttf.c @@ -726,7 +726,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztrttf_(char *transr, char *uplo, integer *n, +/* Subroutine */ void ztrttf_(char *transr, char *uplo, integer *n, doublecomplex *a, integer *lda, doublecomplex *arf, integer *info) { /* System generated locals */ @@ -776,7 +776,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTRTTF", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ @@ -790,7 +790,7 @@ f"> */ arf[0].r = z__1.r, arf[0].i = z__1.i; } } - return 0; + return; } /* Size of array ARF(1:2,0:nt-1) */ @@ -1122,7 +1122,7 @@ f"> */ } - return 0; + return; /* End of ZTRTTF */ diff --git a/lapack-netlib/SRC/ztrttp.c b/lapack-netlib/SRC/ztrttp.c index 8416e609e9..95624b1dba 100644 --- a/lapack-netlib/SRC/ztrttp.c +++ b/lapack-netlib/SRC/ztrttp.c @@ -614,7 +614,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int ztrttp_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void ztrttp_(char *uplo, integer *n, doublecomplex *a, integer *lda, doublecomplex *ap, integer *info) { /* System generated locals */ @@ -657,7 +657,7 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTRTTP", &i__1, (ftnlen)6); - return 0; + return; } if (lower) { @@ -687,7 +687,7 @@ f"> */ } - return 0; + return; /* End of ZTRTTP */ diff --git a/lapack-netlib/SRC/ztzrzf.c b/lapack-netlib/SRC/ztzrzf.c index 453c18ea16..6d03d2d545 100644 --- a/lapack-netlib/SRC/ztzrzf.c +++ 
b/lapack-netlib/SRC/ztzrzf.c @@ -667,7 +667,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int ztzrzf_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void ztzrzf_(integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *work, integer *lwork, integer *info) { @@ -680,13 +680,13 @@ f"> */ extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); integer lwkmin, ldwork; - extern /* Subroutine */ int zlarzb_(char *, char *, char *, char *, + extern /* Subroutine */ void zlarzb_(char *, char *, char *, char *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int zlarzt_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zlarzt_(char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zlatrz_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *); @@ -745,15 +745,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZTZRZF", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0) { - return 0; + return; } else if (*m == *n) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { @@ -761,7 +761,7 @@ f"> */ tau[i__2].r = 0., tau[i__2].i = 0.; /* L10: */ } - return 0; + return; } nbmin = 2; @@ -859,7 +859,7 @@ f"> */ work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZTZRZF */ diff --git a/lapack-netlib/SRC/zunbdb.c b/lapack-netlib/SRC/zunbdb.c index 3ccb43733a..5d9d3541af 100644 --- a/lapack-netlib/SRC/zunbdb.c +++ b/lapack-netlib/SRC/zunbdb.c @@ -798,7 +798,7 @@ f"> */ /* > Algorithms, 50(1):33-65, 2009. 
*/ /* > */ /* ===================================================================== */ -/* Subroutine */ int zunbdb_(char *trans, char *signs, integer *m, integer *p, +/* Subroutine */ void zunbdb_(char *trans, char *signs, integer *m, integer *p, integer *q, doublecomplex *x11, integer *ldx11, doublecomplex *x12, integer *ldx12, doublecomplex *x21, integer *ldx21, doublecomplex * x22, integer *ldx22, doublereal *theta, doublereal *phi, @@ -816,18 +816,19 @@ f"> */ logical colmajor; integer lworkmin, lworkopt, i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); doublereal z1, z2, z3, z4; - extern /* Subroutine */ int zaxpy_(integer *, doublecomplex *, + extern /* Subroutine */ void zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); extern doublereal dznrm2_(integer *, doublecomplex *, integer *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlacgv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlacgv_( integer *, doublecomplex *, integer *); logical lquery; - extern /* Subroutine */ int zlarfgp_(integer *, doublecomplex *, + extern /* Subroutine */ void zlarfgp_(integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); @@ -933,9 +934,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("xORBDB", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Handle column-major and row-major separately */ @@ -1427,7 +1428,7 @@ f"> */ } - return 0; + return; /* End of ZUNBDB */ diff --git a/lapack-netlib/SRC/zunbdb1.c b/lapack-netlib/SRC/zunbdb1.c index 73378a73a4..1759c26021 100644 --- a/lapack-netlib/SRC/zunbdb1.c +++ b/lapack-netlib/SRC/zunbdb1.c @@ 
-715,7 +715,7 @@ static integer c__1 = 1; /* > Algorithms, 50(1):33-65, 2009. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zunbdb1_(integer *m, integer *p, integer *q, +/* Subroutine */ void zunbdb1_(integer *m, integer *p, integer *q, doublecomplex *x11, integer *ldx11, doublecomplex *x21, integer * ldx21, doublereal *theta, doublereal *phi, doublecomplex *taup1, doublecomplex *taup2, doublecomplex *tauq1, doublecomplex *work, @@ -733,17 +733,18 @@ static integer c__1 = 1; integer i__; doublereal s; integer ilarf, llarf, childinfo; - extern /* Subroutine */ int zlarf_(char *, integer *, integer *, + extern /* Subroutine */ void zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *), zdrot_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *); extern doublereal dznrm2_(integer *, doublecomplex *, integer *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlacgv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlacgv_( integer *, doublecomplex *, integer *); logical lquery; integer iorbdb5, lorbdb5; - extern /* Subroutine */ int zunbdb5_(integer *, integer *, integer *, + extern /* Subroutine */ void zunbdb5_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), zlarfgp_(integer *, @@ -817,9 +818,9 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("ZUNBDB1", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Reduce columns 1, ..., Q of X11 and X21 */ @@ -899,7 +900,7 @@ static integer c__1 = 1; } - return 0; + return; /* End of ZUNBDB1 */ diff --git a/lapack-netlib/SRC/zunbdb2.c b/lapack-netlib/SRC/zunbdb2.c index 0277fe21b4..0b0f9e6cdb 100644 --- 
a/lapack-netlib/SRC/zunbdb2.c +++ b/lapack-netlib/SRC/zunbdb2.c @@ -714,7 +714,7 @@ static integer c__1 = 1; /* > Algorithms, 50(1):33-65, 2009. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zunbdb2_(integer *m, integer *p, integer *q, +/* Subroutine */ void zunbdb2_(integer *m, integer *p, integer *q, doublecomplex *x11, integer *ldx11, doublecomplex *x21, integer * ldx21, doublereal *theta, doublereal *phi, doublecomplex *taup1, doublecomplex *taup2, doublecomplex *tauq1, doublecomplex *work, @@ -732,20 +732,21 @@ static integer c__1 = 1; integer i__; doublereal s; integer ilarf, llarf; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); integer childinfo; - extern /* Subroutine */ int zlarf_(char *, integer *, integer *, + extern /* Subroutine */ void zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *), zdrot_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *); extern doublereal dznrm2_(integer *, doublecomplex *, integer *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlacgv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlacgv_( integer *, doublecomplex *, integer *); logical lquery; integer iorbdb5, lorbdb5; - extern /* Subroutine */ int zunbdb5_(integer *, integer *, integer *, + extern /* Subroutine */ void zunbdb5_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), zlarfgp_(integer *, @@ -818,9 +819,9 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("ZUNBDB2", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Reduce rows 1, 
..., P of X11 and X21 */ @@ -916,7 +917,7 @@ static integer c__1 = 1; x21[i__ + (i__ + 1) * x21_dim1], ldx21, &work[ilarf]); } - return 0; + return; /* End of ZUNBDB2 */ diff --git a/lapack-netlib/SRC/zunbdb2.f b/lapack-netlib/SRC/zunbdb2.f index 412d8d8d08..46b08aa1ed 100644 --- a/lapack-netlib/SRC/zunbdb2.f +++ b/lapack-netlib/SRC/zunbdb2.f @@ -122,14 +122,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is COMPLEX*16 array, dimension (P) +*> TAUP1 is COMPLEX*16 array, dimension (P-1) *> The scalar factors of the elementary reflectors that define *> P1. *> \endverbatim *> *> \param[out] TAUP2 *> \verbatim -*> TAUP2 is COMPLEX*16 array, dimension (M-P) +*> TAUP2 is COMPLEX*16 array, dimension (Q) *> The scalar factors of the elementary reflectors that define *> P2. *> \endverbatim diff --git a/lapack-netlib/SRC/zunbdb3.c b/lapack-netlib/SRC/zunbdb3.c index 2dab207a51..4c6354a74e 100644 --- a/lapack-netlib/SRC/zunbdb3.c +++ b/lapack-netlib/SRC/zunbdb3.c @@ -713,7 +713,7 @@ static integer c__1 = 1; /* > Algorithms, 50(1):33-65, 2009. 
*/ /* > */ /* ===================================================================== */ -/* Subroutine */ int zunbdb3_(integer *m, integer *p, integer *q, +/* Subroutine */ void zunbdb3_(integer *m, integer *p, integer *q, doublecomplex *x11, integer *ldx11, doublecomplex *x21, integer * ldx21, doublereal *theta, doublereal *phi, doublecomplex *taup1, doublecomplex *taup2, doublecomplex *tauq1, doublecomplex *work, @@ -731,17 +731,18 @@ static integer c__1 = 1; integer i__; doublereal s; integer ilarf, llarf, childinfo; - extern /* Subroutine */ int zlarf_(char *, integer *, integer *, + extern /* Subroutine */ void zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *), zdrot_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *); extern doublereal dznrm2_(integer *, doublecomplex *, integer *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlacgv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlacgv_( integer *, doublecomplex *, integer *); logical lquery; integer iorbdb5, lorbdb5; - extern /* Subroutine */ int zunbdb5_(integer *, integer *, integer *, + extern /* Subroutine */ void zunbdb5_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), zlarfgp_(integer *, @@ -814,9 +815,9 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("ZUNBDB3", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Reduce rows 1, ..., M-P of X11 and X21 */ @@ -911,7 +912,7 @@ static integer c__1 = 1; x11[i__ + (i__ + 1) * x11_dim1], ldx11, &work[ilarf]); } - return 0; + return; /* End of ZUNBDB3 */ diff --git a/lapack-netlib/SRC/zunbdb4.c b/lapack-netlib/SRC/zunbdb4.c index cdb8d7c0ca..db8bb894e9 100644 --- a/lapack-netlib/SRC/zunbdb4.c 
+++ b/lapack-netlib/SRC/zunbdb4.c @@ -725,7 +725,7 @@ static integer c__1 = 1; /* > Algorithms, 50(1):33-65, 2009. */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zunbdb4_(integer *m, integer *p, integer *q, +/* Subroutine */ void zunbdb4_(integer *m, integer *p, integer *q, doublecomplex *x11, integer *ldx11, doublecomplex *x21, integer * ldx21, doublereal *theta, doublereal *phi, doublecomplex *taup1, doublecomplex *taup2, doublecomplex *tauq1, doublecomplex *phantom, @@ -743,20 +743,21 @@ static integer c__1 = 1; integer i__, j; doublereal s; integer ilarf, llarf; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *); integer childinfo; - extern /* Subroutine */ int zlarf_(char *, integer *, integer *, + extern /* Subroutine */ void zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *), zdrot_(integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *); extern doublereal dznrm2_(integer *, doublecomplex *, integer *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlacgv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlacgv_( integer *, doublecomplex *, integer *); logical lquery; integer iorbdb5, lorbdb5; - extern /* Subroutine */ int zunbdb5_(integer *, integer *, integer *, + extern /* Subroutine */ void zunbdb5_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), zlarfgp_(integer *, @@ -832,9 +833,9 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("ZUNBDB4", &i__1, (ftnlen)7); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Reduce columns 1, ..., M-Q of X11 and 
X21 */ @@ -986,7 +987,7 @@ static integer c__1 = 1; zlacgv_(&i__2, &x21[*m - *q + i__ - *p + i__ * x21_dim1], ldx21); } - return 0; + return; /* End of ZUNBDB4 */ diff --git a/lapack-netlib/SRC/zunbdb4.f b/lapack-netlib/SRC/zunbdb4.f index b1fcd8bd03..4672cfa67b 100644 --- a/lapack-netlib/SRC/zunbdb4.f +++ b/lapack-netlib/SRC/zunbdb4.f @@ -124,14 +124,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is COMPLEX*16 array, dimension (P) +*> TAUP1 is COMPLEX*16 array, dimension (M-Q) *> The scalar factors of the elementary reflectors that define *> P1. *> \endverbatim *> *> \param[out] TAUP2 *> \verbatim -*> TAUP2 is COMPLEX*16 array, dimension (M-P) +*> TAUP2 is COMPLEX*16 array, dimension (M-Q) *> The scalar factors of the elementary reflectors that define *> P2. *> \endverbatim diff --git a/lapack-netlib/SRC/zunbdb5.c b/lapack-netlib/SRC/zunbdb5.c index f0fd4fbdf0..c92452ad27 100644 --- a/lapack-netlib/SRC/zunbdb5.c +++ b/lapack-netlib/SRC/zunbdb5.c @@ -664,7 +664,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zunbdb5_(integer *m1, integer *m2, integer *n, +/* Subroutine */ void zunbdb5_(integer *m1, integer *m2, integer *n, doublecomplex *x1, integer *incx1, doublecomplex *x2, integer *incx2, doublecomplex *q1, integer *ldq1, doublecomplex *q2, integer *ldq2, doublecomplex *work, integer *lwork, integer *info) @@ -676,7 +676,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Local variables */ integer i__, j, childinfo; extern doublereal dznrm2_(integer *, doublecomplex *, integer *); - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zunbdb6_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zunbdb6_( integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, 
doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *) @@ -728,7 +729,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNBDB5", &i__1, (ftnlen)7); - return 0; + return; } /* Project X onto the orthogonal complement of Q */ @@ -741,7 +742,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ d__1 = dznrm2_(m1, &x1[1], incx1); d__2 = dznrm2_(m2, &x2[1], incx2); if (d__1 != 0. || d__2 != 0.) { - return 0; + return; } /* Project each standard basis vector e_1,...,e_M1 in turn, stopping */ @@ -766,7 +767,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ d__1 = dznrm2_(m1, &x1[1], incx1); d__2 = dznrm2_(m2, &x2[1], incx2); if (d__1 != 0. || d__2 != 0.) { - return 0; + return; } } @@ -792,11 +793,11 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ d__1 = dznrm2_(m1, &x1[1], incx1); d__2 = dznrm2_(m2, &x2[1], incx2); if (d__1 != 0. || d__2 != 0.) 
{ - return 0; + return; } } - return 0; + return; /* End of ZUNBDB5 */ diff --git a/lapack-netlib/SRC/zunbdb6.c b/lapack-netlib/SRC/zunbdb6.c index a41ccd3412..e68eb562b2 100644 --- a/lapack-netlib/SRC/zunbdb6.c +++ b/lapack-netlib/SRC/zunbdb6.c @@ -669,7 +669,7 @@ static integer c__1 = 1; /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zunbdb6_(integer *m1, integer *m2, integer *n, +/* Subroutine */ void zunbdb6_(integer *m1, integer *m2, integer *n, doublecomplex *x1, integer *incx1, doublecomplex *x2, integer *incx2, doublecomplex *q1, integer *ldq1, doublecomplex *q2, integer *ldq2, doublecomplex *work, integer *lwork, integer *info) @@ -680,10 +680,11 @@ static integer c__1 = 1; /* Local variables */ integer i__; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, - integer *, doublecomplex *, doublecomplex *, integer *), - xerbla_(char *, integer *, ftnlen), zlassq_(integer *, + integer *, doublecomplex *, doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zlassq_(integer *, doublecomplex *, integer *, doublereal *, doublereal *); doublereal normsq1, normsq2, scl1, scl2, ssq1, ssq2; @@ -733,7 +734,7 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); xerbla_("ZUNBDB6", &i__1, (ftnlen)7); - return 0; + return; } /* First, project X onto the orthogonal complement of Q's column */ @@ -787,11 +788,11 @@ static integer c__1 = 1; /* Otherwise, project again. */ if (normsq2 >= normsq1 * .01) { - return 0; + return; } if (normsq2 == 0.) 
{ - return 0; + return; } normsq1 = normsq2; @@ -850,7 +851,7 @@ static integer c__1 = 1; } } - return 0; + return; /* End of ZUNBDB6 */ diff --git a/lapack-netlib/SRC/zunbdb6.f b/lapack-netlib/SRC/zunbdb6.f index ec681b597a..ed666e449b 100644 --- a/lapack-netlib/SRC/zunbdb6.f +++ b/lapack-netlib/SRC/zunbdb6.f @@ -41,10 +41,16 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The columns of Q must be orthonormal. +*> The Euclidean norm of X must be one and the columns of Q must be +*> orthonormal. The orthogonalized vector will be zero if and only if it +*> lies entirely in the range of Q. *> -*> If the projection is zero according to Kahan's "twice is enough" -*> criterion, then the zero vector is returned. +*> The projection is computed with at most two iterations of the +*> classical Gram-Schmidt algorithm, see +*> * L. Giraud, J. Langou, M. Rozložník. "On the round-off error +*> analysis of the Gram-Schmidt algorithm with reorthogonalization." +*> 2002. CERFACS Technical Report No. TR/PA/02/33. URL: +*> https://www.cerfacs.fr/algor/reports/2002/TR_PA_02_33.pdf *> *>\endverbatim * @@ -167,16 +173,19 @@ SUBROUTINE ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, * ===================================================================== * * .. Parameters .. - DOUBLE PRECISION ALPHASQ, REALONE, REALZERO - PARAMETER ( ALPHASQ = 0.01D0, REALONE = 1.0D0, + DOUBLE PRECISION ALPHA, REALONE, REALZERO + PARAMETER ( ALPHA = 0.01D0, REALONE = 1.0D0, $ REALZERO = 0.0D0 ) COMPLEX*16 NEGONE, ONE, ZERO PARAMETER ( NEGONE = (-1.0D0,0.0D0), ONE = (1.0D0,0.0D0), $ ZERO = (0.0D0,0.0D0) ) * .. * .. Local Scalars .. - INTEGER I - DOUBLE PRECISION NORMSQ1, NORMSQ2, SCL1, SCL2, SSQ1, SSQ2 + INTEGER I, IX + DOUBLE PRECISION EPS, NORM, NORM_NEW, SCL, SSQ +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH * .. * .. External Subroutines .. 
EXTERNAL ZGEMV, ZLASSQ, XERBLA @@ -211,17 +220,17 @@ SUBROUTINE ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, CALL XERBLA( 'ZUNBDB6', -INFO ) RETURN END IF +* + EPS = DLAMCH( 'Precision' ) * * First, project X onto the orthogonal complement of Q's column * space * - SCL1 = REALZERO - SSQ1 = REALONE - CALL ZLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL ZLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ1 = SCL1**2*SSQ1 + SCL2**2*SSQ2 +* Christoph Conrads: In debugging mode the norm should be computed +* and an assertion added comparing the norm with one. Alas, Fortran +* never made it into 1989 when assert() was introduced into the C +* programming language. + NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N @@ -239,27 +248,31 @@ SUBROUTINE ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, CALL ZGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL ZLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL ZLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL ZLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL ZLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If projection is sufficiently large in norm, then stop. * If projection is zero, then stop. * Otherwise, project again. * - IF( NORMSQ2 .GE. ALPHASQ*NORMSQ1 ) THEN + IF( NORM_NEW .GE. ALPHA * NORM ) THEN RETURN END IF * - IF( NORMSQ2 .EQ. ZERO ) THEN + IF( NORM_NEW .LE. 
N * EPS * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1( IX ) = ZERO + END DO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2( IX ) = ZERO + END DO RETURN END IF * - NORMSQ1 = NORMSQ2 + NORM = NORM_NEW * DO I = 1, N WORK(I) = ZERO @@ -281,24 +294,22 @@ SUBROUTINE ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, CALL ZGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL ZLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL ZLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL ZLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL ZLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If second projection is sufficiently large in norm, then do * nothing more. Alternatively, if it shrunk significantly, then * truncate it to zero. * - IF( NORMSQ2 .LT. ALPHASQ*NORMSQ1 ) THEN - DO I = 1, M1 - X1(I) = ZERO + IF( NORM_NEW .LT. ALPHA * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1(IX) = ZERO END DO - DO I = 1, M2 - X2(I) = ZERO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2(IX) = ZERO END DO END IF * @@ -307,4 +318,3 @@ SUBROUTINE ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, * End of ZUNBDB6 * END - diff --git a/lapack-netlib/SRC/zuncsd.c b/lapack-netlib/SRC/zuncsd.c index b487a0f8ba..2e8cef7993 100644 --- a/lapack-netlib/SRC/zuncsd.c +++ b/lapack-netlib/SRC/zuncsd.c @@ -829,7 +829,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zuncsd_(char *jobu1, char *jobu2, char *jobv1t, char * +/* Subroutine */ void zuncsd_(char *jobu1, char *jobu2, char *jobv1t, char * jobv2t, char *trans, char *signs, integer *m, integer *p, integer *q, doublecomplex *x11, integer *ldx11, doublecomplex *x12, integer * ldx12, doublecomplex *x21, integer *ldx21, doublecomplex *x22, @@ -856,7 +856,7 @@ f"> */ lorbdbworkmin, lrworkmin, 
lbbcsdworkopt; logical wantu1, wantu2; integer lrworkopt, ibbcsd, lorbdbworkopt, iorbdb, lorglqworkmin; - extern /* Subroutine */ int zbbcsd_(char *, char *, char *, char *, char * + extern /* Subroutine */ void zbbcsd_(char *, char *, char *, char *, char * , integer *, integer *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, @@ -866,29 +866,29 @@ f"> */ integer lorgqrworkmin; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer lorglqworkopt; - extern /* Subroutine */ int zunbdb_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunbdb_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, integer *); integer lorgqrworkopt, iorglq; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer iorgqr; - extern /* Subroutine */ int zlapmr_(logical *, integer *, integer *, + extern /* Subroutine */ void zlapmr_(logical *, integer *, integer *, doublecomplex *, integer *, integer *); char signst[1]; - extern /* Subroutine */ int zlapmt_(logical *, integer *, integer *, + extern /* Subroutine */ void zlapmt_(logical *, integer *, integer *, doublecomplex *, integer *, integer *); char transt[1]; integer lbbcsdwork; logical lquery; - extern /* Subroutine */ int zunglq_(integer *, integer *, integer *, + extern /* Subroutine */ void zunglq_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); integer lorbdbwork; - extern /* Subroutine */ int zungqr_(integer *, integer *, integer *, + extern /* 
Subroutine */ void zungqr_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); integer lorglqwork, lorgqrwork; @@ -1017,7 +1017,7 @@ f"> */ ldv1t, &v2t[v2t_offset], ldv2t, &u1[u1_offset], ldu1, &u2[ u2_offset], ldu2, &work[1], lwork, &rwork[1], lrwork, &iwork[ 1], info); - return 0; + return; } /* Work with permutation [ 0 I; I 0 ] * X * [ 0 I; I 0 ] if */ @@ -1037,7 +1037,7 @@ f"> */ u2_offset], ldu2, &u1[u1_offset], ldu1, &v2t[v2t_offset], ldv2t, &v1t[v1t_offset], ldv1t, &work[1], lwork, &rwork[1], lrwork, &iwork[1], info); - return 0; + return; } /* Compute workspace */ @@ -1153,9 +1153,9 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNCSD", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery || lrquery) { - return 0; + return; } /* Transform to bidiagonal block form */ @@ -1334,7 +1334,7 @@ f"> */ } } - return 0; + return; /* End ZUNCSD */ diff --git a/lapack-netlib/SRC/zuncsd2by1.c b/lapack-netlib/SRC/zuncsd2by1.c index 82e996435d..23b894d9ac 100644 --- a/lapack-netlib/SRC/zuncsd2by1.c +++ b/lapack-netlib/SRC/zuncsd2by1.c @@ -766,7 +766,7 @@ by1.f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zuncsd2by1_(char *jobu1, char *jobu2, char *jobv1t, +/* Subroutine */ void zuncsd2by1_(char *jobu1, char *jobu2, char *jobv1t, integer *m, integer *p, integer *q, doublecomplex *x11, integer * ldx11, doublecomplex *x21, integer *ldx21, doublereal *theta, doublecomplex *u1, integer *ldu1, doublecomplex *u2, integer *ldu2, @@ -784,30 +784,31 @@ by1.f"> */ integer iphi, lworkmin, lworkopt, i__, j, r__; extern logical lsame_(char *, char *); integer childinfo; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer lorglqmin, lorgqrmin, lorglqopt, lrworkmin, 
itaup1, itaup2, itauq1, lorgqropt; logical wantu1, wantu2; integer lrworkopt, ibbcsd, lbbcsd, iorbdb, lorbdb; - extern /* Subroutine */ int zbbcsd_(char *, char *, char *, char *, char * + extern /* Subroutine */ void zbbcsd_(char *, char *, char *, char *, char * , integer *, integer *, integer *, doublereal *, doublereal *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *, integer *, integer *), xerbla_(char *, integer *, ftnlen); + doublereal *, integer *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer iorglq, lorglq; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer iorgqr; - extern /* Subroutine */ int zlapmr_(logical *, integer *, integer *, + extern /* Subroutine */ void zlapmr_(logical *, integer *, integer *, doublecomplex *, integer *, integer *); integer lorgqr; - extern /* Subroutine */ int zlapmt_(logical *, integer *, integer *, + extern /* Subroutine */ void zlapmt_(logical *, integer *, integer *, doublecomplex *, integer *, integer *); logical lquery; - extern /* Subroutine */ int zunglq_(integer *, integer *, integer *, + extern /* Subroutine */ void zunglq_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), zungqr_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, @@ -1155,9 +1156,9 @@ by1.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNCSD2BY1", &i__1, (ftnlen)10); - return 0; + return; } else if (lquery) { - return 0; + return; } lorgqr = *lwork - iorgqr + 1; lorglq = *lwork - iorglq + 1; @@ -1481,7 +1482,7 @@ by1.f"> */ } } - return 0; + return; /* End of ZUNCSD2BY1 */ 
diff --git a/lapack-netlib/SRC/zung2l.c b/lapack-netlib/SRC/zung2l.c index a0ecf4be4d..fccecb341d 100644 --- a/lapack-netlib/SRC/zung2l.c +++ b/lapack-netlib/SRC/zung2l.c @@ -628,7 +628,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zung2l_(integer *m, integer *n, integer *k, +/* Subroutine */ void zung2l_(integer *m, integer *n, integer *k, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex * work, integer *info) { @@ -638,7 +638,7 @@ f"> */ /* Local variables */ integer i__, j, l; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); @@ -678,13 +678,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNG2L", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } /* Initialise columns 1:n-k to columns of the unit matrix */ @@ -733,7 +733,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of ZUNG2L */ diff --git a/lapack-netlib/SRC/zung2r.c b/lapack-netlib/SRC/zung2r.c index ea2914908e..75d48a91ae 100644 --- a/lapack-netlib/SRC/zung2r.c +++ b/lapack-netlib/SRC/zung2r.c @@ -627,7 +627,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zung2r_(integer *m, integer *n, integer *k, +/* Subroutine */ void zung2r_(integer *m, integer *n, integer *k, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex * work, integer *info) { @@ -637,10 +637,11 @@ f"> */ /* Local variables */ integer i__, j, l; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, 
integer *), zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *), xerbla_(char *, integer *, ftnlen); + integer *, doublecomplex *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -675,13 +676,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNG2R", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } /* Initialise columns k+1:n to columns of the unit matrix */ @@ -732,7 +733,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of ZUNG2R */ diff --git a/lapack-netlib/SRC/zungbr.c b/lapack-netlib/SRC/zungbr.c index 61b335d636..7033dc2bf8 100644 --- a/lapack-netlib/SRC/zungbr.c +++ b/lapack-netlib/SRC/zungbr.c @@ -670,7 +670,7 @@ f"> */ /* > \ingroup complex16GBcomputational */ /* ===================================================================== */ -/* Subroutine */ int zungbr_(char *vect, integer *m, integer *n, integer *k, +/* Subroutine */ void zungbr_(char *vect, integer *m, integer *n, integer *k, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex * work, integer *lwork, integer *info) { @@ -686,7 +686,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer lwkopt; logical lquery; - extern /* Subroutine */ int zunglq_(integer *, integer *, integer *, + extern /* Subroutine */ void zunglq_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), zungqr_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, @@ -767,17 +767,17 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNGBR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { work[1].r = 1., work[1].i = 0.; - 
return 0; + return; } if (wantq) { @@ -884,7 +884,7 @@ f"> */ } } work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZUNGBR */ diff --git a/lapack-netlib/SRC/zungbr.f b/lapack-netlib/SRC/zungbr.f index 3dfca43be2..c42a372c5b 100644 --- a/lapack-netlib/SRC/zungbr.f +++ b/lapack-netlib/SRC/zungbr.f @@ -233,7 +233,7 @@ SUBROUTINE ZUNGBR( VECT, M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF END IF END IF - LWKOPT = DBLE( WORK( 1 ) ) + LWKOPT = INT( DBLE( WORK( 1 ) ) ) LWKOPT = MAX (LWKOPT, MN) END IF * diff --git a/lapack-netlib/SRC/zunghr.c b/lapack-netlib/SRC/zunghr.c index 817acaa715..b0b60567de 100644 --- a/lapack-netlib/SRC/zunghr.c +++ b/lapack-netlib/SRC/zunghr.c @@ -640,7 +640,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zunghr_(integer *n, integer *ilo, integer *ihi, +/* Subroutine */ void zunghr_(integer *n, integer *ilo, integer *ihi, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex * work, integer *lwork, integer *info) { @@ -654,7 +654,7 @@ f"> */ integer *, integer *, ftnlen, ftnlen); integer lwkopt; logical lquery; - extern /* Subroutine */ int zungqr_(integer *, integer *, integer *, + extern /* Subroutine */ void zungqr_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *); @@ -703,16 +703,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNGHR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { work[1].r = 1., work[1].i = 0.; - return 0; + return; } /* Shift the vectors which define the elementary reflectors one */ @@ -775,7 +775,7 @@ f"> */ ilo], &work[1], lwork, &iinfo); } work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZUNGHR */ diff --git a/lapack-netlib/SRC/zungl2.c b/lapack-netlib/SRC/zungl2.c index 
d870af12cd..ea0fe66fc1 100644 --- a/lapack-netlib/SRC/zungl2.c +++ b/lapack-netlib/SRC/zungl2.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zungl2_(integer *m, integer *n, integer *k, +/* Subroutine */ void zungl2_(integer *m, integer *n, integer *k, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex * work, integer *info) { @@ -633,10 +633,12 @@ f"> */ /* Local variables */ integer i__, j, l; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *, doublecomplex *), xerbla_(char *, integer *, ftnlen), zlacgv_(integer *, doublecomplex *, integer *); + integer *, doublecomplex *); + extern int xerbla_(char *, integer *, ftnlen); + extern void zlacgv_(integer *, doublecomplex *, integer *); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -671,13 +673,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNGL2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m <= 0) { - return 0; + return; } if (*k < *m) { @@ -738,7 +740,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of ZUNGL2 */ diff --git a/lapack-netlib/SRC/zunglq.c b/lapack-netlib/SRC/zunglq.c index 67e5f46493..2075177a72 100644 --- a/lapack-netlib/SRC/zunglq.c +++ b/lapack-netlib/SRC/zunglq.c @@ -643,7 +643,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zunglq_(integer *m, integer *n, integer *k, +/* Subroutine */ void zunglq_(integer *m, integer *n, integer *k, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex * work, integer *lwork, integer *info) { @@ -652,19 +652,19 @@ f"> */ /* 
Local variables */ integer i__, j, l, nbmin, iinfo, ib, nb; - extern /* Subroutine */ int zungl2_(integer *, integer *, integer *, + extern /* Subroutine */ void zungl2_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer ki, kk, nx; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void zlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer ldwork; - extern /* Subroutine */ int zlarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zlarft_(char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical lquery; @@ -709,16 +709,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNGLQ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m <= 0) { work[1].r = 1., work[1].i = 0.; - return 0; + return; } nbmin = 2; @@ -839,7 +839,7 @@ f"> */ } work[1].r = (doublereal) iws, work[1].i = 0.; - return 0; + return; /* End of ZUNGLQ */ diff --git a/lapack-netlib/SRC/zungql.c b/lapack-netlib/SRC/zungql.c index 39d990d4e5..0d5fa891f9 100644 --- a/lapack-netlib/SRC/zungql.c +++ b/lapack-netlib/SRC/zungql.c @@ -644,7 +644,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zungql_(integer *m, integer *n, integer *k, +/* Subroutine */ void zungql_(integer *m, integer *n, integer *k, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex * work, integer *lwork, integer *info) { @@ -653,19 +653,19 @@ 
f"> */ /* Local variables */ integer i__, j, l, nbmin, iinfo, ib, nb; - extern /* Subroutine */ int zung2l_(integer *, integer *, integer *, + extern /* Subroutine */ void zung2l_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer kk, nx; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void zlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer ldwork; - extern /* Subroutine */ int zlarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zlarft_(char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); logical lquery; @@ -721,15 +721,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNGQL", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { - return 0; + return; } nbmin = 2; @@ -849,7 +849,7 @@ f"> */ } work[1].r = (doublereal) iws, work[1].i = 0.; - return 0; + return; /* End of ZUNGQL */ diff --git a/lapack-netlib/SRC/zungqr.c b/lapack-netlib/SRC/zungqr.c index 35f8d06c56..74c6b3cf69 100644 --- a/lapack-netlib/SRC/zungqr.c +++ b/lapack-netlib/SRC/zungqr.c @@ -644,7 +644,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zungqr_(integer *m, integer *n, integer *k, +/* Subroutine */ void zungqr_(integer *m, integer *n, integer *k, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex * work, integer *lwork, integer *info) { @@ -653,19 +653,19 @@ f"> */ /* Local variables */ 
integer i__, j, l, nbmin, iinfo, ib, nb, ki, kk; - extern /* Subroutine */ int zung2r_(integer *, integer *, integer *, + extern /* Subroutine */ void zung2r_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer nx; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void zlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer ldwork; - extern /* Subroutine */ int zlarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zlarft_(char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer lwkopt; @@ -711,16 +711,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNGQR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n <= 0) { work[1].r = 1., work[1].i = 0.; - return 0; + return; } nbmin = 2; @@ -841,7 +841,7 @@ f"> */ } work[1].r = (doublereal) iws, work[1].i = 0.; - return 0; + return; /* End of ZUNGQR */ diff --git a/lapack-netlib/SRC/zungr2.c b/lapack-netlib/SRC/zungr2.c index 980b674463..e527607f91 100644 --- a/lapack-netlib/SRC/zungr2.c +++ b/lapack-netlib/SRC/zungr2.c @@ -624,7 +624,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zungr2_(integer *m, integer *n, integer *k, +/* Subroutine */ void zungr2_(integer *m, integer *n, integer *k, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex * work, integer *info) { @@ -634,12 +634,13 @@ f"> */ /* Local variables */ integer 
i__, j, l; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); integer ii; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlacgv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlacgv_( integer *, doublecomplex *, integer *); @@ -675,13 +676,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNGR2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m <= 0) { - return 0; + return; } if (*k < *m) { @@ -740,7 +741,7 @@ f"> */ } /* L40: */ } - return 0; + return; /* End of ZUNGR2 */ diff --git a/lapack-netlib/SRC/zungrq.c b/lapack-netlib/SRC/zungrq.c index 66d296605a..03f8b67a9b 100644 --- a/lapack-netlib/SRC/zungrq.c +++ b/lapack-netlib/SRC/zungrq.c @@ -644,7 +644,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zungrq_(integer *m, integer *n, integer *k, +/* Subroutine */ void zungrq_(integer *m, integer *n, integer *k, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex * work, integer *lwork, integer *info) { @@ -653,19 +653,19 @@ f"> */ /* Local variables */ integer i__, j, l, nbmin, iinfo, ib, nb, ii, kk; - extern /* Subroutine */ int zungr2_(integer *, integer *, integer *, + extern /* Subroutine */ void zungr2_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer nx; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void zlarfb_(char *, char *, 
char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer ldwork; - extern /* Subroutine */ int zlarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zlarft_(char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer lwkopt; @@ -722,15 +722,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNGRQ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m <= 0) { - return 0; + return; } nbmin = 2; @@ -850,7 +850,7 @@ f"> */ } work[1].r = (doublereal) iws, work[1].i = 0.; - return 0; + return; /* End of ZUNGRQ */ diff --git a/lapack-netlib/SRC/zungtr.c b/lapack-netlib/SRC/zungtr.c index b9ba88865e..d42c84c75c 100644 --- a/lapack-netlib/SRC/zungtr.c +++ b/lapack-netlib/SRC/zungtr.c @@ -637,7 +637,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zungtr_(char *uplo, integer *n, doublecomplex *a, +/* Subroutine */ void zungtr_(char *uplo, integer *n, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *work, integer *lwork, integer *info) { @@ -655,7 +655,7 @@ f"> */ integer *, integer *, ftnlen, ftnlen); integer lwkopt; logical lquery; - extern /* Subroutine */ int zungql_(integer *, integer *, integer *, + extern /* Subroutine */ void zungql_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, integer *), zungqr_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, @@ -721,16 +721,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNGTR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*n == 0) { work[1].r = 1., 
work[1].i = 0.; - return 0; + return; } if (upper) { @@ -811,7 +811,7 @@ f"> */ } } work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZUNGTR */ diff --git a/lapack-netlib/SRC/zungtsqr.c b/lapack-netlib/SRC/zungtsqr.c index 5e8c70b37e..5b49a37ceb 100644 --- a/lapack-netlib/SRC/zungtsqr.c +++ b/lapack-netlib/SRC/zungtsqr.c @@ -688,7 +688,7 @@ r.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zungtsqr_(integer *m, integer *n, integer *mb, integer * +/* Subroutine */ void zungtsqr_(integer *m, integer *n, integer *mb, integer * nb, doublecomplex *a, integer *lda, doublecomplex *t, integer *ldt, doublecomplex *work, integer *lwork, integer *info) { @@ -697,15 +697,16 @@ r.f"> */ doublecomplex z__1; /* Local variables */ - extern /* Subroutine */ int zlamtsqr_(char *, char *, integer *, integer * + extern /* Subroutine */ void zlamtsqr_(char *, char *, integer *, integer * , integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); integer lworkopt, j, iinfo; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer lc, lw; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlaset_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlaset_( char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); logical lquery; @@ -787,11 +788,11 @@ r.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNGTSQR", &i__1, (ftnlen)8); - return 0; + return; } else if (lquery) { z__1.r = (doublereal) lworkopt, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; } /* Quick return if possible */ @@ -799,7 +800,7 @@ r.f"> */ if (f2cmin(*m,*n) == 0) { z__1.r = 
(doublereal) lworkopt, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; } /* (1) Form explicitly the tall-skinny M-by-N left submatrix Q1_in */ @@ -834,7 +835,7 @@ r.f"> */ z__1.r = (doublereal) lworkopt, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; /* End of ZUNGTSQR */ diff --git a/lapack-netlib/SRC/zungtsqr_row.c b/lapack-netlib/SRC/zungtsqr_row.c index dbbcdc9338..d23eac0ccf 100644 --- a/lapack-netlib/SRC/zungtsqr_row.c +++ b/lapack-netlib/SRC/zungtsqr_row.c @@ -701,7 +701,7 @@ qr_row.f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zungtsqr_row_(integer *m, integer *n, integer *mb, +/* Subroutine */ void zungtsqr_row_(integer *m, integer *n, integer *mb, integer *nb, doublecomplex *a, integer *lda, doublecomplex *t, integer *ldt, doublecomplex *work, integer *lwork, integer *info) { @@ -715,12 +715,12 @@ qr_row.f"> */ integer ib_bottom__, ib, kb; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer mb1, mb2; - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); integer m_plus_one__; logical lquery; integer num_all_row_blocks__, imb, knb, nblocal, kb_last__; - extern /* Subroutine */ int zlarfb_gett_(char *, integer *, integer *, + extern /* Subroutine */ void zlarfb_gett_(char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); @@ -782,11 +782,11 @@ qr_row.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNGTSQR_ROW", &i__1, (ftnlen)12); - return 0; + return; } else if (lquery) { z__1.r = (doublereal) lworkopt, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; } /* Quick return if possible */ @@ -794,7 +794,7 @@ qr_row.f"> */ if 
(f2cmin(*m,*n) == 0) { z__1.r = (doublereal) lworkopt, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; } /* (0) Set the upper-triangular part of the matrix A to zero and */ @@ -917,7 +917,7 @@ qr_row.f"> */ z__1.r = (doublereal) lworkopt, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; /* End of ZUNGTSQR_ROW */ diff --git a/lapack-netlib/SRC/zunhr_col.c b/lapack-netlib/SRC/zunhr_col.c index fc0cdb3f63..935a0c7693 100644 --- a/lapack-netlib/SRC/zunhr_col.c +++ b/lapack-netlib/SRC/zunhr_col.c @@ -772,7 +772,7 @@ ol.f"> */ /* > \endverbatim */ /* ===================================================================== */ -/* Subroutine */ int zunhr_col_(integer *m, integer *n, integer *nb, +/* Subroutine */ void zunhr_col_(integer *m, integer *n, integer *nb, doublecomplex *a, integer *lda, doublecomplex *t, integer *ldt, doublecomplex *d__, integer *info) { @@ -781,10 +781,10 @@ ol.f"> */ doublecomplex z__1; /* Local variables */ - extern /* Subroutine */ int zlaunhr_col_getrfnp_(integer *, integer *, + extern /* Subroutine */ void zlaunhr_col_getrfnp_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer nplusone, i__, j, iinfo; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zcopy_(integer *, doublecomplex *, integer *, doublecomplex *, integer *), ztrsm_(char *, char *, char *, char *, integer *, integer *, doublecomplex *, @@ -837,13 +837,13 @@ ol.f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNHR_COL", &i__1, (ftnlen)9); - return 0; + return; } /* Quick return if possible */ if (f2cmin(*m,*n) == 0) { - return 0; + return; } /* On input, the M-by-N matrix A contains the unitary */ @@ -975,7 +975,7 @@ ol.f"> */ } - return 0; + return; /* End of ZUNHR_COL */ diff --git a/lapack-netlib/SRC/zunm22.c b/lapack-netlib/SRC/zunm22.c index 8fd3646e2e..1d07b56329 100644 --- 
a/lapack-netlib/SRC/zunm22.c +++ b/lapack-netlib/SRC/zunm22.c @@ -674,7 +674,7 @@ f"> */ /* > \ingroup complexOTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zunm22_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void zunm22_(char *side, char *trans, integer *m, integer *n, integer *n1, integer *n2, doublecomplex *q, integer *ldq, doublecomplex *c__, integer *ldc, doublecomplex *work, integer *lwork, integer *info) @@ -687,7 +687,7 @@ f"> */ logical left; integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), ztrmm_(char *, char *, char *, char *, @@ -697,7 +697,7 @@ f"> */ extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical notran; integer ldwork; - extern /* Subroutine */ int zlacpy_(char *, integer *, integer *, + extern /* Subroutine */ void zlacpy_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer lwkopt; logical lquery; @@ -774,16 +774,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNM22", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { work[1].r = 1., work[1].i = 0.; - return 0; + return; } /* Degenerate cases (N1 = 0 or N2 = 0) are handled using ZTRMM. 
*/ @@ -792,12 +792,12 @@ f"> */ ztrmm_(side, "Upper", trans, "Non-Unit", m, n, &c_b1, &q[q_offset], ldq, &c__[c_offset], ldc); work[1].r = 1., work[1].i = 0.; - return 0; + return; } else if (*n2 == 0) { ztrmm_(side, "Lower", trans, "Non-Unit", m, n, &c_b1, &q[q_offset], ldq, &c__[c_offset], ldc); work[1].r = 1., work[1].i = 0.; - return 0; + return; } /* Compute the largest chunk size available from the workspace. */ @@ -982,7 +982,7 @@ f"> */ z__1.r = (doublereal) lwkopt, z__1.i = 0.; work[1].r = z__1.r, work[1].i = z__1.i; - return 0; + return; /* End of ZUNM22 */ diff --git a/lapack-netlib/SRC/zunm2l.c b/lapack-netlib/SRC/zunm2l.c index 959cc0b5d9..9748d4d213 100644 --- a/lapack-netlib/SRC/zunm2l.c +++ b/lapack-netlib/SRC/zunm2l.c @@ -672,7 +672,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zunm2l_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void zunm2l_(char *side, char *trans, integer *m, integer *n, integer *k, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *c__, integer *ldc, doublecomplex *work, integer *info) { @@ -685,7 +685,7 @@ f"> */ doublecomplex taui; integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zlarf_(char *, integer *, integer *, + extern /* Subroutine */ void zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); integer i1, i2, i3, mi, ni, nq; @@ -745,13 +745,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNM2L", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && notran || ! left && ! 
notran) { @@ -804,7 +804,7 @@ f"> */ a[i__3].r = aii.r, a[i__3].i = aii.i; /* L10: */ } - return 0; + return; /* End of ZUNM2L */ diff --git a/lapack-netlib/SRC/zunm2r.c b/lapack-netlib/SRC/zunm2r.c index 1a20bed0be..9e4e90ecb6 100644 --- a/lapack-netlib/SRC/zunm2r.c +++ b/lapack-netlib/SRC/zunm2r.c @@ -672,7 +672,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zunm2r_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void zunm2r_(char *side, char *trans, integer *m, integer *n, integer *k, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *c__, integer *ldc, doublecomplex *work, integer *info) { @@ -685,7 +685,7 @@ f"> */ doublecomplex taui; integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zlarf_(char *, integer *, integer *, + extern /* Subroutine */ void zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); integer i1, i2, i3, ic, jc, mi, ni, nq; @@ -745,13 +745,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNM2R", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && ! notran || ! 
left && notran) { @@ -808,7 +808,7 @@ f"> */ a[i__3].r = aii.r, a[i__3].i = aii.i; /* L10: */ } - return 0; + return; /* End of ZUNM2R */ diff --git a/lapack-netlib/SRC/zunmbr.c b/lapack-netlib/SRC/zunmbr.c index 283af09921..5fb8cbe051 100644 --- a/lapack-netlib/SRC/zunmbr.c +++ b/lapack-netlib/SRC/zunmbr.c @@ -710,7 +710,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zunmbr_(char *vect, char *side, char *trans, integer *m, +/* Subroutine */ void zunmbr_(char *vect, char *side, char *trans, integer *m, integer *n, integer *k, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *c__, integer *ldc, doublecomplex *work, integer * lwork, integer *info) @@ -731,7 +731,7 @@ f"> */ char transt[1]; integer lwkopt; logical lquery; - extern /* Subroutine */ int zunmlq_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunmlq_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, @@ -857,15 +857,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNMBR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } if (applyq) { @@ -934,7 +934,7 @@ f"> */ } } work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZUNMBR */ diff --git a/lapack-netlib/SRC/zunmhr.c b/lapack-netlib/SRC/zunmhr.c index 9680b23750..76e8aad0c9 100644 --- a/lapack-netlib/SRC/zunmhr.c +++ b/lapack-netlib/SRC/zunmhr.c @@ -692,7 +692,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zunmhr_(char *side, char *trans, integer 
*m, integer *n, +/* Subroutine */ void zunmhr_(char *side, char *trans, integer *m, integer *n, integer *ilo, integer *ihi, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *c__, integer *ldc, doublecomplex * work, integer *lwork, integer *info) @@ -711,7 +711,7 @@ f"> */ integer *, integer *, ftnlen, ftnlen); integer lwkopt; logical lquery; - extern /* Subroutine */ int zunmqr_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); @@ -796,16 +796,16 @@ f"> */ if (*info != 0) { i__2 = -(*info); xerbla_("ZUNMHR", &i__2, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || nh == 0) { work[1].r = 1., work[1].i = 0.; - return 0; + return; } if (left) { @@ -824,7 +824,7 @@ f"> */ tau[*ilo], &c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &iinfo); work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZUNMHR */ diff --git a/lapack-netlib/SRC/zunml2.c b/lapack-netlib/SRC/zunml2.c index a33d19b089..e4d071d292 100644 --- a/lapack-netlib/SRC/zunml2.c +++ b/lapack-netlib/SRC/zunml2.c @@ -668,7 +668,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zunml2_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void zunml2_(char *side, char *trans, integer *m, integer *n, integer *k, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *c__, integer *ldc, doublecomplex *work, integer *info) { @@ -681,11 +681,12 @@ f"> */ doublecomplex taui; integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zlarf_(char *, integer *, integer *, + extern /* Subroutine */ void zlarf_(char *, integer *, integer *, doublecomplex *, 
integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); integer i1, i2, i3, ic, jc, mi, ni, nq; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlacgv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlacgv_( integer *, doublecomplex *, integer *); logical notran; doublecomplex aii; @@ -742,13 +743,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNML2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && notran || ! left && ! notran) { @@ -813,7 +814,7 @@ f"> */ } /* L10: */ } - return 0; + return; /* End of ZUNML2 */ diff --git a/lapack-netlib/SRC/zunmlq.c b/lapack-netlib/SRC/zunmlq.c index fa902c43eb..b9e8e02626 100644 --- a/lapack-netlib/SRC/zunmlq.c +++ b/lapack-netlib/SRC/zunmlq.c @@ -682,7 +682,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zunmlq_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void zunmlq_(char *side, char *trans, integer *m, integer *n, integer *k, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *c__, integer *ldc, doublecomplex *work, integer *lwork, integer *info) @@ -698,20 +698,20 @@ f"> */ integer i__; extern logical lsame_(char *, char *); integer nbmin, iinfo, i1, i2, i3, ib, ic, jc, nb, mi, ni; - extern /* Subroutine */ int zunml2_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunml2_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); integer nq, nw; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zlarfb_(char *, char *, char *, char *, + 
extern /* Subroutine */ void zlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); logical notran; integer ldwork; - extern /* Subroutine */ int zlarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zlarft_(char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); char transt[1]; @@ -793,16 +793,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNMLQ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { work[1].r = 1., work[1].i = 0.; - return 0; + return; } nbmin = 2; @@ -892,7 +892,7 @@ f"> */ } } work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZUNMLQ */ diff --git a/lapack-netlib/SRC/zunmql.c b/lapack-netlib/SRC/zunmql.c index 35b4b80e11..da7c16c612 100644 --- a/lapack-netlib/SRC/zunmql.c +++ b/lapack-netlib/SRC/zunmql.c @@ -682,7 +682,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zunmql_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void zunmql_(char *side, char *trans, integer *m, integer *n, integer *k, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *c__, integer *ldc, doublecomplex *work, integer *lwork, integer *info) @@ -698,20 +698,20 @@ f"> */ integer i__; extern logical lsame_(char *, char *); integer nbmin, iinfo, i1, i2, i3, ib, nb, mi, ni; - extern /* Subroutine */ int zunm2l_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunm2l_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); integer nq, nw; extern /* Subroutine */ int xerbla_(char *, 
integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void zlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); logical notran; integer ldwork; - extern /* Subroutine */ int zlarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zlarft_(char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer lwkopt; @@ -796,15 +796,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNMQL", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } nbmin = 2; @@ -884,7 +884,7 @@ f"> */ } } work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZUNMQL */ diff --git a/lapack-netlib/SRC/zunmqr.c b/lapack-netlib/SRC/zunmqr.c index f7e866bff0..b3470b9c1d 100644 --- a/lapack-netlib/SRC/zunmqr.c +++ b/lapack-netlib/SRC/zunmqr.c @@ -682,7 +682,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zunmqr_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void zunmqr_(char *side, char *trans, integer *m, integer *n, integer *k, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *c__, integer *ldc, doublecomplex *work, integer *lwork, integer *info) @@ -698,20 +698,20 @@ f"> */ integer i__; extern logical lsame_(char *, char *); integer nbmin, iinfo, i1, i2, i3, ib, ic, jc, nb, mi, ni; - extern /* Subroutine */ int zunm2r_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunm2r_(char *, char *, integer *, integer *, 
integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); integer nq, nw; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void zlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); logical notran; integer ldwork; - extern /* Subroutine */ int zlarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zlarft_(char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer lwkopt; @@ -792,16 +792,16 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNMQR", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { work[1].r = 1., work[1].i = 0.; - return 0; + return; } nbmin = 2; @@ -885,7 +885,7 @@ f"> */ } } work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZUNMQR */ diff --git a/lapack-netlib/SRC/zunmr2.c b/lapack-netlib/SRC/zunmr2.c index d2cadca74c..cf0cd013d4 100644 --- a/lapack-netlib/SRC/zunmr2.c +++ b/lapack-netlib/SRC/zunmr2.c @@ -668,7 +668,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zunmr2_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void zunmr2_(char *side, char *trans, integer *m, integer *n, integer *k, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *c__, integer *ldc, doublecomplex *work, integer *info) { @@ -681,11 +681,12 @@ f"> */ doublecomplex taui; integer i__; extern logical lsame_(char *, 
char *); - extern /* Subroutine */ int zlarf_(char *, integer *, integer *, + extern /* Subroutine */ void zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); integer i1, i2, i3, mi, ni, nq; - extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen), zlacgv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlacgv_( integer *, doublecomplex *, integer *); logical notran; doublecomplex aii; @@ -742,13 +743,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNMR2", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && ! notran || ! left && notran) { @@ -805,7 +806,7 @@ f"> */ zlacgv_(&i__3, &a[i__ + a_dim1], lda); /* L10: */ } - return 0; + return; /* End of ZUNMR2 */ diff --git a/lapack-netlib/SRC/zunmr3.c b/lapack-netlib/SRC/zunmr3.c index 804fde3f02..0ca68e2a52 100644 --- a/lapack-netlib/SRC/zunmr3.c +++ b/lapack-netlib/SRC/zunmr3.c @@ -687,7 +687,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zunmr3_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void zunmr3_(char *side, char *trans, integer *m, integer *n, integer *k, integer *l, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *c__, integer *ldc, doublecomplex *work, integer * info) @@ -702,7 +702,7 @@ f"> */ integer i__; extern logical lsame_(char *, char *); integer i1, i2, i3; - extern /* Subroutine */ int zlarz_(char *, integer *, integer *, integer * + extern /* Subroutine */ void zlarz_(char *, integer *, integer *, integer * , doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); integer ja, ic, jc, mi, ni, nq; @@ -763,13 +763,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNMR3", &i__1, (ftnlen)6); - return 0; + return; 
} /* Quick return if possible */ if (*m == 0 || *n == 0 || *k == 0) { - return 0; + return; } if (left && ! notran || ! left && notran) { @@ -824,7 +824,7 @@ f"> */ /* L10: */ } - return 0; + return; /* End of ZUNMR3 */ diff --git a/lapack-netlib/SRC/zunmrq.c b/lapack-netlib/SRC/zunmrq.c index 028688862f..30ccf40df2 100644 --- a/lapack-netlib/SRC/zunmrq.c +++ b/lapack-netlib/SRC/zunmrq.c @@ -682,7 +682,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zunmrq_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void zunmrq_(char *side, char *trans, integer *m, integer *n, integer *k, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *c__, integer *ldc, doublecomplex *work, integer *lwork, integer *info) @@ -698,20 +698,20 @@ f"> */ integer i__; extern logical lsame_(char *, char *); integer nbmin, iinfo, i1, i2, i3, ib, nb, mi, ni; - extern /* Subroutine */ int zunmr2_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunmr2_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); integer nq, nw; extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern integer ilaenv_(integer *, char *, char *, integer *, integer *, integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int zlarfb_(char *, char *, char *, char *, + extern /* Subroutine */ void zlarfb_(char *, char *, char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); logical notran; integer ldwork; - extern /* Subroutine */ int zlarft_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zlarft_(char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); char transt[1]; 
@@ -797,15 +797,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNMRQ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } nbmin = 2; @@ -891,7 +891,7 @@ f"> */ } } work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZUNMRQ */ diff --git a/lapack-netlib/SRC/zunmrz.c b/lapack-netlib/SRC/zunmrz.c index 301307fd48..728db22d0e 100644 --- a/lapack-netlib/SRC/zunmrz.c +++ b/lapack-netlib/SRC/zunmrz.c @@ -702,7 +702,7 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zunmrz_(char *side, char *trans, integer *m, integer *n, +/* Subroutine */ void zunmrz_(char *side, char *trans, integer *m, integer *n, integer *k, integer *l, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *c__, integer *ldc, doublecomplex *work, integer * lwork, integer *info) @@ -718,7 +718,7 @@ f"> */ integer i__; extern logical lsame_(char *, char *); integer nbmin, iinfo, i1, i2, i3, ib, ic, ja, jc, nb, mi, ni; - extern /* Subroutine */ int zunmr3_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunmr3_(char *, char *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *); integer nq, nw; @@ -727,14 +727,14 @@ f"> */ integer *, integer *, ftnlen, ftnlen); logical notran; integer ldwork; - extern /* Subroutine */ int zlarzb_(char *, char *, char *, char *, + extern /* Subroutine */ void zlarzb_(char *, char *, char *, char *, integer *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); char transt[1]; integer lwkopt; logical lquery; - extern /* Subroutine */ int zlarzt_(char *, char *, integer *, integer *, + extern /* Subroutine */ void 
zlarzt_(char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer iwt; @@ -819,15 +819,15 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUNMRZ", &i__1, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Determine the block size. NB may be at most NBMAX, where NBMAX */ @@ -933,7 +933,7 @@ f"> */ work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZUNMRZ */ diff --git a/lapack-netlib/SRC/zunmtr.c b/lapack-netlib/SRC/zunmtr.c index 74d5182f9c..55b93d3348 100644 --- a/lapack-netlib/SRC/zunmtr.c +++ b/lapack-netlib/SRC/zunmtr.c @@ -685,7 +685,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zunmtr_(char *side, char *uplo, char *trans, integer *m, +/* Subroutine */ void zunmtr_(char *side, char *uplo, char *trans, integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *c__, integer *ldc, doublecomplex *work, integer *lwork, integer *info) @@ -706,7 +706,7 @@ f"> */ integer *, integer *, ftnlen, ftnlen); integer lwkopt; logical lquery; - extern /* Subroutine */ int zunmql_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zunmql_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, @@ -817,16 +817,16 @@ f"> */ if (*info != 0) { i__2 = -(*info); xerbla_("ZUNMTR", &i__2, (ftnlen)6); - return 0; + return; } else if (lquery) { - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0 || nq == 1) { work[1].r = 1., work[1].i = 0.; - return 0; + return; } if (left) { @@ -860,7 +860,7 @@ 
f"> */ c__[i1 + i2 * c_dim1], ldc, &work[1], lwork, &iinfo); } work[1].r = (doublereal) lwkopt, work[1].i = 0.; - return 0; + return; /* End of ZUNMTR */ diff --git a/lapack-netlib/SRC/zupgtr.c b/lapack-netlib/SRC/zupgtr.c index 3daebc3cc1..671de3e58c 100644 --- a/lapack-netlib/SRC/zupgtr.c +++ b/lapack-netlib/SRC/zupgtr.c @@ -623,7 +623,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* ===================================================================== */ -/* Subroutine */ int zupgtr_(char *uplo, integer *n, doublecomplex *ap, +/* Subroutine */ void zupgtr_(char *uplo, integer *n, doublecomplex *ap, doublecomplex *tau, doublecomplex *q, integer *ldq, doublecomplex * work, integer *info) { @@ -635,13 +635,14 @@ f"> */ extern logical lsame_(char *, char *); integer iinfo; logical upper; - extern /* Subroutine */ int zung2l_(integer *, integer *, integer *, + extern /* Subroutine */ void zung2l_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer ij; - extern /* Subroutine */ int zung2r_(integer *, integer *, integer *, + extern /* Subroutine */ void zung2r_(integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, - integer *), xerbla_(char *, integer *, ftnlen); + integer *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK computational routine (version 3.7.0) -- */ @@ -676,13 +677,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUPGTR", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*n == 0) { - return 0; + return; } if (upper) { @@ -769,7 +770,7 @@ f"> */ &work[1], &iinfo); } } - return 0; + return; /* End of ZUPGTR */ diff --git a/lapack-netlib/SRC/zupmtr.c b/lapack-netlib/SRC/zupmtr.c index 47cebe548e..2a67a791b7 100644 --- a/lapack-netlib/SRC/zupmtr.c +++ b/lapack-netlib/SRC/zupmtr.c @@ -662,7 +662,7 @@ f"> */ /* > \ingroup complex16OTHERcomputational */ /* 
===================================================================== */ -/* Subroutine */ int zupmtr_(char *side, char *uplo, char *trans, integer *m, +/* Subroutine */ void zupmtr_(char *side, char *uplo, char *trans, integer *m, integer *n, doublecomplex *ap, doublecomplex *tau, doublecomplex *c__, integer *ldc, doublecomplex *work, integer *info) { @@ -675,7 +675,7 @@ f"> */ doublecomplex taui; integer i__; extern logical lsame_(char *, char *); - extern /* Subroutine */ int zlarf_(char *, integer *, integer *, + extern /* Subroutine */ void zlarf_(char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *); integer i1; @@ -734,13 +734,13 @@ f"> */ if (*info != 0) { i__1 = -(*info); xerbla_("ZUPMTR", &i__1, (ftnlen)6); - return 0; + return; } /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } if (upper) { @@ -876,7 +876,7 @@ f"> */ /* L20: */ } } - return 0; + return; /* End of ZUPMTR */ diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index 226004a90a..d252c7fa9e 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -40,7 +40,7 @@ set(SEIGTST schkee.F sget54.f sglmts.f sgqrts.f sgrqts.f sgsvts3.f shst01.f slarfy.f slarhs.f slatm4.f slctes.f slctsx.f slsets.f sort01.f sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f - sstt22.f ssyt21.f ssyt22.f) + sstt22.f ssyl01.f ssyt21.f ssyt22.f) set(CEIGTST cchkee.F cbdt01.f cbdt02.f cbdt03.f cbdt05.f @@ -56,7 +56,7 @@ set(CEIGTST cchkee.F cget54.f cglmts.f cgqrts.f cgrqts.f cgsvts3.f chbt21.f chet21.f chet22.f chpt21.f chst01.f clarfy.f clarhs.f clatm4.f clctes.f clctsx.f clsets.f csbmv.f - csgt01.f cslect.f + csgt01.f cslect.f csyl01.f cstt21.f cstt22.f cunt01.f cunt03.f) set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f @@ -77,7 +77,7 @@ set(DEIGTST dchkee.F dget54.f dglmts.f dgqrts.f dgrqts.f dgsvts3.f dhst01.f 
dlarfy.f dlarhs.f dlatm4.f dlctes.f dlctsx.f dlsets.f dort01.f dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f - dstt22.f dsyt21.f dsyt22.f) + dstt22.f dsyl01.f dsyt21.f dsyt22.f) set(ZEIGTST zchkee.F zbdt01.f zbdt02.f zbdt03.f zbdt05.f @@ -93,7 +93,7 @@ set(ZEIGTST zchkee.F zget54.f zglmts.f zgqrts.f zgrqts.f zgsvts3.f zhbt21.f zhet21.f zhet22.f zhpt21.f zhst01.f zlarfy.f zlarhs.f zlatm4.f zlctes.f zlctsx.f zlsets.f zsbmv.f - zsgt01.f zslect.f + zsgt01.f zslect.f zsyl01.f zstt21.f zstt22.f zunt01.f zunt03.f) macro(add_eig_executable name) diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index bccfccf95c..942ae6982c 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -62,7 +62,7 @@ SEIGTST = schkee.o \ sget54.o sglmts.o sgqrts.o sgrqts.o sgsvts3.o \ shst01.o slarfy.o slarhs.o slatm4.o slctes.o slctsx.o slsets.o sort01.o \ sort03.o ssbt21.o ssgt01.o sslect.o sspt21.o sstt21.o \ - sstt22.o ssyt21.o ssyt22.o + sstt22.o ssyl01.o ssyt21.o ssyt22.o CEIGTST = cchkee.o \ cbdt01.o cbdt02.o cbdt03.o cbdt05.o \ @@ -78,7 +78,7 @@ CEIGTST = cchkee.o \ cget54.o cglmts.o cgqrts.o cgrqts.o cgsvts3.o \ chbt21.o chet21.o chet22.o chpt21.o chst01.o \ clarfy.o clarhs.o clatm4.o clctes.o clctsx.o clsets.o csbmv.o \ - csgt01.o cslect.o \ + csgt01.o cslect.o csyl01.o\ cstt21.o cstt22.o cunt01.o cunt03.o DZIGTST = dlafts.o dlahd2.o dlasum.o dlatb9.o dstech.o dstect.o \ @@ -99,7 +99,7 @@ DEIGTST = dchkee.o \ dget54.o dglmts.o dgqrts.o dgrqts.o dgsvts3.o \ dhst01.o dlarfy.o dlarhs.o dlatm4.o dlctes.o dlctsx.o dlsets.o dort01.o \ dort03.o dsbt21.o dsgt01.o dslect.o dspt21.o dstt21.o \ - dstt22.o dsyt21.o dsyt22.o + dstt22.o dsyl01.o dsyt21.o dsyt22.o ZEIGTST = zchkee.o \ zbdt01.o zbdt02.o zbdt03.o zbdt05.o \ @@ -115,7 +115,7 @@ ZEIGTST = zchkee.o \ zget54.o zglmts.o zgqrts.o zgrqts.o zgsvts3.o \ zhbt21.o zhet21.o zhet22.o zhpt21.o zhst01.o \ zlarfy.o zlarhs.o zlatm4.o zlctes.o zlctsx.o zlsets.o zsbmv.o \ - 
zsgt01.o zslect.o \ + zsgt01.o zslect.o zsyl01.o\ zstt21.o zstt22.o zunt01.o zunt03.o .PHONY: all diff --git a/lapack-netlib/TESTING/EIG/cchkec.f b/lapack-netlib/TESTING/EIG/cchkec.f index 6727a0954b..c892b0a54a 100644 --- a/lapack-netlib/TESTING/EIG/cchkec.f +++ b/lapack-netlib/TESTING/EIG/cchkec.f @@ -23,7 +23,7 @@ *> \verbatim *> *> CCHKEC tests eigen- condition estimation routines -*> CTRSYL, CTREXC, CTRSNA, CTRSEN +*> CTRSYL, CTRSYL3, CTREXC, CTRSNA, CTRSEN *> *> In all cases, the routine runs through a fixed set of numerical *> examples, subjects them to various tests, and compares the test @@ -88,17 +88,17 @@ SUBROUTINE CCHKEC( THRESH, TSTERR, NIN, NOUT ) * .. Local Scalars .. LOGICAL OK CHARACTER*3 PATH - INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, LTREXC, LTRSYL, - $ NTESTS, NTREXC, NTRSYL - REAL EPS, RTREXC, RTRSYL, SFMIN + INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, KTRSYL3, + $ LTREXC, LTRSYL, NTESTS, NTREXC, NTRSYL + REAL EPS, RTREXC, SFMIN * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NTRSEN( 3 ), - $ NTRSNA( 3 ) - REAL RTRSEN( 3 ), RTRSNA( 3 ) + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NTRSEN( 3 ), NTRSNA( 3 ) + REAL RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. - EXTERNAL CERREC, CGET35, CGET36, CGET37, CGET38 + EXTERNAL CERREC, CGET35, CGET36, CGET37, CGET38, CSYL01 * .. * .. External Functions .. REAL SLAMCH @@ -120,10 +120,24 @@ SUBROUTINE CCHKEC( THRESH, TSTERR, NIN, NOUT ) $ CALL CERREC( PATH, NOUT ) * OK = .TRUE. - CALL CGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL, NIN ) - IF( RTRSYL.GT.THRESH ) THEN + CALL CGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL, NIN ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. - WRITE( NOUT, FMT = 9999 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9999 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL CSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. 
+ WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL CGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -169,6 +183,12 @@ SUBROUTINE CCHKEC( THRESH, TSTERR, NIN, NOUT ) $ / ' Safe minimum (SFMIN) = ', E16.6, / ) 9992 FORMAT( ' Routines pass computational tests if test ratio is ', $ 'less than', F8.2, / / ) + 9972 FORMAT( 'CTRSYL and CTRSYL3 compute an inconsistent scale ', + $ 'factor in ', I8, ' tests.') + 9971 FORMAT( 'Error in CTRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9970 FORMAT( 'Error in CTRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) RETURN * * End of CCHKEC diff --git a/lapack-netlib/TESTING/EIG/cchkhs.f b/lapack-netlib/TESTING/EIG/cchkhs.f index 65f1fc82d4..6c6430d5f8 100644 --- a/lapack-netlib/TESTING/EIG/cchkhs.f +++ b/lapack-netlib/TESTING/EIG/cchkhs.f @@ -21,7 +21,7 @@ * .. Array Arguments .. * LOGICAL DOTYPE( * ), SELECT( * ) * INTEGER ISEED( 4 ), IWORK( * ), NN( * ) -* REAL RESULT( 14 ), RWORK( * ) +* REAL RESULT( 16 ), RWORK( * ) * COMPLEX A( LDA, * ), EVECTL( LDU, * ), * $ EVECTR( LDU, * ), EVECTX( LDU, * ), * $ EVECTY( LDU, * ), H( LDA, * ), T1( LDA, * ), @@ -64,10 +64,15 @@ *> eigenvectors of H. Y is lower triangular, and X is *> upper triangular. *> +*> CTREVC3 computes left and right eigenvector matrices +*> from a Schur matrix T and backtransforms them with Z +*> to eigenvector matrices L and R for A. L and R are +*> GE matrices. +*> *> When CCHKHS is called, a number of matrix "sizes" ("n's") and a *> number of matrix "types" are specified. For each size ("n") *> and each type of matrix, one matrix will be generated and used -*> to test the nonsymmetric eigenroutines. 
For each matrix, 14 +*> to test the nonsymmetric eigenroutines. For each matrix, 16 *> tests will be performed: *> *> (1) | A - U H U**H | / ( |A| n ulp ) @@ -98,6 +103,10 @@ *> *> (14) | Y**H A - W**H Y | / ( |A| |Y| ulp ) *> +*> (15) | AR - RW | / ( |A| |R| ulp ) +*> +*> (16) | LA - WL | / ( |A| |L| ulp ) +*> *> The "sizes" are specified by an array NN(1:NSIZES); the value of *> each element NN(j) specifies one size. *> The "types" are specified by a logical array DOTYPE( 1:NTYPES ); @@ -331,7 +340,7 @@ *> Workspace. Could be equivalenced to IWORK, but not RWORK. *> Modified. *> -*> RESULT - REAL array, dimension (14) +*> RESULT - REAL array, dimension (16) *> The values computed by the fourteen tests described above. *> The values are currently limited to 1/ulp, to avoid *> overflow. @@ -421,7 +430,7 @@ SUBROUTINE CCHKHS( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, * .. Array Arguments .. LOGICAL DOTYPE( * ), SELECT( * ) INTEGER ISEED( 4 ), IWORK( * ), NN( * ) - REAL RESULT( 14 ), RWORK( * ) + REAL RESULT( 16 ), RWORK( * ) COMPLEX A( LDA, * ), EVECTL( LDU, * ), $ EVECTR( LDU, * ), EVECTX( LDU, * ), $ EVECTY( LDU, * ), H( LDA, * ), T1( LDA, * ), @@ -463,8 +472,8 @@ SUBROUTINE CCHKHS( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, * .. External Subroutines .. EXTERNAL CCOPY, CGEHRD, CGEMM, CGET10, CGET22, CHSEIN, $ CHSEQR, CHST01, CLACPY, CLASET, CLATME, CLATMR, - $ CLATMS, CTREVC, CUNGHR, CUNMHR, SLABAD, SLAFTS, - $ SLASUM, XERBLA + $ CLATMS, CTREVC, CTREVC3, CUNGHR, CUNMHR, + $ SLABAD, SLAFTS, SLASUM, XERBLA * .. * .. Intrinsic Functions .. 
INTRINSIC ABS, MAX, MIN, REAL, SQRT @@ -1067,6 +1076,66 @@ SUBROUTINE CCHKHS( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, $ RESULT( 14 ) = DUMMA( 3 )*ANINV END IF * +* Compute Left and Right Eigenvectors of A +* +* Compute a Right eigenvector matrix: +* + NTEST = 15 + RESULT( 15 ) = ULPINV +* + CALL CLACPY( ' ', N, N, UZ, LDU, EVECTR, LDU ) +* + CALL CTREVC3( 'Right', 'Back', SELECT, N, T1, LDA, CDUMMA, + $ LDU, EVECTR, LDU, N, IN, WORK, NWORK, RWORK, + $ N, IINFO ) + IF( IINFO.NE.0 ) THEN + WRITE( NOUNIT, FMT = 9999 )'CTREVC3(R,B)', IINFO, N, + $ JTYPE, IOLDSD + INFO = ABS( IINFO ) + GO TO 250 + END IF +* +* Test 15: | AR - RW | / ( |A| |R| ulp ) +* +* (from Schur decomposition) +* + CALL CGET22( 'N', 'N', 'N', N, A, LDA, EVECTR, LDU, W1, + $ WORK, RWORK, DUMMA( 1 ) ) + RESULT( 15 ) = DUMMA( 1 ) + IF( DUMMA( 2 ).GT.THRESH ) THEN + WRITE( NOUNIT, FMT = 9998 )'Right', 'CTREVC3', + $ DUMMA( 2 ), N, JTYPE, IOLDSD + END IF +* +* Compute a Left eigenvector matrix: +* + NTEST = 16 + RESULT( 16 ) = ULPINV +* + CALL CLACPY( ' ', N, N, UZ, LDU, EVECTL, LDU ) +* + CALL CTREVC3( 'Left', 'Back', SELECT, N, T1, LDA, EVECTL, + $ LDU, CDUMMA, LDU, N, IN, WORK, NWORK, RWORK, + $ N, IINFO ) + IF( IINFO.NE.0 ) THEN + WRITE( NOUNIT, FMT = 9999 )'CTREVC3(L,B)', IINFO, N, + $ JTYPE, IOLDSD + INFO = ABS( IINFO ) + GO TO 250 + END IF +* +* Test 16: | LA - WL | / ( |A| |L| ulp ) +* +* (from Schur decomposition) +* + CALL CGET22( 'Conj', 'N', 'Conj', N, A, LDA, EVECTL, LDU, + $ W1, WORK, RWORK, DUMMA( 3 ) ) + RESULT( 16 ) = DUMMA( 3 ) + IF( DUMMA( 4 ).GT.THRESH ) THEN + WRITE( NOUNIT, FMT = 9998 )'Left', 'CTREVC3', DUMMA( 4 ), + $ N, JTYPE, IOLDSD + END IF +* * End of Loop -- Check for RESULT(j) > THRESH * 240 CONTINUE diff --git a/lapack-netlib/TESTING/EIG/cdrvsg.f b/lapack-netlib/TESTING/EIG/cdrvsg.f index a93933a278..d15b39d01f 100644 --- a/lapack-netlib/TESTING/EIG/cdrvsg.f +++ b/lapack-netlib/TESTING/EIG/cdrvsg.f @@ -663,8 +663,8 @@ SUBROUTINE CDRVSG( NSIZES, NN, NTYPES, DOTYPE, ISEED, 
THRESH, IL = 1 IU = N ELSE - IL = 1 + ( N-1 )*SLARND( 1, ISEED2 ) - IU = 1 + ( N-1 )*SLARND( 1, ISEED2 ) + IL = 1 + INT( ( N-1 )*SLARND( 1, ISEED2 ) ) + IU = 1 + INT( ( N-1 )*SLARND( 1, ISEED2 ) ) IF( IL.GT.IU ) THEN ITEMP = IL IL = IU diff --git a/lapack-netlib/TESTING/EIG/cerrec.f b/lapack-netlib/TESTING/EIG/cerrec.f index 650ab2b6e6..6e2e1d38a3 100644 --- a/lapack-netlib/TESTING/EIG/cerrec.f +++ b/lapack-netlib/TESTING/EIG/cerrec.f @@ -23,7 +23,7 @@ *> *> CERREC tests the error exits for the routines for eigen- condition *> estimation for REAL matrices: -*> CTRSYL, CTREXC, CTRSNA and CTRSEN. +*> CTRSYL, CTRSYL3, CTREXC, CTRSNA and CTRSEN. *> \endverbatim * * Arguments: @@ -77,12 +77,12 @@ SUBROUTINE CERREC( PATH, NUNIT ) * .. * .. Local Arrays .. LOGICAL SEL( NMAX ) - REAL RW( LW ), S( NMAX ), SEP( NMAX ) + REAL RW( LW ), S( NMAX ), SEP( NMAX ), SWORK( NMAX ) COMPLEX A( NMAX, NMAX ), B( NMAX, NMAX ), $ C( NMAX, NMAX ), WORK( LW ), X( NMAX ) * .. * .. External Subroutines .. - EXTERNAL CHKXER, CTREXC, CTRSEN, CTRSNA, CTRSYL + EXTERNAL CHKXER, CTREXC, CTRSEN, CTRSNA, CTRSYL, CTRSYL3 * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -141,6 +141,43 @@ SUBROUTINE CERREC( PATH, NUNIT ) CALL CHKXER( 'CTRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test CTRSYL3 +* + SRNAMT = 'CTRSYL3' + INFOT = 1 + CALL CTRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CTRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test CTREXC * SRNAMT = 'CTREXC' diff --git a/lapack-netlib/TESTING/EIG/cget37.f b/lapack-netlib/TESTING/EIG/cget37.f index c2a6589f32..44d4580d6f 100644 --- a/lapack-netlib/TESTING/EIG/cget37.f +++ b/lapack-netlib/TESTING/EIG/cget37.f @@ -265,7 +265,7 @@ SUBROUTINE CGET37( RMAX, LMAX, NINFO, KNT, NIN ) 100 CONTINUE WSRT( KMIN ) = WSRT( I ) WSRT( I ) = VMIN - VCMIN = WTMP( I ) + VCMIN = REAL( WTMP( I ) ) WTMP( I ) = W( KMIN ) WTMP( KMIN ) = VCMIN VMIN = STMP( KMIN ) diff --git a/lapack-netlib/TESTING/EIG/chet21.f b/lapack-netlib/TESTING/EIG/chet21.f index a274681c98..1bd35cb41c 100644 --- a/lapack-netlib/TESTING/EIG/chet21.f +++ 
b/lapack-netlib/TESTING/EIG/chet21.f @@ -304,9 +304,9 @@ SUBROUTINE CHET21( ITYPE, UPLO, N, KBAND, A, LDA, D, E, U, LDU, V, 10 CONTINUE * IF( N.GT.1 .AND. KBAND.EQ.1 ) THEN - DO 20 J = 2, N - 1 + DO 20 J = 1, N - 1 CALL CHER2( CUPLO, N, -CMPLX( E( J ) ), U( 1, J ), 1, - $ U( 1, J-1 ), 1, WORK, N ) + $ U( 1, J+1 ), 1, WORK, N ) 20 CONTINUE END IF WNORM = CLANHE( '1', CUPLO, N, WORK, N, RWORK ) diff --git a/lapack-netlib/TESTING/EIG/csyl01.f b/lapack-netlib/TESTING/EIG/csyl01.f new file mode 100644 index 0000000000..82d790daa5 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/csyl01.f @@ -0,0 +1,294 @@ +*> \brief \b CSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* REAL THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* REAL RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CSYL01 tests CTRSYL and CTRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> where op(A) and op(B) are both upper triangular form, op() represents an +*> optional conjugate transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements CGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. 
+*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual CTRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual CTRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. of times CTRSYL3 and CTRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is DOUBLE PRECISION array, dimension (2) +*> RMAX(1) = Value of the largest test ratio of CTRSYL +*> RMAX(2) = Value of the largest test ratio of CTRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times CTRSYL where INFO is nonzero +*> NINFO(2) = No. of times CTRSYL3 where INFO is nonzero +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE CSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + REAL THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + REAL RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) + REAL ONE, ZERO + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 101, MAXN = 138, LDSWORK = 18 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, M, N + REAL ANRM, BNRM, BIGNUM, EPS, RES, RES1, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM + COMPLEX RMUL +* .. +* .. Local Arrays .. 
+ COMPLEX A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MAX( MAXM, MAXN ) ) + REAL SWORK( LDSWORK, 54 ), DUM( MAXN ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) +* .. +* .. External Functions .. + LOGICAL SISNAN + REAL SLAMCH, CLANGE + EXTERNAL SISNAN, SLAMCH, CLANGE +* .. +* .. External Subroutines .. + EXTERNAL CLATMR, CLACPY, CGEMM, CTRSYL, CTRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, REAL, MAX +* .. +* .. Executable Statements .. +* +* Get machine parameters +* + EPS = SLAMCH( 'P' ) + SMLNUM = SLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* +* Expect INFO = 0 + VM( 1 ) = ONE +* Expect INFO = 1 + VM( 2 ) = 0.5E+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + SCALE = ONE + SCALE3 = ONE + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + DO M = 32, MAXM, 23 + KLA = 0 + KUA = M - 1 + CALL CLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, + $ IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = CLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 29 + KLB = 0 + KUB = N - 1 + CALL CLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, + $ IINFO ) + DO I = 1, N + B( I, I ) = B( I, I ) * VM ( J ) + END DO + BNRM = CLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL CLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( 
ITRANA.EQ.1 ) + $ TRANA = 'N' + IF( ITRANA.EQ.2 ) + $ TRANA = 'C' + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) + $ TRANB = 'N' + IF( ITRANB.EQ.2 ) + $ TRANB = 'C' + KNT = KNT + 1 +* + CALL CLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL CLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL CTRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = CLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL CGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ CC, MAXM ) + CALL CGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = CLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL CLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL CLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL CTRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = CLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL CGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL CGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = CLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. 
+ $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. SISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of CSYL01 +* + END diff --git a/lapack-netlib/TESTING/EIG/dchkec.f b/lapack-netlib/TESTING/EIG/dchkec.f index 8549618842..c4451a627a 100644 --- a/lapack-netlib/TESTING/EIG/dchkec.f +++ b/lapack-netlib/TESTING/EIG/dchkec.f @@ -90,21 +90,23 @@ SUBROUTINE DCHKEC( THRESH, TSTERR, NIN, NOUT ) LOGICAL OK CHARACTER*3 PATH INTEGER KLAEXC, KLALN2, KLANV2, KLAQTR, KLASY2, KTREXC, - $ KTRSEN, KTRSNA, KTRSYL, LLAEXC, LLALN2, LLANV2, - $ LLAQTR, LLASY2, LTREXC, LTRSYL, NLANV2, NLAQTR, - $ NLASY2, NTESTS, NTRSYL, KTGEXC, NTGEXC, LTGEXC + $ KTRSEN, KTRSNA, KTRSYL, KTRSYL3, LLAEXC, + $ LLALN2, LLANV2, LLAQTR, LLASY2, LTREXC, LTRSYL, + $ NLANV2, NLAQTR, NLASY2, NTESTS, NTRSYL, KTGEXC, + $ LTGEXC DOUBLE PRECISION EPS, RLAEXC, RLALN2, RLANV2, RLAQTR, RLASY2, - $ RTREXC, RTRSYL, SFMIN, RTGEXC + $ RTREXC, SFMIN, RTGEXC * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NLAEXC( 2 ), - $ NLALN2( 2 ), NTREXC( 3 ), NTRSEN( 3 ), + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NLAEXC( 2 ), NLALN2( 2 ), + $ NTGEXC( 2 ), NTREXC( 3 ), NTRSEN( 3 ), $ NTRSNA( 3 ) - DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ) + DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. EXTERNAL DERREC, DGET31, DGET32, DGET33, DGET34, DGET35, - $ DGET36, DGET37, DGET38, DGET39, DGET40 + $ DGET36, DGET37, DGET38, DGET39, DGET40, DSYL01 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -153,10 +155,24 @@ SUBROUTINE DCHKEC( THRESH, TSTERR, NIN, NOUT ) WRITE( NOUT, FMT = 9996 )RLAEXC, LLAEXC, NLAEXC, KLAEXC END IF * - CALL DGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL ) - IF( RTRSYL.GT.THRESH ) THEN + CALL DGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. 
- WRITE( NOUT, FMT = 9995 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9995 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL DSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL DGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -227,7 +243,13 @@ SUBROUTINE DCHKEC( THRESH, TSTERR, NIN, NOUT ) 9987 FORMAT( ' Routines pass computational tests if test ratio is les', $ 's than', F8.2, / / ) 9986 FORMAT( ' Error in DTGEXC: RMAX =', D12.3, / ' LMAX = ', I8, ' N', - $ 'INFO=', I8, ' KNT=', I8 ) + $ 'INFO=', 2I8, ' KNT=', I8 ) + 9972 FORMAT( 'DTRSYL and DTRSYL3 compute an inconsistent result ', + $ 'factor in ', I8, ' tests.') + 9971 FORMAT( 'Error in DTRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9970 FORMAT( 'Error in DTRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) * * End of DCHKEC * diff --git a/lapack-netlib/TESTING/EIG/dchkhs.f b/lapack-netlib/TESTING/EIG/dchkhs.f index 2e57498965..79ba960086 100644 --- a/lapack-netlib/TESTING/EIG/dchkhs.f +++ b/lapack-netlib/TESTING/EIG/dchkhs.f @@ -23,7 +23,7 @@ * INTEGER ISEED( 4 ), IWORK( * ), NN( * ) * DOUBLE PRECISION A( LDA, * ), EVECTL( LDU, * ), * $ EVECTR( LDU, * ), EVECTX( LDU, * ), -* $ EVECTY( LDU, * ), H( LDA, * ), RESULT( 14 ), +* $ EVECTY( LDU, * ), H( LDA, * ), RESULT( 16 ), * $ T1( LDA, * ), T2( LDA, * ), TAU( * ), * $ U( LDU, * ), UU( LDU, * ), UZ( LDU, * ), * $ WI1( * ), WI2( * ), WI3( * ), WORK( * ), @@ -49,15 +49,21 @@ *> T is "quasi-triangular", and the eigenvalue vector W. *> *> DTREVC computes the left and right eigenvector matrices -*> L and R for T. 
+*> L and R for T. L is lower quasi-triangular, and R is +*> upper quasi-triangular. *> *> DHSEIN computes the left and right eigenvector matrices *> Y and X for H, using inverse iteration. *> +*> DTREVC3 computes left and right eigenvector matrices +*> from a Schur matrix T and backtransforms them with Z +*> to eigenvector matrices L and R for A. L and R are +*> GE matrices. +*> *> When DCHKHS is called, a number of matrix "sizes" ("n's") and a *> number of matrix "types" are specified. For each size ("n") *> and each type of matrix, one matrix will be generated and used -*> to test the nonsymmetric eigenroutines. For each matrix, 14 +*> to test the nonsymmetric eigenroutines. For each matrix, 16 *> tests will be performed: *> *> (1) | A - U H U**T | / ( |A| n ulp ) @@ -88,6 +94,10 @@ *> *> (14) | Y**H A - W**H Y | / ( |A| |Y| ulp ) *> +*> (15) | AR - RW | / ( |A| |R| ulp ) +*> +*> (16) | LA - WL | / ( |A| |L| ulp ) +*> *> The "sizes" are specified by an array NN(1:NSIZES); the value of *> each element NN(j) specifies one size. *> The "types" are specified by a logical array DOTYPE( 1:NTYPES ); @@ -331,7 +341,7 @@ *> Workspace. *> Modified. *> -*> RESULT - DOUBLE PRECISION array, dimension (14) +*> RESULT - DOUBLE PRECISION array, dimension (16) *> The values computed by the fourteen tests described above. *> The values are currently limited to 1/ulp, to avoid *> overflow. 
@@ -423,7 +433,7 @@ SUBROUTINE DCHKHS( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, INTEGER ISEED( 4 ), IWORK( * ), NN( * ) DOUBLE PRECISION A( LDA, * ), EVECTL( LDU, * ), $ EVECTR( LDU, * ), EVECTX( LDU, * ), - $ EVECTY( LDU, * ), H( LDA, * ), RESULT( 14 ), + $ EVECTY( LDU, * ), H( LDA, * ), RESULT( 16 ), $ T1( LDA, * ), T2( LDA, * ), TAU( * ), $ U( LDU, * ), UU( LDU, * ), UZ( LDU, * ), $ WI1( * ), WI2( * ), WI3( * ), WORK( * ), @@ -461,7 +471,7 @@ SUBROUTINE DCHKHS( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, EXTERNAL DCOPY, DGEHRD, DGEMM, DGET10, DGET22, DHSEIN, $ DHSEQR, DHST01, DLABAD, DLACPY, DLAFTS, DLASET, $ DLASUM, DLATME, DLATMR, DLATMS, DORGHR, DORMHR, - $ DTREVC, XERBLA + $ DTREVC, DTREVC3, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, MAX, MIN, SQRT @@ -561,7 +571,7 @@ SUBROUTINE DCHKHS( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, * * Initialize RESULT * - DO 30 J = 1, 14 + DO 30 J = 1, 16 RESULT( J ) = ZERO 30 CONTINUE * @@ -1108,6 +1118,64 @@ SUBROUTINE DCHKHS( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, $ RESULT( 14 ) = DUMMA( 3 )*ANINV END IF * +* Compute Left and Right Eigenvectors of A +* +* Compute a Right eigenvector matrix: +* + NTEST = 15 + RESULT( 15 ) = ULPINV +* + CALL DLACPY( ' ', N, N, UZ, LDU, EVECTR, LDU ) +* + CALL DTREVC3( 'Right', 'Back', SELECT, N, T1, LDA, DUMMA, + $ LDU, EVECTR, LDU, N, IN, WORK, NWORK, IINFO ) + IF( IINFO.NE.0 ) THEN + WRITE( NOUNIT, FMT = 9999 )'DTREVC3(R,B)', IINFO, N, + $ JTYPE, IOLDSD + INFO = ABS( IINFO ) + GO TO 250 + END IF +* +* Test 15: | AR - RW | / ( |A| |R| ulp ) +* +* (from Schur decomposition) +* + CALL DGET22( 'N', 'N', 'N', N, A, LDA, EVECTR, LDU, WR1, + $ WI1, WORK, DUMMA( 1 ) ) + RESULT( 15 ) = DUMMA( 1 ) + IF( DUMMA( 2 ).GT.THRESH ) THEN + WRITE( NOUNIT, FMT = 9998 )'Right', 'DTREVC3', + $ DUMMA( 2 ), N, JTYPE, IOLDSD + END IF +* +* Compute a Left eigenvector matrix: +* + NTEST = 16 + RESULT( 16 ) = ULPINV +* + CALL DLACPY( ' ', N, N, UZ, LDU, EVECTL, LDU ) +* + CALL DTREVC3( 'Left', 
'Back', SELECT, N, T1, LDA, EVECTL, + $ LDU, DUMMA, LDU, N, IN, WORK, NWORK, IINFO ) + IF( IINFO.NE.0 ) THEN + WRITE( NOUNIT, FMT = 9999 )'DTREVC3(L,B)', IINFO, N, + $ JTYPE, IOLDSD + INFO = ABS( IINFO ) + GO TO 250 + END IF +* +* Test 16: | LA - WL | / ( |A| |L| ulp ) +* +* (from Schur decomposition) +* + CALL DGET22( 'Trans', 'N', 'Conj', N, A, LDA, EVECTL, LDU, + $ WR1, WI1, WORK, DUMMA( 3 ) ) + RESULT( 16 ) = DUMMA( 3 ) + IF( DUMMA( 4 ).GT.THRESH ) THEN + WRITE( NOUNIT, FMT = 9998 )'Left', 'DTREVC3', DUMMA( 4 ), + $ N, JTYPE, IOLDSD + END IF +* * End of Loop -- Check for RESULT(j) > THRESH * 250 CONTINUE diff --git a/lapack-netlib/TESTING/EIG/ddrvsg.f b/lapack-netlib/TESTING/EIG/ddrvsg.f index 0b49c8404a..2e9d3c643e 100644 --- a/lapack-netlib/TESTING/EIG/ddrvsg.f +++ b/lapack-netlib/TESTING/EIG/ddrvsg.f @@ -645,8 +645,8 @@ SUBROUTINE DDRVSG( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, IL = 1 IU = N ELSE - IL = 1 + ( N-1 )*DLARND( 1, ISEED2 ) - IU = 1 + ( N-1 )*DLARND( 1, ISEED2 ) + IL = 1 + INT( ( N-1 )*DLARND( 1, ISEED2 ) ) + IU = 1 + INT( ( N-1 )*DLARND( 1, ISEED2 ) ) IF( IL.GT.IU ) THEN ITEMP = IL IL = IU diff --git a/lapack-netlib/TESTING/EIG/derrec.f b/lapack-netlib/TESTING/EIG/derrec.f index d5863ad426..f11f488878 100644 --- a/lapack-netlib/TESTING/EIG/derrec.f +++ b/lapack-netlib/TESTING/EIG/derrec.f @@ -23,7 +23,7 @@ *> *> DERREC tests the error exits for the routines for eigen- condition *> estimation for DOUBLE PRECISION matrices: -*> DTRSYL, DTREXC, DTRSNA and DTRSEN. +*> DTRSYL, DTRSYL3, DTREXC, DTRSNA and DTRSEN. *> \endverbatim * * Arguments: @@ -82,7 +82,7 @@ SUBROUTINE DERREC( PATH, NUNIT ) $ WI( NMAX ), WORK( NMAX ), WR( NMAX ) * .. * .. External Subroutines .. - EXTERNAL CHKXER, DTREXC, DTRSEN, DTRSNA, DTRSYL + EXTERNAL CHKXER, DTREXC, DTRSEN, DTRSNA, DTRSYL, DTRSYL3 * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -141,6 +141,43 @@ SUBROUTINE DERREC( PATH, NUNIT ) CALL CHKXER( 'DTRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test DTRSYL3 +* + SRNAMT = 'DTRSYL3' + INFOT = 1 + CALL DTRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DTRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DTRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test DTREXC * SRNAMT = 'DTREXC' diff --git a/lapack-netlib/TESTING/EIG/derred.f b/lapack-netlib/TESTING/EIG/derred.f index 6df5178253..11a9320526 100644 --- a/lapack-netlib/TESTING/EIG/derred.f +++ b/lapack-netlib/TESTING/EIG/derred.f @@ -99,7 +99,7 @@ SUBROUTINE DERRED( PATH, NUNIT ) * .. * .. External Subroutines .. EXTERNAL CHKXER, DGEES, DGEESX, DGEEV, DGEEVX, DGEJSV, - $ DGESDD, DGESVD, DGESVDX, DGESVQ + $ DGESDD, DGESVD, DGESVDX, DGESVDQ * .. * .. External Functions .. 
LOGICAL DSLECT, LSAMEN diff --git a/lapack-netlib/TESTING/EIG/dsyl01.f b/lapack-netlib/TESTING/EIG/dsyl01.f new file mode 100644 index 0000000000..782d2cd42f --- /dev/null +++ b/lapack-netlib/TESTING/EIG/dsyl01.f @@ -0,0 +1,288 @@ +*> \brief \b DSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* DOUBLE PRECISION RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DSYL01 tests DTRSYL and DTRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> A and B are assumed to be in Schur canonical form, op() represents an +*> optional transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements DGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual DTRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual DTRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. 
of times DTRSYL3 and DTRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is DOUBLE PRECISION, dimension (2) +*> RMAX(1) = Value of the largest test ratio of DTRSYL +*> RMAX(2) = Value of the largest test ratio of DTRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times DTRSYL returns an expected INFO +*> NINFO(2) = No. of times DTRSYL3 returns an expected INFO +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE DSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + DOUBLE PRECISION RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 245, MAXN = 192, LDSWORK = 36 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, LIWORK, M, N + DOUBLE PRECISION ANRM, BNRM, BIGNUM, EPS, RES, RES1, RMUL, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM +* .. +* .. Local Arrays .. + DOUBLE PRECISION A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MAX( MAXM, MAXN ) ), DUM( MAXN ), + $ SWORK( LDSWORK, 126 ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 ) +* .. +* .. External Functions .. + LOGICAL DISNAN + DOUBLE PRECISION DLAMCH, DLANGE + EXTERNAL DLAMCH, DLANGE +* .. +* .. External Subroutines .. 
+ EXTERNAL DLATMR, DLACPY, DGEMM, DTRSYL, DTRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, MAX +* .. +* .. Executable Statements .. +* +* Get machine parameters +* + EPS = DLAMCH( 'P' ) + SMLNUM = DLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* + VM( 1 ) = ONE + VM( 2 ) = 0.000001D+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + DO I = 1, 4 + ISEED( I ) = 1 + END DO + SCALE = ONE + SCALE3 = ONE + LIWORK = MAXM + MAXN + 2 + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + DO I = 1, 4 + ISEED( I ) = 1 + END DO + DO M = 32, MAXM, 71 + KLA = 0 + KUA = M - 1 + CALL DLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = DLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 47 + KLB = 0 + KUB = N - 1 + CALL DLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, IINFO ) + BNRM = DLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL DLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) THEN + TRANA = 'N' + END IF + IF( ITRANA.EQ.2 ) THEN + TRANA = 'T' + END IF + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) THEN + TRANB = 'N' + END IF + IF( ITRANB.EQ.2 ) THEN + TRANB = 'T' + END IF + KNT = KNT + 1 +* + CALL DLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL DLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL DTRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = DLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE 
.AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL DGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ CC, MAXM ) + CALL DGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, CC, MAXM ) + RES1 = DLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL DLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL DLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL DTRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, IWORK, LIWORK, + $ SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = DLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL DGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL DGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, CC, MAXM ) + RES1 = DLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. 
DISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of DSYL01 +* + END diff --git a/lapack-netlib/TESTING/EIG/schkec.f b/lapack-netlib/TESTING/EIG/schkec.f index e6123e1ad9..59abb24664 100644 --- a/lapack-netlib/TESTING/EIG/schkec.f +++ b/lapack-netlib/TESTING/EIG/schkec.f @@ -90,21 +90,23 @@ SUBROUTINE SCHKEC( THRESH, TSTERR, NIN, NOUT ) LOGICAL OK CHARACTER*3 PATH INTEGER KLAEXC, KLALN2, KLANV2, KLAQTR, KLASY2, KTREXC, - $ KTRSEN, KTRSNA, KTRSYL, LLAEXC, LLALN2, LLANV2, - $ LLAQTR, LLASY2, LTREXC, LTRSYL, NLANV2, NLAQTR, - $ NLASY2, NTESTS, NTRSYL, KTGEXC, NTGEXC, LTGEXC + $ KTRSEN, KTRSNA, KTRSYL, KTRSYL3, LLAEXC, + $ LLALN2, LLANV2, LLAQTR, LLASY2, LTREXC, LTRSYL, + $ NLANV2, NLAQTR, NLASY2, NTESTS, NTRSYL, KTGEXC, + $ LTGEXC REAL EPS, RLAEXC, RLALN2, RLANV2, RLAQTR, RLASY2, - $ RTREXC, RTRSYL, SFMIN, RTGEXC + $ RTREXC, SFMIN, RTGEXC * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NLAEXC( 2 ), - $ NLALN2( 2 ), NTREXC( 3 ), NTRSEN( 3 ), + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NLAEXC( 2 ), NLALN2( 2 ), + $ NTGEXC( 2 ), NTREXC( 3 ), NTRSEN( 3 ), $ NTRSNA( 3 ) - REAL RTRSEN( 3 ), RTRSNA( 3 ) + REAL RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. EXTERNAL SERREC, SGET31, SGET32, SGET33, SGET34, SGET35, - $ SGET36, SGET37, SGET38, SGET39, SGET40 + $ SGET36, SGET37, SGET38, SGET39, SGET40, SSYL01 * .. * .. External Functions .. REAL SLAMCH @@ -153,10 +155,24 @@ SUBROUTINE SCHKEC( THRESH, TSTERR, NIN, NOUT ) WRITE( NOUT, FMT = 9996 )RLAEXC, LLAEXC, NLAEXC, KLAEXC END IF * - CALL SGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL ) - IF( RTRSYL.GT.THRESH ) THEN + CALL SGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. 
- WRITE( NOUT, FMT = 9995 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9995 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL SSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL SGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -227,7 +243,13 @@ SUBROUTINE SCHKEC( THRESH, TSTERR, NIN, NOUT ) 9987 FORMAT( ' Routines pass computational tests if test ratio is les', $ 's than', F8.2, / / ) 9986 FORMAT( ' Error in STGEXC: RMAX =', E12.3, / ' LMAX = ', I8, ' N', - $ 'INFO=', I8, ' KNT=', I8 ) + $ 'INFO=', 2I8, ' KNT=', I8 ) + 9972 FORMAT( 'STRSYL and STRSYL3 compute an inconsistent result ', + $ 'factor in ', I8, ' tests.') + 9971 FORMAT( 'Error in STRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9970 FORMAT( 'Error in STRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) * * End of SCHKEC * diff --git a/lapack-netlib/TESTING/EIG/schkhs.f b/lapack-netlib/TESTING/EIG/schkhs.f index ab0e901383..bf8eb1b409 100644 --- a/lapack-netlib/TESTING/EIG/schkhs.f +++ b/lapack-netlib/TESTING/EIG/schkhs.f @@ -23,7 +23,7 @@ * INTEGER ISEED( 4 ), IWORK( * ), NN( * ) * REAL A( LDA, * ), EVECTL( LDU, * ), * $ EVECTR( LDU, * ), EVECTX( LDU, * ), -* $ EVECTY( LDU, * ), H( LDA, * ), RESULT( 14 ), +* $ EVECTY( LDU, * ), H( LDA, * ), RESULT( 16 ), * $ T1( LDA, * ), T2( LDA, * ), TAU( * ), * $ U( LDU, * ), UU( LDU, * ), UZ( LDU, * ), * $ WI1( * ), WI2( * ), WI3( * ), WORK( * ), @@ -54,10 +54,15 @@ *> SHSEIN computes the left and right eigenvector matrices *> Y and X for H, using inverse iteration. 
*> +*> STREVC3 computes left and right eigenvector matrices +*> from a Schur matrix T and backtransforms them with Z +*> to eigenvector matrices L and R for A. L and R are +*> GE matrices. +*> *> When SCHKHS is called, a number of matrix "sizes" ("n's") and a *> number of matrix "types" are specified. For each size ("n") *> and each type of matrix, one matrix will be generated and used -*> to test the nonsymmetric eigenroutines. For each matrix, 14 +*> to test the nonsymmetric eigenroutines. For each matrix, 16 *> tests will be performed: *> *> (1) | A - U H U**T | / ( |A| n ulp ) @@ -88,6 +93,10 @@ *> *> (14) | Y**H A - W**H Y | / ( |A| |Y| ulp ) *> +*> (15) | AR - RW | / ( |A| |R| ulp ) +*> +*> (16) | LA - WL | / ( |A| |L| ulp ) +*> *> The "sizes" are specified by an array NN(1:NSIZES); the value of *> each element NN(j) specifies one size. *> The "types" are specified by a logical array DOTYPE( 1:NTYPES ); @@ -331,7 +340,7 @@ *> Workspace. *> Modified. *> -*> RESULT - REAL array, dimension (14) +*> RESULT - REAL array, dimension (16) *> The values computed by the fourteen tests described above. *> The values are currently limited to 1/ulp, to avoid *> overflow. @@ -423,7 +432,7 @@ SUBROUTINE SCHKHS( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, INTEGER ISEED( 4 ), IWORK( * ), NN( * ) REAL A( LDA, * ), EVECTL( LDU, * ), $ EVECTR( LDU, * ), EVECTX( LDU, * ), - $ EVECTY( LDU, * ), H( LDA, * ), RESULT( 14 ), + $ EVECTY( LDU, * ), H( LDA, * ), RESULT( 16 ), $ T1( LDA, * ), T2( LDA, * ), TAU( * ), $ U( LDU, * ), UU( LDU, * ), UZ( LDU, * ), $ WI1( * ), WI2( * ), WI3( * ), WORK( * ), @@ -461,7 +470,7 @@ SUBROUTINE SCHKHS( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, EXTERNAL SCOPY, SGEHRD, SGEMM, SGET10, SGET22, SHSEIN, $ SHSEQR, SHST01, SLABAD, SLACPY, SLAFTS, SLASET, $ SLASUM, SLATME, SLATMR, SLATMS, SORGHR, SORMHR, - $ STREVC, XERBLA + $ STREVC, STREVC3, XERBLA * .. * .. Intrinsic Functions .. 
INTRINSIC ABS, MAX, MIN, REAL, SQRT @@ -561,7 +570,7 @@ SUBROUTINE SCHKHS( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, * * Initialize RESULT * - DO 30 J = 1, 14 + DO 30 J = 1, 16 RESULT( J ) = ZERO 30 CONTINUE * @@ -1108,6 +1117,64 @@ SUBROUTINE SCHKHS( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, $ RESULT( 14 ) = DUMMA( 3 )*ANINV END IF * +* Compute Left and Right Eigenvectors of A +* +* Compute a Right eigenvector matrix: +* + NTEST = 15 + RESULT( 15 ) = ULPINV +* + CALL SLACPY( ' ', N, N, UZ, LDU, EVECTR, LDU ) +* + CALL STREVC3( 'Right', 'Back', SELECT, N, T1, LDA, DUMMA, + $ LDU, EVECTR, LDU, N, IN, WORK, NWORK, IINFO ) + IF( IINFO.NE.0 ) THEN + WRITE( NOUNIT, FMT = 9999 )'STREVC3(R,B)', IINFO, N, + $ JTYPE, IOLDSD + INFO = ABS( IINFO ) + GO TO 250 + END IF +* +* Test 15: | AR - RW | / ( |A| |R| ulp ) +* +* (from Schur decomposition) +* + CALL SGET22( 'N', 'N', 'N', N, A, LDA, EVECTR, LDU, WR1, + $ WI1, WORK, DUMMA( 1 ) ) + RESULT( 15 ) = DUMMA( 1 ) + IF( DUMMA( 2 ).GT.THRESH ) THEN + WRITE( NOUNIT, FMT = 9998 )'Right', 'STREVC3', + $ DUMMA( 2 ), N, JTYPE, IOLDSD + END IF +* +* Compute a Left eigenvector matrix: +* + NTEST = 16 + RESULT( 16 ) = ULPINV +* + CALL SLACPY( ' ', N, N, UZ, LDU, EVECTL, LDU ) +* + CALL STREVC3( 'Left', 'Back', SELECT, N, T1, LDA, EVECTL, + $ LDU, DUMMA, LDU, N, IN, WORK, NWORK, IINFO ) + IF( IINFO.NE.0 ) THEN + WRITE( NOUNIT, FMT = 9999 )'STREVC3(L,B)', IINFO, N, + $ JTYPE, IOLDSD + INFO = ABS( IINFO ) + GO TO 250 + END IF +* +* Test 16: | LA - WL | / ( |A| |L| ulp ) +* +* (from Schur decomposition) +* + CALL SGET22( 'Trans', 'N', 'Conj', N, A, LDA, EVECTL, LDU, + $ WR1, WI1, WORK, DUMMA( 3 ) ) + RESULT( 16 ) = DUMMA( 3 ) + IF( DUMMA( 4 ).GT.THRESH ) THEN + WRITE( NOUNIT, FMT = 9998 )'Left', 'STREVC3', DUMMA( 4 ), + $ N, JTYPE, IOLDSD + END IF +* * End of Loop -- Check for RESULT(j) > THRESH * 250 CONTINUE diff --git a/lapack-netlib/TESTING/EIG/sdrvsg.f b/lapack-netlib/TESTING/EIG/sdrvsg.f index 4a57223c80..877579bcd0 100644 --- 
a/lapack-netlib/TESTING/EIG/sdrvsg.f +++ b/lapack-netlib/TESTING/EIG/sdrvsg.f @@ -645,8 +645,8 @@ SUBROUTINE SDRVSG( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, IL = 1 IU = N ELSE - IL = 1 + ( N-1 )*SLARND( 1, ISEED2 ) - IU = 1 + ( N-1 )*SLARND( 1, ISEED2 ) + IL = 1 + INT( ( N-1 )*SLARND( 1, ISEED2 ) ) + IU = 1 + INT( ( N-1 )*SLARND( 1, ISEED2 ) ) IF( IL.GT.IU ) THEN ITEMP = IL IL = IU diff --git a/lapack-netlib/TESTING/EIG/serrec.f b/lapack-netlib/TESTING/EIG/serrec.f index 249f0e6424..9a7ceb3627 100644 --- a/lapack-netlib/TESTING/EIG/serrec.f +++ b/lapack-netlib/TESTING/EIG/serrec.f @@ -23,7 +23,7 @@ *> *> SERREC tests the error exits for the routines for eigen- condition *> estimation for REAL matrices: -*> STRSYL, STREXC, STRSNA and STRSEN. +*> STRSYL, STRSYL3, STREXC, STRSNA and STRSEN. *> \endverbatim * * Arguments: @@ -82,7 +82,7 @@ SUBROUTINE SERREC( PATH, NUNIT ) $ WI( NMAX ), WORK( NMAX ), WR( NMAX ) * .. * .. External Subroutines .. - EXTERNAL CHKXER, STREXC, STRSEN, STRSNA, STRSYL + EXTERNAL CHKXER, STREXC, STRSEN, STRSNA, STRSYL, STRSYL3 * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -141,6 +141,43 @@ SUBROUTINE SERREC( PATH, NUNIT ) CALL CHKXER( 'STRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test STRSYL3 +* + SRNAMT = 'STRSYL3' + INFOT = 1 + CALL STRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL STRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL STRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test STREXC * SRNAMT = 'STREXC' diff --git a/lapack-netlib/TESTING/EIG/ssyl01.f b/lapack-netlib/TESTING/EIG/ssyl01.f new file mode 100644 index 0000000000..22d089dc81 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/ssyl01.f @@ -0,0 +1,288 @@ +*> \brief \b SSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. 
+* INTEGER KNT +* REAL THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* REAL RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SSYL01 tests STRSYL and STRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> A and B are assumed to be in Schur canonical form, op() represents an +*> optional transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements SGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual STRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual STRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. of times STRSYL3 and STRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is REAL, dimension (2) +*> RMAX(1) = Value of the largest test ratio of STRSYL +*> RMAX(2) = Value of the largest test ratio of STRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times STRSYL returns an expected INFO +*> NINFO(2) = No. of times STRSYL3 returns an expected INFO +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. 
+*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE SSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + REAL THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + REAL RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 101, MAXN = 138, LDSWORK = 18 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, LIWORK, M, N + REAL ANRM, BNRM, BIGNUM, EPS, RES, RES1, RMUL, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM +* .. +* .. Local Arrays .. + REAL A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MAX( MAXM, MAXN ) ), DUM( MAXN ), + $ SWORK( LDSWORK, 54 ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 ) +* .. +* .. External Functions .. + LOGICAL SISNAN + REAL SLAMCH, SLANGE + EXTERNAL SISNAN, SLAMCH, SLANGE +* .. +* .. External Subroutines .. + EXTERNAL SLATMR, SLACPY, SGEMM, STRSYL, STRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, REAL, MAX +* .. +* .. Executable Statements .. 
+* +* Get machine parameters +* + EPS = SLAMCH( 'P' ) + SMLNUM = SLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* + VM( 1 ) = ONE + VM( 2 ) = 0.05E+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + DO I = 1, 4 + ISEED( I ) = 1 + END DO + SCALE = ONE + SCALE3 = ONE + LIWORK = MAXM + MAXN + 2 + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + DO I = 1, 4 + ISEED( I ) = 1 + END DO + DO M = 32, MAXM, 71 + KLA = 0 + KUA = M - 1 + CALL SLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = SLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 47 + KLB = 0 + KUB = N - 1 + CALL SLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, IINFO ) + BNRM = SLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL SLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) THEN + TRANA = 'N' + END IF + IF( ITRANA.EQ.2 ) THEN + TRANA = 'T' + END IF + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) THEN + TRANB = 'N' + END IF + IF( ITRANB.EQ.2 ) THEN + TRANB = 'T' + END IF + KNT = KNT + 1 +* + CALL SLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL SLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL STRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = SLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. 
TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL SGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ C, MAXM ) + CALL SGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, C, MAXM ) + RES1 = SLANGE( 'M', M, N, C, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL SLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL SLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL STRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, IWORK, LIWORK, + $ SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = SLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL SGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL SGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, CC, MAXM ) + RES1 = SLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. SISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of SSYL01 +* + END diff --git a/lapack-netlib/TESTING/EIG/zchkec.f b/lapack-netlib/TESTING/EIG/zchkec.f index 1e1c29e0d0..62a76d3574 100644 --- a/lapack-netlib/TESTING/EIG/zchkec.f +++ b/lapack-netlib/TESTING/EIG/zchkec.f @@ -88,17 +88,17 @@ SUBROUTINE ZCHKEC( THRESH, TSTERR, NIN, NOUT ) * .. Local Scalars .. 
LOGICAL OK CHARACTER*3 PATH - INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, LTREXC, LTRSYL, - $ NTESTS, NTREXC, NTRSYL - DOUBLE PRECISION EPS, RTREXC, RTRSYL, SFMIN + INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, KTRSYL3, + $ LTREXC, LTRSYL, NTESTS, NTREXC, NTRSYL + DOUBLE PRECISION EPS, RTREXC, SFMIN * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NTRSEN( 3 ), - $ NTRSNA( 3 ) - DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ) + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NTRSEN( 3 ), NTRSNA( 3 ) + DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. - EXTERNAL ZERREC, ZGET35, ZGET36, ZGET37, ZGET38 + EXTERNAL ZERREC, ZGET35, ZGET36, ZGET37, ZGET38, ZSYL01 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -120,10 +120,24 @@ SUBROUTINE ZCHKEC( THRESH, TSTERR, NIN, NOUT ) $ CALL ZERREC( PATH, NOUT ) * OK = .TRUE. - CALL ZGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL, NIN ) - IF( RTRSYL.GT.THRESH ) THEN + CALL ZGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL, NIN ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. - WRITE( NOUT, FMT = 9999 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9999 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL ZSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. 
+ WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL ZGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -148,7 +162,7 @@ SUBROUTINE ZCHKEC( THRESH, TSTERR, NIN, NOUT ) WRITE( NOUT, FMT = 9996 )RTRSEN, LTRSEN, NTRSEN, KTRSEN END IF * - NTESTS = KTRSYL + KTREXC + KTRSNA + KTRSEN + NTESTS = KTRSYL + KTRSYL3 + KTREXC + KTRSNA + KTRSEN IF( OK ) $ WRITE( NOUT, FMT = 9995 )PATH, NTESTS * @@ -169,6 +183,12 @@ SUBROUTINE ZCHKEC( THRESH, TSTERR, NIN, NOUT ) $ / ' Safe minimum (SFMIN) = ', D16.6, / ) 9992 FORMAT( ' Routines pass computational tests if test ratio is ', $ 'less than', F8.2, / / ) + 9970 FORMAT( 'Error in ZTRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9971 FORMAT( 'Error in ZTRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9972 FORMAT( 'ZTRSYL and ZTRSYL3 compute an inconsistent scale ', + $ 'factor in ', I8, ' tests.') RETURN * * End of ZCHKEC diff --git a/lapack-netlib/TESTING/EIG/zchkhs.f b/lapack-netlib/TESTING/EIG/zchkhs.f index 52962a0414..f5ae9b7f3c 100644 --- a/lapack-netlib/TESTING/EIG/zchkhs.f +++ b/lapack-netlib/TESTING/EIG/zchkhs.f @@ -21,7 +21,7 @@ * .. Array Arguments .. * LOGICAL DOTYPE( * ), SELECT( * ) * INTEGER ISEED( 4 ), IWORK( * ), NN( * ) -* DOUBLE PRECISION RESULT( 14 ), RWORK( * ) +* DOUBLE PRECISION RESULT( 16 ), RWORK( * ) * COMPLEX*16 A( LDA, * ), EVECTL( LDU, * ), * $ EVECTR( LDU, * ), EVECTX( LDU, * ), * $ EVECTY( LDU, * ), H( LDA, * ), T1( LDA, * ), @@ -64,10 +64,15 @@ *> eigenvectors of H. Y is lower triangular, and X is *> upper triangular. *> +*> ZTREVC3 computes left and right eigenvector matrices +*> from a Schur matrix T and backtransforms them with Z +*> to eigenvector matrices L and R for A. L and R are +*> GE matrices. +*> *> When ZCHKHS is called, a number of matrix "sizes" ("n's") and a *> number of matrix "types" are specified. 
For each size ("n") *> and each type of matrix, one matrix will be generated and used -*> to test the nonsymmetric eigenroutines. For each matrix, 14 +*> to test the nonsymmetric eigenroutines. For each matrix, 16 *> tests will be performed: *> *> (1) | A - U H U**H | / ( |A| n ulp ) @@ -98,6 +103,10 @@ *> *> (14) | Y**H A - W**H Y | / ( |A| |Y| ulp ) *> +*> (15) | AR - RW | / ( |A| |R| ulp ) +*> +*> (16) | LA - WL | / ( |A| |L| ulp ) +*> *> The "sizes" are specified by an array NN(1:NSIZES); the value of *> each element NN(j) specifies one size. *> The "types" are specified by a logical array DOTYPE( 1:NTYPES ); @@ -331,7 +340,7 @@ *> Workspace. Could be equivalenced to IWORK, but not RWORK. *> Modified. *> -*> RESULT - DOUBLE PRECISION array, dimension (14) +*> RESULT - DOUBLE PRECISION array, dimension (16) *> The values computed by the fourteen tests described above. *> The values are currently limited to 1/ulp, to avoid *> overflow. @@ -421,7 +430,7 @@ SUBROUTINE ZCHKHS( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, * .. Array Arguments .. LOGICAL DOTYPE( * ), SELECT( * ) INTEGER ISEED( 4 ), IWORK( * ), NN( * ) - DOUBLE PRECISION RESULT( 14 ), RWORK( * ) + DOUBLE PRECISION RESULT( 16 ), RWORK( * ) COMPLEX*16 A( LDA, * ), EVECTL( LDU, * ), $ EVECTR( LDU, * ), EVECTX( LDU, * ), $ EVECTY( LDU, * ), H( LDA, * ), T1( LDA, * ), @@ -464,7 +473,7 @@ SUBROUTINE ZCHKHS( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, EXTERNAL DLABAD, DLAFTS, DLASUM, XERBLA, ZCOPY, ZGEHRD, $ ZGEMM, ZGET10, ZGET22, ZHSEIN, ZHSEQR, ZHST01, $ ZLACPY, ZLASET, ZLATME, ZLATMR, ZLATMS, ZTREVC, - $ ZUNGHR, ZUNMHR + $ ZTREVC3, ZUNGHR, ZUNMHR * .. * .. Intrinsic Functions .. 
INTRINSIC ABS, DBLE, MAX, MIN, SQRT @@ -1067,6 +1076,66 @@ SUBROUTINE ZCHKHS( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, $ RESULT( 14 ) = DUMMA( 3 )*ANINV END IF * +* Compute Left and Right Eigenvectors of A +* +* Compute a Right eigenvector matrix: +* + NTEST = 15 + RESULT( 15 ) = ULPINV +* + CALL ZLACPY( ' ', N, N, UZ, LDU, EVECTR, LDU ) +* + CALL ZTREVC3( 'Right', 'Back', SELECT, N, T1, LDA, CDUMMA, + $ LDU, EVECTR, LDU, N, IN, WORK, NWORK, RWORK, + $ N, IINFO ) + IF( IINFO.NE.0 ) THEN + WRITE( NOUNIT, FMT = 9999 )'ZTREVC3(R,B)', IINFO, N, + $ JTYPE, IOLDSD + INFO = ABS( IINFO ) + GO TO 250 + END IF +* +* Test 15: | AR - RW | / ( |A| |R| ulp ) +* +* (from Schur decomposition) +* + CALL ZGET22( 'N', 'N', 'N', N, A, LDA, EVECTR, LDU, W1, + $ WORK, RWORK, DUMMA( 1 ) ) + RESULT( 15 ) = DUMMA( 1 ) + IF( DUMMA( 2 ).GT.THRESH ) THEN + WRITE( NOUNIT, FMT = 9998 )'Right', 'ZTREVC3', + $ DUMMA( 2 ), N, JTYPE, IOLDSD + END IF +* +* Compute a Left eigenvector matrix: +* + NTEST = 16 + RESULT( 16 ) = ULPINV +* + CALL ZLACPY( ' ', N, N, UZ, LDU, EVECTL, LDU ) +* + CALL ZTREVC3( 'Left', 'Back', SELECT, N, T1, LDA, EVECTL, + $ LDU, CDUMMA, LDU, N, IN, WORK, NWORK, RWORK, + $ N, IINFO ) + IF( IINFO.NE.0 ) THEN + WRITE( NOUNIT, FMT = 9999 )'ZTREVC3(L,B)', IINFO, N, + $ JTYPE, IOLDSD + INFO = ABS( IINFO ) + GO TO 250 + END IF +* +* Test 16: | LA - WL | / ( |A| |L| ulp ) +* +* (from Schur decomposition) +* + CALL ZGET22( 'Conj', 'N', 'Conj', N, A, LDA, EVECTL, LDU, + $ W1, WORK, RWORK, DUMMA( 3 ) ) + RESULT( 16 ) = DUMMA( 3 ) + IF( DUMMA( 4 ).GT.THRESH ) THEN + WRITE( NOUNIT, FMT = 9998 )'Left', 'ZTREVC3', DUMMA( 4 ), + $ N, JTYPE, IOLDSD + END IF +* * End of Loop -- Check for RESULT(j) > THRESH * 240 CONTINUE diff --git a/lapack-netlib/TESTING/EIG/zdrvsg.f b/lapack-netlib/TESTING/EIG/zdrvsg.f index 336514a3fe..71f1d6371b 100644 --- a/lapack-netlib/TESTING/EIG/zdrvsg.f +++ b/lapack-netlib/TESTING/EIG/zdrvsg.f @@ -663,8 +663,8 @@ SUBROUTINE ZDRVSG( NSIZES, NN, NTYPES, DOTYPE, ISEED, 
THRESH, IL = 1 IU = N ELSE - IL = 1 + ( N-1 )*DLARND( 1, ISEED2 ) - IU = 1 + ( N-1 )*DLARND( 1, ISEED2 ) + IL = 1 + INT( ( N-1 )*DLARND( 1, ISEED2 ) ) + IU = 1 + INT( ( N-1 )*DLARND( 1, ISEED2 ) ) IF( IL.GT.IU ) THEN ITEMP = IL IL = IU diff --git a/lapack-netlib/TESTING/EIG/zerrec.f b/lapack-netlib/TESTING/EIG/zerrec.f index dc6129da91..e1938f57d1 100644 --- a/lapack-netlib/TESTING/EIG/zerrec.f +++ b/lapack-netlib/TESTING/EIG/zerrec.f @@ -23,7 +23,7 @@ *> *> ZERREC tests the error exits for the routines for eigen- condition *> estimation for DOUBLE PRECISION matrices: -*> ZTRSYL, ZTREXC, ZTRSNA and ZTRSEN. +*> ZTRSYL, ZTRSYL3, ZTREXC, ZTRSNA and ZTRSEN. *> \endverbatim * * Arguments: @@ -77,7 +77,7 @@ SUBROUTINE ZERREC( PATH, NUNIT ) * .. * .. Local Arrays .. LOGICAL SEL( NMAX ) - DOUBLE PRECISION RW( LW ), S( NMAX ), SEP( NMAX ) + DOUBLE PRECISION RW( LW ), S( NMAX ), SEP( NMAX ), SWORK( NMAX ) COMPLEX*16 A( NMAX, NMAX ), B( NMAX, NMAX ), $ C( NMAX, NMAX ), WORK( LW ), X( NMAX ) * .. @@ -141,6 +141,43 @@ SUBROUTINE ZERREC( PATH, NUNIT ) CALL CHKXER( 'ZTRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test ZTRSYL3 +* + SRNAMT = 'ZTRSYL3' + INFOT = 1 + CALL ZTRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZTRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 
'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test ZTREXC * SRNAMT = 'ZTREXC' diff --git a/lapack-netlib/TESTING/EIG/zerred.f b/lapack-netlib/TESTING/EIG/zerred.f index d1219c02b9..1876c1f1d7 100644 --- a/lapack-netlib/TESTING/EIG/zerred.f +++ b/lapack-netlib/TESTING/EIG/zerred.f @@ -100,7 +100,7 @@ SUBROUTINE ZERRED( PATH, NUNIT ) * .. * .. External Subroutines .. EXTERNAL CHKXER, ZGEES, ZGEESX, ZGEEV, ZGEEVX, ZGESVJ, - $ ZGESDD, ZGESVD, ZGESVDX, ZGESVQ + $ ZGESDD, ZGESVD, ZGESVDX, ZGESVDQ * .. * .. External Functions .. LOGICAL LSAMEN, ZSLECT diff --git a/lapack-netlib/TESTING/EIG/zget37.f b/lapack-netlib/TESTING/EIG/zget37.f index 63680e8556..5013fbdd9f 100644 --- a/lapack-netlib/TESTING/EIG/zget37.f +++ b/lapack-netlib/TESTING/EIG/zget37.f @@ -265,7 +265,7 @@ SUBROUTINE ZGET37( RMAX, LMAX, NINFO, KNT, NIN ) 100 CONTINUE WSRT( KMIN ) = WSRT( I ) WSRT( I ) = VMIN - VCMIN = WTMP( I ) + VCMIN = DBLE( WTMP( I ) ) WTMP( I ) = W( KMIN ) WTMP( KMIN ) = VCMIN VMIN = STMP( KMIN ) diff --git a/lapack-netlib/TESTING/EIG/zhet21.f b/lapack-netlib/TESTING/EIG/zhet21.f index d254f85e9c..b927a502fa 100644 --- a/lapack-netlib/TESTING/EIG/zhet21.f +++ b/lapack-netlib/TESTING/EIG/zhet21.f @@ -304,9 +304,9 @@ SUBROUTINE ZHET21( ITYPE, UPLO, N, KBAND, A, LDA, D, E, U, LDU, V, 10 CONTINUE * IF( N.GT.1 .AND. 
KBAND.EQ.1 ) THEN - DO 20 J = 2, N - 1 + DO 20 J = 1, N - 1 CALL ZHER2( CUPLO, N, -DCMPLX( E( J ) ), U( 1, J ), 1, - $ U( 1, J-1 ), 1, WORK, N ) + $ U( 1, J+1 ), 1, WORK, N ) 20 CONTINUE END IF WNORM = ZLANHE( '1', CUPLO, N, WORK, N, RWORK ) diff --git a/lapack-netlib/TESTING/EIG/zsyl01.f b/lapack-netlib/TESTING/EIG/zsyl01.f new file mode 100644 index 0000000000..329f39dc4f --- /dev/null +++ b/lapack-netlib/TESTING/EIG/zsyl01.f @@ -0,0 +1,294 @@ +*> \brief \b ZSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE ZSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* DOUBLE PRECISION RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZSYL01 tests ZTRSYL and ZTRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> where op(A) and op(B) are both upper triangular form, op() represents an +*> optional conjugate transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements ZGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. 
of times residual ZTRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual ZTRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. of times ZTRSYL3 and ZTRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is DOUBLE PRECISION array, dimension (2) +*> RMAX(1) = Value of the largest test ratio of ZTRSYL +*> RMAX(2) = Value of the largest test ratio of ZTRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times ZTRSYL returns an expected INFO +*> NINFO(2) = No. of times ZTRSYL3 returns an expected INFO +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE ZSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + DOUBLE PRECISION RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D0, 0.0D+0 ) ) + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 185, MAXN = 192, LDSWORK = 36 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, M, N + DOUBLE PRECISION ANRM, BNRM, BIGNUM, EPS, RES, RES1, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM + COMPLEX*16 RMUL +* .. +* .. Local Arrays .. 
+ COMPLEX*16 A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MAX( MAXM, MAXN ) ) + DOUBLE PRECISION SWORK( LDSWORK, 103 ), DUM( MAXN ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) +* .. +* .. External Functions .. + LOGICAL DISNAN + DOUBLE PRECISION DLAMCH, ZLANGE + EXTERNAL DISNAN, DLAMCH, ZLANGE +* .. +* .. External Subroutines .. + EXTERNAL ZLATMR, ZLACPY, ZGEMM, ZTRSYL, ZTRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, MAX, SQRT +* .. +* .. Executable Statements .. +* +* Get machine parameters +* + EPS = DLAMCH( 'P' ) + SMLNUM = DLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* +* Expect INFO = 0 + VM( 1 ) = ONE +* Expect INFO = 1 + VM( 2 ) = 0.05D+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + SCALE = ONE + SCALE3 = ONE + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + DO M = 32, MAXM, 51 + KLA = 0 + KUA = M - 1 + CALL ZLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, + $ IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = ZLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 47 + KLB = 0 + KUB = N - 1 + CALL ZLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, + $ IINFO ) + DO I = 1, N + B( I, I ) = B( I, I ) * VM ( J ) + END DO + BNRM = ZLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL ZLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, 
IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) + $ TRANA = 'N' + IF( ITRANA.EQ.2 ) + $ TRANA = 'C' + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) + $ TRANB = 'N' + IF( ITRANB.EQ.2 ) + $ TRANB = 'C' + KNT = KNT + 1 +* + CALL ZLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL ZLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL ZTRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = ZLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL ZGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ CC, MAXM ) + CALL ZGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = ZLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL ZLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL ZLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL ZTRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = ZLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL ZGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL ZGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = ZLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. 
+     $                   IINFO.NE.INFO ) THEN
+                        NFAIL( 3 ) = NFAIL( 3 ) + 1
+                     END IF
+                     IF( RES.GT.THRESH .OR. DISNAN( RES ) )
+     $                  NFAIL( 2 ) = NFAIL( 2 ) + 1
+                     IF( RES.GT.RMAX( 2 ) )
+     $                  RMAX( 2 ) = RES
+                  END DO
+               END DO
+            END DO
+         END DO
+      END DO
+      END DO
+*
+      RETURN
+*
+*     End of ZSYL01
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/alahd.f b/lapack-netlib/TESTING/LIN/alahd.f
index 2cc0fba063..f0423a23b9 100644
--- a/lapack-netlib/TESTING/LIN/alahd.f
+++ b/lapack-netlib/TESTING/LIN/alahd.f
@@ -608,17 +608,18 @@ SUBROUTINE ALAHD( IOUNIT, PATH )
       ELSE IF( LSAMEN( 2, P2, 'LS' ) ) THEN
 *
 *        LS:  Least Squares driver routines for
-*            LS, LSD, LSS, LSX and LSY.
+*            LS, LST, TSLS, LSD, LSS, LSX and LSY.
 *
          WRITE( IOUNIT, FMT = 9984 )PATH
          WRITE( IOUNIT, FMT = 9967 )
-         WRITE( IOUNIT, FMT = 9921 )C1, C1, C1, C1
+         WRITE( IOUNIT, FMT = 9921 )C1, C1, C1, C1, C1, C1
          WRITE( IOUNIT, FMT = 9935 )1
          WRITE( IOUNIT, FMT = 9931 )2
-         WRITE( IOUNIT, FMT = 9933 )3
-         WRITE( IOUNIT, FMT = 9935 )4
-         WRITE( IOUNIT, FMT = 9934 )5
-         WRITE( IOUNIT, FMT = 9932 )6
+         WRITE( IOUNIT, FMT = 9919 )
+         WRITE( IOUNIT, FMT = 9933 )7
+         WRITE( IOUNIT, FMT = 9935 )8
+         WRITE( IOUNIT, FMT = 9934 )9
+         WRITE( IOUNIT, FMT = 9932 )10
          WRITE( IOUNIT, FMT = 9920 )
          WRITE( IOUNIT, FMT = '( '' Messages:'' )' )
 *
@@ -1048,10 +1049,11 @@ SUBROUTINE ALAHD( IOUNIT, PATH )
      $      'check if X is in the row space of A or A'' ',
      $      '(overdetermined case)' )
  9929 FORMAT( ' Test ratios (1-3: ', A1, 'TZRZF):' )
- 9920 FORMAT( 3X, ' 7-10: same as 3-6', 3X, ' 11-14: same as 3-6' )
- 9921 FORMAT( ' Test ratios:', / ' (1-2: ', A1, 'GELS, 3-6: ', A1,
-     $      'GELSY, 7-10: ', A1, 'GELSS, 11-14: ', A1, 'GELSD, 15-16: ',
-     $      A1, 'GETSLS)')
+ 9919 FORMAT( 3X, ' 3-4: same as 1-2', 3X, ' 5-6: same as 1-2' )
+ 9920 FORMAT( 3X, ' 11-14: same as 7-10', 3X, ' 15-18: same as 7-10' )
+ 9921 FORMAT( ' Test ratios:', / ' (1-2: ', A1, 'GELS, 3-4: ', A1,
+     $      'GELST, 5-6: ', A1, 'GETSLS, 7-10: ', A1, 'GELSY, 11-14: ',
+     $      A1, 'GELSS, 15-18: ', A1, 'GELSD)' )
  9928 FORMAT( 7X, 'where ALPHA
= ( 1 + SQRT( 17 ) ) / 8' ) 9927 FORMAT( 3X, I2, ': ABS( Largest element in L )', / 12X, $ ' - ( 1 / ( 1 - ALPHA ) ) + THRESH' ) diff --git a/lapack-netlib/TESTING/LIN/cchkpt.f b/lapack-netlib/TESTING/LIN/cchkpt.f index 2ec8020646..7dc367eebf 100644 --- a/lapack-netlib/TESTING/LIN/cchkpt.f +++ b/lapack-netlib/TESTING/LIN/cchkpt.f @@ -319,15 +319,15 @@ SUBROUTINE CCHKPT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, * elements. * IF( IZERO.EQ.1 ) THEN - D( 1 ) = Z( 2 ) + D( 1 ) = REAL( Z( 2 ) ) IF( N.GT.1 ) $ E( 1 ) = Z( 3 ) ELSE IF( IZERO.EQ.N ) THEN E( N-1 ) = Z( 1 ) - D( N ) = Z( 2 ) + D( N ) = REAL( Z( 2 ) ) ELSE E( IZERO-1 ) = Z( 1 ) - D( IZERO ) = Z( 2 ) + D( IZERO ) = REAL( Z( 2 ) ) E( IZERO ) = Z( 3 ) END IF END IF diff --git a/lapack-netlib/TESTING/LIN/cchktr.f b/lapack-netlib/TESTING/LIN/cchktr.f index ce1ecf7615..4b09361d8b 100644 --- a/lapack-netlib/TESTING/LIN/cchktr.f +++ b/lapack-netlib/TESTING/LIN/cchktr.f @@ -31,7 +31,7 @@ *> *> \verbatim *> -*> CCHKTR tests CTRTRI, -TRS, -RFS, and -CON, and CLATRS +*> CCHKTR tests CTRTRI, -TRS, -RFS, and -CON, and CLATRS(3) *> \endverbatim * * Arguments: @@ -184,7 +184,7 @@ SUBROUTINE CCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) REAL ONE, ZERO @@ -195,13 +195,13 @@ SUBROUTINE CCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - REAL AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + REAL AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, SLAMCH * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - REAL RESULT( NTESTS ) + REAL RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. 
       LOGICAL            LSAME
@@ -210,9 +210,9 @@ SUBROUTINE CCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL,
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ALAERH, ALAHD, ALASUM, CCOPY, CERRTR, CGET04,
-     $                   CLACPY, CLARHS, CLATRS, CLATTR, CTRCON, CTRRFS,
-     $                   CTRT01, CTRT02, CTRT03, CTRT05, CTRT06, CTRTRI,
-     $                   CTRTRS, XLAENV
+     $                   CLACPY, CLARHS, CLATRS, CLATRS3, CLATTR,
+     $                   CSSCAL, CTRCON, CTRRFS, CTRT01, CTRT02, CTRT03,
+     $                   CTRT05, CTRT06, CTRTRI, CTRTRS, XLAENV, SLAMCH
 *     ..
 *     .. Scalars in Common ..
       LOGICAL            LERR, OK
@@ -236,6 +236,7 @@ SUBROUTINE CCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL,
 *
       PATH( 1: 1 ) = 'Complex precision'
       PATH( 2: 3 ) = 'TR'
+      BIGNUM = SLAMCH('Overflow') / SLAMCH('Precision')
       NRUN = 0
       NFAIL = 0
       NERRS = 0
@@ -380,7 +381,7 @@ SUBROUTINE CCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL,
 *                    This line is needed on a Sun SPARCstation.
 *
                      IF( N.GT.0 )
-     $                  DUMMY = A( 1 )
+     $                  DUMMY = REAL( A( 1 ) )
 *
                      CALL CTRT02( UPLO, TRANS, DIAG, N, NRHS, A, LDA,
      $                            X, LDA, B, LDA, WORK, RWORK,
@@ -535,6 +536,32 @@ SUBROUTINE CCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL,
      $                      RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK,
      $                      RESULT( 9 ) )
 *
+*+    TEST 10
+*                 Solve op(A)*X = B.
+*
+               SRNAMT = 'CLATRS3'
+               CALL CCOPY( N, X, 1, B, 1 )
+               CALL CCOPY( N, X, 1, B( N+1 ), 1 )
+               CALL CSSCAL( N, BIGNUM, B( N+1 ), 1 )
+               CALL CLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA,
+     $                       B, MAX(1, N), SCALE3, RWORK, WORK, NMAX,
+     $                       INFO )
+*
+*              Check error code from CLATRS3.
+*
+               IF( INFO.NE.0 )
+     $            CALL ALAERH( PATH, 'CLATRS3', INFO, 0,
+     $                         UPLO // TRANS // DIAG // 'N', N, N,
+     $                         -1, -1, -1, IMAT, NFAIL, NERRS, NOUT )
+               CALL CTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA,
+     $                      SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA,
+     $                      X, LDA, WORK, RESULT( 10 ) )
+               CALL CSSCAL( N, BIGNUM, X, 1 )
+               CALL CTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA,
+     $                      SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA,
+     $                      X, LDA, WORK, RES )
+               RESULT( 10 ) = MAX( RESULT( 10 ), RES )
+*
 *              Print information about the tests that did not pass
 *              the threshold.
* @@ -552,7 +579,14 @@ SUBROUTINE CCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'CLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE diff --git a/lapack-netlib/TESTING/LIN/cdrvgt.f b/lapack-netlib/TESTING/LIN/cdrvgt.f index 8d43f640fe..acfbbcfa13 100644 --- a/lapack-netlib/TESTING/LIN/cdrvgt.f +++ b/lapack-netlib/TESTING/LIN/cdrvgt.f @@ -307,16 +307,16 @@ SUBROUTINE CDRVGT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, A, AF, IZERO = 0 ELSE IF( IMAT.EQ.8 ) THEN IZERO = 1 - Z( 2 ) = A( N ) + Z( 2 ) = REAL( A( N ) ) A( N ) = ZERO IF( N.GT.1 ) THEN - Z( 3 ) = A( 1 ) + Z( 3 ) = REAL( A( 1 ) ) A( 1 ) = ZERO END IF ELSE IF( IMAT.EQ.9 ) THEN IZERO = N - Z( 1 ) = A( 3*N-2 ) - Z( 2 ) = A( 2*N-1 ) + Z( 1 ) = REAL( A( 3*N-2 ) ) + Z( 2 ) = REAL( A( 2*N-1 ) ) A( 3*N-2 ) = ZERO A( 2*N-1 ) = ZERO ELSE diff --git a/lapack-netlib/TESTING/LIN/cdrvls.f b/lapack-netlib/TESTING/LIN/cdrvls.f index 7fe189e5fd..ecba705d5f 100644 --- a/lapack-netlib/TESTING/LIN/cdrvls.f +++ b/lapack-netlib/TESTING/LIN/cdrvls.f @@ -31,7 +31,8 @@ *> *> \verbatim *> -*> CDRVLS tests the least squares driver routines CGELS, CGETSLS, CGELSS, CGELSY +*> CDRVLS tests the least squares driver routines CGELS, CGELST, +*> CGETSLS, CGELSS, CGELSY *> and CGELSD. *> \endverbatim * @@ -211,7 +212,7 @@ SUBROUTINE CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * * .. Parameters .. 
INTEGER NTESTS - PARAMETER ( NTESTS = 16 ) + PARAMETER ( NTESTS = 18 ) INTEGER SMLSIZ PARAMETER ( SMLSIZ = 25 ) REAL ONE, ZERO @@ -228,8 +229,8 @@ SUBROUTINE CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, $ LWLSY, LWORK, M, MNMIN, N, NB, NCOLS, NERRS, $ NFAIL, NRHS, NROWS, NRUN, RANK, MB, $ MMAX, NMAX, NSMAX, LIWORK, LRWORK, - $ LWORK_CGELS, LWORK_CGETSLS, LWORK_CGELSS, - $ LWORK_CGELSY, LWORK_CGELSD, + $ LWORK_CGELS, LWORK_CGELST, LWORK_CGETSLS, + $ LWORK_CGELSS, LWORK_CGELSY, LWORK_CGELSD, $ LRWORK_CGELSY, LRWORK_CGELSS, LRWORK_CGELSD REAL EPS, NORMA, NORMB, RCOND * .. @@ -249,7 +250,7 @@ SUBROUTINE CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASVM, CERRLS, CGELS, CGELSD, - $ CGELSS, CGELSY, CGEMM, CGETSLS, CLACPY, + $ CGELSS, CGELST, CGELSY, CGEMM, CGETSLS, CLACPY, $ CLARNV, CQRT13, CQRT15, CQRT16, CSSCAL, $ SAXPY, XLAENV * .. @@ -334,7 +335,8 @@ SUBROUTINE CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, LIWORK = 1 * * Iterate through all test cases and compute necessary workspace -* sizes for ?GELS, ?GETSLS, ?GELSY, ?GELSS and ?GELSD routines. +* sizes for ?GELS, ?GELST, ?GETSLS, ?GELSY, ?GELSS and ?GELSD +* routines. 
* DO IM = 1, NM M = MVAL( IM ) @@ -361,6 +363,10 @@ SUBROUTINE CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, CALL CGELS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) LWORK_CGELS = INT( WQ( 1 ) ) +* Compute workspace needed for CGELST + CALL CGELST( TRANS, M, N, NRHS, A, LDA, + $ B, LDB, WQ, -1, INFO ) + LWORK_CGELST = INT ( WQ ( 1 ) ) * Compute workspace needed for CGETSLS CALL CGETSLS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) @@ -425,21 +431,26 @@ SUBROUTINE CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, ITYPE = ( IRANK-1 )*3 + ISCALE IF( .NOT.DOTYPE( ITYPE ) ) $ GO TO 100 -* +* ===================================================== +* Begin test CGELS +* ===================================================== IF( IRANK.EQ.1 ) THEN * -* Test CGELS -* * Generate a matrix of scaling type ISCALE * CALL CQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 40 INB = 1, NNB +* +* Loop for testing different block sizes. +* + DO INB = 1, NNB NB = NBVAL( INB ) CALL XLAENV( 1, NB ) CALL XLAENV( 3, NXVAL( INB ) ) * - DO 30 ITRAN = 1, 2 +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -484,15 +495,20 @@ SUBROUTINE CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 1: Check correctness of results +* for CGELS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL CLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL CQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, RWORK, $ RESULT( 1 ) ) +* +* Test 2: Check correctness of results +* for CGELS. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN @@ -515,7 +531,7 @@ SUBROUTINE CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * Print information about the tests that * did not pass the threshold. 
* - DO 20 K = 1, 2 + DO K = 1, 2 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) @@ -524,26 +540,34 @@ SUBROUTINE CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 20 CONTINUE + END DO NRUN = NRUN + 2 - 30 CONTINUE - 40 CONTINUE -* -* -* Test CGETSLS + END DO + END DO + END IF +* ===================================================== +* End test CGELS +* ===================================================== +* ===================================================== +* Begin test CGELST +* ===================================================== + IF( IRANK.EQ.1 ) THEN * * Generate a matrix of scaling type ISCALE * CALL CQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 65 INB = 1, NNB - MB = NBVAL( INB ) - CALL XLAENV( 1, MB ) - DO 62 IMB = 1, NNB - NB = NBVAL( IMB ) - CALL XLAENV( 2, NB ) * - DO 60 ITRAN = 1, 2 +* Loop for testing different block sizes. +* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) + CALL XLAENV( 3, NXVAL( INB ) ) +* +* Loop for testing non-transposed and transposed. 
+* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -560,9 +584,9 @@ SUBROUTINE CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, IF( NCOLS.GT.0 ) THEN CALL CLARNV( 2, ISEED, NCOLS*NRHS, $ WORK ) - CALL CSCAL( NCOLS*NRHS, - $ CONE / REAL( NCOLS ), WORK, - $ 1 ) + CALL CSSCAL( NCOLS*NRHS, + $ ONE / REAL( NCOLS ), WORK, + $ 1 ) END IF CALL CGEMM( TRANS, 'No transpose', NROWS, $ NRHS, NCOLS, CONE, COPYA, LDA, @@ -578,31 +602,37 @@ SUBROUTINE CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, CALL CLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, B, LDB ) END IF - SRNAMT = 'CGETSLS ' - CALL CGETSLS( TRANS, M, N, NRHS, A, - $ LDA, B, LDB, WORK, LWORK, INFO ) + SRNAMT = 'CGELST' + CALL CGELST( TRANS, M, N, NRHS, A, LDA, B, + $ LDB, WORK, LWORK, INFO ) +* IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'CGETSLS ', INFO, 0, + $ CALL ALAERH( PATH, 'CGELST', INFO, 0, $ TRANS, M, N, NRHS, -1, NB, $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 3: Check correctness of results +* for CGELST, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL CLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL CQRT16( TRANS, M, N, NRHS, COPYA, - $ LDA, B, LDB, C, LDB, WORK2, - $ RESULT( 15 ) ) + $ LDA, B, LDB, C, LDB, RWORK, + $ RESULT( 3 ) ) +* +* Test 4: Check correctness of results +* for CGELST. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. 
M.LT.N ) ) THEN * * Solving LS system * - RESULT( 16 ) = CQRT17( TRANS, 1, M, N, + RESULT( 4 ) = CQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, $ LWORK ) @@ -610,7 +640,7 @@ SUBROUTINE CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * * Solving overdetermined system * - RESULT( 16 ) = CQRT14( TRANS, M, N, + RESULT( 4 ) = CQRT14( TRANS, M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) END IF @@ -618,21 +648,151 @@ SUBROUTINE CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * Print information about the tests that * did not pass the threshold. * - DO 50 K = 15, 16 + DO K = 3, 4 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) - WRITE( NOUT, FMT = 9997 )TRANS, M, - $ N, NRHS, MB, NB, ITYPE, K, + WRITE( NOUT, FMT = 9999 )TRANS, M, + $ N, NRHS, NB, ITYPE, K, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 50 CONTINUE + END DO NRUN = NRUN + 2 - 60 CONTINUE - 62 CONTINUE - 65 CONTINUE + END DO + END DO + END IF +* ===================================================== +* End test CGELST +* ===================================================== +* ===================================================== +* Begin test CGELSTSLS +* ===================================================== + IF( IRANK.EQ.1 ) THEN +* +* Generate a matrix of scaling type ISCALE +* + CALL CQRT13( ISCALE, M, N, COPYA, LDA, NORMA, + $ ISEED ) +* +* Loop for testing different block sizes MB. +* + DO INB = 1, NNB + MB = NBVAL( INB ) + CALL XLAENV( 1, MB ) +* +* Loop for testing different block sizes NB. +* + DO IMB = 1, NNB + NB = NBVAL( IMB ) + CALL XLAENV( 2, NB ) +* +* Loop for testing non-transposed +* and transposed. 
+* + DO ITRAN = 1, 2 + IF( ITRAN.EQ.1 ) THEN + TRANS = 'N' + NROWS = M + NCOLS = N + ELSE + TRANS = 'C' + NROWS = N + NCOLS = M + END IF + LDWORK = MAX( 1, NCOLS ) +* +* Set up a consistent rhs +* + IF( NCOLS.GT.0 ) THEN + CALL CLARNV( 2, ISEED, NCOLS*NRHS, + $ WORK ) + CALL CSCAL( NCOLS*NRHS, + $ CONE / REAL( NCOLS ), + $ WORK, 1 ) + END IF + CALL CGEMM( TRANS, 'No transpose', + $ NROWS, NRHS, NCOLS, CONE, + $ COPYA, LDA, WORK, LDWORK, + $ CZERO, B, LDB ) + CALL CLACPY( 'Full', NROWS, NRHS, + $ B, LDB, COPYB, LDB ) +* +* Solve LS or overdetermined system +* + IF( M.GT.0 .AND. N.GT.0 ) THEN + CALL CLACPY( 'Full', M, N, + $ COPYA, LDA, A, LDA ) + CALL CLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, B, LDB ) + END IF + SRNAMT = 'CGETSLS ' + CALL CGETSLS( TRANS, M, N, NRHS, A, + $ LDA, B, LDB, WORK, LWORK, + $ INFO ) + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'CGETSLS ', INFO, + $ 0, TRANS, M, N, NRHS, + $ -1, NB, ITYPE, NFAIL, + $ NERRS, NOUT ) +* +* Test 5: Check correctness of results +* for CGETSLS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) +* + IF( NROWS.GT.0 .AND. NRHS.GT.0 ) + $ CALL CLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, C, LDB ) + CALL CQRT16( TRANS, M, N, NRHS, + $ COPYA, LDA, B, LDB, + $ C, LDB, WORK2, + $ RESULT( 5 ) ) +* +* Test 6: Check correctness of results +* for CGETSLS. +* + IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. + $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN +* +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) +* + RESULT( 6 ) = CQRT17( TRANS, 1, M, + $ N, NRHS, COPYA, LDA, + $ B, LDB, COPYB, LDB, + $ C, WORK, LWORK ) + ELSE +* +* Solving overdetermined system +* + RESULT( 6 ) = CQRT14( TRANS, M, N, + $ NRHS, COPYA, LDA, B, + $ LDB, WORK, LWORK ) + END IF +* +* Print information about the tests that +* did not pass the threshold. +* + DO K = 5, 6 + IF( RESULT( K ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9997 )TRANS, + $ M, N, NRHS, MB, NB, ITYPE, K, + $ RESULT( K ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 + END DO + END DO + END DO END IF +* ===================================================== +* End test CGELSTSLS +* ==================================================== * * Generate a matrix of scaling type ISCALE and rank * type IRANK. @@ -680,37 +840,37 @@ SUBROUTINE CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * * workspace used: 2*MNMIN+NB*NB+NB*MAX(N,NRHS) * -* Test 3: Compute relative error in svd +* Test 7: Compute relative error in svd * workspace: M*N + 4*MIN(M,N) + MAX(M,N) * - RESULT( 3 ) = CQRT12( CRANK, CRANK, A, LDA, + RESULT( 7 ) = CQRT12( CRANK, CRANK, A, LDA, $ COPYS, WORK, LWORK, RWORK ) * -* Test 4: Compute error in solution +* Test 8: Compute error in solution * workspace: M*NRHS + M * CALL CLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL CQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 4 ) ) + $ RESULT( 8 ) ) * -* Test 5: Check norm of r'*A +* Test 9: Check norm of r'*A * workspace: NRHS*(M+N) * - RESULT( 5 ) = ZERO + RESULT( 9 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 5 ) = CQRT17( 'No transpose', 1, M, + $ RESULT( 9 ) = CQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 6: Check if x is in the rowspace of A +* Test 10: Check if x is in the rowspace of A * workspace: (M+NRHS)*(N+2) * - RESULT( 6 ) = ZERO + RESULT( 10 ) = ZERO * IF( N.GT.CRANK ) - $ RESULT( 6 ) = CQRT14( 'No transpose', M, N, + $ RESULT( 10 ) = CQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -736,38 +896,38 @@ SUBROUTINE CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * workspace used: 3*min(m,n) + * max(2*min(m,n),nrhs,max(m,n)) * -* Test 7: Compute relative error in svd +* Test 11: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL SAXPY( MNMIN, 
-ONE, COPYS, 1, S, 1 ) - RESULT( 7 ) = SASUM( MNMIN, S, 1 ) / + RESULT( 11 ) = SASUM( MNMIN, S, 1 ) / $ SASUM( MNMIN, COPYS, 1 ) / $ ( EPS*REAL( MNMIN ) ) ELSE - RESULT( 7 ) = ZERO + RESULT( 11 ) = ZERO END IF * -* Test 8: Compute error in solution +* Test 12: Compute error in solution * CALL CLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL CQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 8 ) ) + $ RESULT( 12 ) ) * -* Test 9: Check norm of r'*A +* Test 13: Check norm of r'*A * - RESULT( 9 ) = ZERO + RESULT( 13 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 9 ) = CQRT17( 'No transpose', 1, M, + $ RESULT( 13 ) = CQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 10: Check if x is in the rowspace of A +* Test 14: Check if x is in the rowspace of A * - RESULT( 10 ) = ZERO + RESULT( 14 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 10 ) = CQRT14( 'No transpose', M, N, + $ RESULT( 14 ) = CQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -792,45 +952,45 @@ SUBROUTINE CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 11: Compute relative error in svd +* Test 15: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL SAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 11 ) = SASUM( MNMIN, S, 1 ) / + RESULT( 15 ) = SASUM( MNMIN, S, 1 ) / $ SASUM( MNMIN, COPYS, 1 ) / $ ( EPS*REAL( MNMIN ) ) ELSE - RESULT( 11 ) = ZERO + RESULT( 15 ) = ZERO END IF * -* Test 12: Compute error in solution +* Test 16: Compute error in solution * CALL CLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL CQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 12 ) ) + $ RESULT( 16 ) ) * -* Test 13: Check norm of r'*A +* Test 17: Check norm of r'*A * - RESULT( 13 ) = ZERO + RESULT( 17 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 13 ) = CQRT17( 'No transpose', 1, M, + $ RESULT( 17 ) = CQRT17( 'No 
transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 14: Check if x is in the rowspace of A +* Test 18: Check if x is in the rowspace of A * - RESULT( 14 ) = ZERO + RESULT( 18 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 14 ) = CQRT14( 'No transpose', M, N, + $ RESULT( 18 ) = CQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * * Print information about the tests that did not * pass the threshold. * - DO 80 K = 3, 14 + DO 80 K = 7, 18 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) diff --git a/lapack-netlib/TESTING/LIN/cdrvrf3.f b/lapack-netlib/TESTING/LIN/cdrvrf3.f index 1ca816979a..d0edf75e10 100644 --- a/lapack-netlib/TESTING/LIN/cdrvrf3.f +++ b/lapack-netlib/TESTING/LIN/cdrvrf3.f @@ -156,9 +156,10 @@ SUBROUTINE CDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, REAL RESULT( NTESTS ) * .. * .. External Functions .. + LOGICAL LSAME REAL SLAMCH, CLANGE COMPLEX CLARND - EXTERNAL SLAMCH, CLARND, CLANGE + EXTERNAL SLAMCH, CLARND, CLANGE, LSAME * .. * .. External Subroutines .. EXTERNAL CTRTTF, CGEQRF, CGEQLF, CTFSM, CTRSM @@ -222,9 +223,9 @@ SUBROUTINE CDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, * DO 100 IALPHA = 1, 3 * - IF ( IALPHA.EQ. 1) THEN + IF ( IALPHA.EQ.1 ) THEN ALPHA = ZERO - ELSE IF ( IALPHA.EQ. 
2) THEN + ELSE IF ( IALPHA.EQ.2 ) THEN ALPHA = ONE ELSE ALPHA = CLARND( 4, ISEED ) @@ -263,7 +264,7 @@ SUBROUTINE CDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, * DO J = 1, NA DO I = 1, NA - A( I, J) = CLARND( 4, ISEED ) + A( I, J ) = CLARND( 4, ISEED ) END DO END DO * @@ -276,6 +277,20 @@ SUBROUTINE CDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, CALL CGEQRF( NA, NA, A, LDA, TAU, + C_WORK_CGEQRF, LDA, + INFO ) +* +* Forcing main diagonal of test matrix to +* be unit makes it ill-conditioned for +* some test cases +* + IF ( LSAME( DIAG, 'U' ) ) THEN + DO J = 1, NA + DO I = 1, J + A( I, J ) = A( I, J ) / + + ( 2.0 * A( J, J ) ) + END DO + END DO + END IF +* ELSE * * The case IUPLO.EQ.2 is when SIDE.EQ.'L' @@ -285,6 +300,20 @@ SUBROUTINE CDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, CALL CGELQF( NA, NA, A, LDA, TAU, + C_WORK_CGEQRF, LDA, + INFO ) +* +* Forcing main diagonal of test matrix to +* be unit makes it ill-conditioned for +* some test cases +* + IF ( LSAME( DIAG, 'U' ) ) THEN + DO I = 1, NA + DO J = 1, I + A( I, J ) = A( I, J ) / + + ( 2.0 * A( I, I ) ) + END DO + END DO + END IF +* END IF * * After the QR factorization, the diagonal @@ -293,7 +322,8 @@ SUBROUTINE CDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, * value 1.0E+00. * DO J = 1, NA - A( J, J) = A(J,J) * CLARND( 5, ISEED ) + A( J, J ) = A( J, J ) * + + CLARND( 5, ISEED ) END DO * * Store a copy of A in RFP format (in ARF). 
@@ -307,8 +337,8 @@ SUBROUTINE CDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, * DO J = 1, N DO I = 1, M - B1( I, J) = CLARND( 4, ISEED ) - B2( I, J) = B1( I, J) + B1( I, J ) = CLARND( 4, ISEED ) + B2( I, J ) = B1( I, J ) END DO END DO * @@ -331,24 +361,24 @@ SUBROUTINE CDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, * DO J = 1, N DO I = 1, M - B1( I, J) = B2( I, J ) - B1( I, J ) + B1( I, J ) = B2( I, J ) - B1( I, J ) END DO END DO * - RESULT(1) = CLANGE( 'I', M, N, B1, LDA, + RESULT( 1 ) = CLANGE( 'I', M, N, B1, LDA, + S_WORK_CLANGE ) * - RESULT(1) = RESULT(1) / SQRT( EPS ) - + / MAX ( MAX( M, N), 1 ) + RESULT( 1 ) = RESULT( 1 ) / SQRT( EPS ) + + / MAX ( MAX( M, N ), 1 ) * - IF( RESULT(1).GE.THRESH ) THEN + IF( RESULT( 1 ).GE.THRESH ) THEN IF( NFAIL.EQ.0 ) THEN WRITE( NOUT, * ) WRITE( NOUT, FMT = 9999 ) END IF WRITE( NOUT, FMT = 9997 ) 'CTFSM', + CFORM, SIDE, UPLO, TRANS, DIAG, M, - + N, RESULT(1) + + N, RESULT( 1 ) NFAIL = NFAIL + 1 END IF * diff --git a/lapack-netlib/TESTING/LIN/cerrls.f b/lapack-netlib/TESTING/LIN/cerrls.f index 48e44ad863..fca9439181 100644 --- a/lapack-netlib/TESTING/LIN/cerrls.f +++ b/lapack-netlib/TESTING/LIN/cerrls.f @@ -22,7 +22,7 @@ *> \verbatim *> *> CERRLS tests the error exits for the COMPLEX least squares -*> driver routines (CGELS, CGELSS, CGELSY, CGELSD). +*> driver routines (CGELS, CGELST, CGETSLS, CGELSS, CGELSY, CGELSD). *> \endverbatim * * Arguments: @@ -83,7 +83,8 @@ SUBROUTINE CERRLS( PATH, NUNIT ) EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CGELS, CGELSD, CGELSS, CGELSY, CHKXER + EXTERNAL ALAESM, CHKXER, CGELS, CGELSD, CGELSS, CGELST, + $ CGELSY, CGETSLS * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -130,10 +131,66 @@ SUBROUTINE CERRLS( PATH, NUNIT ) INFOT = 8 CALL CGELS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) CALL CHKXER( 'CGELS ', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGELS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'CGELS', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGELS( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) CALL CHKXER( 'CGELS ', INFOT, NOUT, LERR, OK ) * +* CGELST +* + SRNAMT = 'CGELST' + INFOT = 1 + CALL CGELST( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGELST( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGELST( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGELST( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CGELST( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGELST( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGELST( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGELST( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) +* +* CGETSLS +* + SRNAMT = 'CGETSLS' + INFOT = 1 + CALL CGETSLS( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGETSLS( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGETSLS( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGETSLS( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CGETSLS( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 
+ CALL CGETSLS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGETSLS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) +* * CGELSS * SRNAMT = 'CGELSS' diff --git a/lapack-netlib/TESTING/LIN/cerrtr.f b/lapack-netlib/TESTING/LIN/cerrtr.f index db65edd881..9ba784f62a 100644 --- a/lapack-netlib/TESTING/LIN/cerrtr.f +++ b/lapack-netlib/TESTING/LIN/cerrtr.f @@ -82,9 +82,10 @@ SUBROUTINE CERRTR( PATH, NUNIT ) EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, CLATBS, CLATPS, CLATRS, CTBCON, - $ CTBRFS, CTBTRS, CTPCON, CTPRFS, CTPTRI, CTPTRS, - $ CTRCON, CTRRFS, CTRTI2, CTRTRI, CTRTRS + EXTERNAL ALAESM, CHKXER, CLATBS, CLATPS, CLATRS, + $ CLATRS3, CTBCON, CTBRFS, CTBTRS, CTPCON, + $ CTPRFS, CTPTRI, CTPTRS, CTRCON, CTRRFS, CTRTI2, + $ CTRTRI, CTRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -240,6 +241,46 @@ SUBROUTINE CERRTR( PATH, NUNIT ) CALL CLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, RW, INFO ) CALL CHKXER( 'CLATRS', INFOT, NOUT, LERR, OK ) * +* CLATRS3 +* + SRNAMT = 'CLATRS3' + INFOT = 1 + CALL CLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 
'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL CLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 0, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) +* * Test error exits for the packed triangular routines. * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN diff --git a/lapack-netlib/TESTING/LIN/clattp.f b/lapack-netlib/TESTING/LIN/clattp.f index 82f0585dfe..a47a252ada 100644 --- a/lapack-netlib/TESTING/LIN/clattp.f +++ b/lapack-netlib/TESTING/LIN/clattp.f @@ -336,7 +336,7 @@ SUBROUTINE CLATTP( IMAT, UPLO, TRANS, DIAG, ISEED, N, AP, B, WORK, WORK( J+1 ) = PLUS2 WORK( N+J+1 ) = ZERO PLUS1 = STAR1 / PLUS2 - REXP = CLARND( 2, ISEED ) + REXP = REAL( CLARND( 2, ISEED ) ) IF( REXP.LT.ZERO ) THEN STAR1 = -SFAC**( ONE-REXP )*CLARND( 5, ISEED ) ELSE @@ -790,7 +790,7 @@ SUBROUTINE CLATTP( IMAT, UPLO, TRANS, DIAG, ISEED, N, AP, B, WORK, DO 460 J = 1, N / 2 JL = JJ DO 450 I = J, N - J - T = AP( JR-I+J ) + T = REAL( AP( JR-I+J ) ) AP( JR-I+J ) = AP( JL ) AP( JL ) = T JL = JL + I @@ -804,7 +804,7 @@ SUBROUTINE CLATTP( IMAT, UPLO, TRANS, DIAG, ISEED, N, AP, B, WORK, DO 480 J = 1, N / 2 JR = JJ DO 470 I = J, N - J - T = AP( JL+I-J ) + T = REAL( AP( JL+I-J ) ) AP( JL+I-J ) = AP( JR ) AP( JR ) = T JR = JR - I diff --git a/lapack-netlib/TESTING/LIN/cpbt01.f b/lapack-netlib/TESTING/LIN/cpbt01.f index 33c80666dc..6145a18756 100644 --- a/lapack-netlib/TESTING/LIN/cpbt01.f +++ b/lapack-netlib/TESTING/LIN/cpbt01.f @@ -201,7 +201,8 @@ SUBROUTINE CPBT01( UPLO, N, KD, A, LDA, AFAC, LDAFAC, RWORK, * * Compute the (K,K) element of the result. 
* - AKK = CDOTC( KLEN+1, AFAC( KC, K ), 1, AFAC( KC, K ), 1 ) + AKK = REAL( + $ CDOTC( KLEN+1, AFAC( KC, K ), 1, AFAC( KC, K ), 1 ) ) AFAC( KD+1, K ) = AKK * * Compute the rest of column K. @@ -228,7 +229,7 @@ SUBROUTINE CPBT01( UPLO, N, KD, A, LDA, AFAC, LDAFAC, RWORK, * * Scale column K by the diagonal element. * - AKK = AFAC( 1, K ) + AKK = REAL( AFAC( 1, K ) ) CALL CSSCAL( KLEN+1, AKK, AFAC( 1, K ), 1 ) * 40 CONTINUE diff --git a/lapack-netlib/TESTING/LIN/cpot01.f b/lapack-netlib/TESTING/LIN/cpot01.f index 00e195dd63..fbcf650862 100644 --- a/lapack-netlib/TESTING/LIN/cpot01.f +++ b/lapack-netlib/TESTING/LIN/cpot01.f @@ -176,7 +176,7 @@ SUBROUTINE CPOT01( UPLO, N, A, LDA, AFAC, LDAFAC, RWORK, RESID ) * * Compute the (K,K) element of the result. * - TR = CDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) + TR = REAL( CDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) ) AFAC( K, K ) = TR * * Compute the rest of column K. @@ -224,7 +224,7 @@ SUBROUTINE CPOT01( UPLO, N, A, LDA, AFAC, LDAFAC, RWORK, RESID ) 70 CONTINUE END IF * -* Compute norm( L*U - A ) / ( N * norm(A) * EPS ) +* Compute norm(L*U - A) / ( N * norm(A) * EPS ) * RESID = CLANHE( '1', UPLO, N, AFAC, LDAFAC, RWORK ) * diff --git a/lapack-netlib/TESTING/LIN/cppt01.f b/lapack-netlib/TESTING/LIN/cppt01.f index 3a761a4c71..f865ec7794 100644 --- a/lapack-netlib/TESTING/LIN/cppt01.f +++ b/lapack-netlib/TESTING/LIN/cppt01.f @@ -178,7 +178,7 @@ SUBROUTINE CPPT01( UPLO, N, A, AFAC, RWORK, RESID ) * * Compute the (K,K) element of the result. * - TR = CDOTC( K, AFAC( KC ), 1, AFAC( KC ), 1 ) + TR = REAL( CDOTC( K, AFAC( KC ), 1, AFAC( KC ), 1 ) ) AFAC( KC+K-1 ) = TR * * Compute the rest of column K. diff --git a/lapack-netlib/TESTING/LIN/cpst01.f b/lapack-netlib/TESTING/LIN/cpst01.f index 26da4b3943..03d25515da 100644 --- a/lapack-netlib/TESTING/LIN/cpst01.f +++ b/lapack-netlib/TESTING/LIN/cpst01.f @@ -219,7 +219,7 @@ SUBROUTINE CPST01( UPLO, N, A, LDA, AFAC, LDAFAC, PERM, LDPERM, * * Compute the (K,K) element of the result. 
* - TR = CDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) + TR = REAL( CDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) ) AFAC( K, K ) = TR * * Compute the rest of column K. diff --git a/lapack-netlib/TESTING/LIN/dchktr.f b/lapack-netlib/TESTING/LIN/dchktr.f index a4a1150c09..57e87326b0 100644 --- a/lapack-netlib/TESTING/LIN/dchktr.f +++ b/lapack-netlib/TESTING/LIN/dchktr.f @@ -30,7 +30,7 @@ *> *> \verbatim *> -*> DCHKTR tests DTRTRI, -TRS, -RFS, and -CON, and DLATRS +*> DCHKTR tests DTRTRI, -TRS, -RFS, and -CON, and DLATRS(3) *> \endverbatim * * Arguments: @@ -187,7 +187,7 @@ SUBROUTINE DCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) DOUBLE PRECISION ONE, ZERO @@ -198,13 +198,13 @@ SUBROUTINE DCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - DOUBLE PRECISION AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + DOUBLE PRECISION AINVNM, ANORM, BIGNUM, DLAMCH, DUMMY, RCOND, + $ RCONDC, RCONDI, RCONDO, RES, SCALE * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - DOUBLE PRECISION RESULT( NTESTS ) + DOUBLE PRECISION RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -213,9 +213,9 @@ SUBROUTINE DCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASUM, DCOPY, DERRTR, DGET04, - $ DLACPY, DLARHS, DLATRS, DLATTR, DTRCON, DTRRFS, - $ DTRT01, DTRT02, DTRT03, DTRT05, DTRT06, DTRTRI, - $ DTRTRS, XLAENV + $ DLACPY, DLAMCH, DSCAL, DLARHS, DLATRS, DLATRS3, + $ DLATTR, DTRCON, DTRRFS, DTRT01, DTRT02, DTRT03, + $ DTRT05, DTRT06, DTRTRI, DTRTRS, XLAENV * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -239,6 +239,7 @@ SUBROUTINE DCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, * PATH( 1: 1 ) = 'Double precision' PATH( 2: 3 ) = 'TR' + BIGNUM = DLAMCH('Overflow') / DLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -539,6 +540,32 @@ SUBROUTINE DCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B +* + SRNAMT = 'DLATRS3' + CALL DCOPY( N, X, 1, B, 1 ) + CALL DCOPY( N, X, 1, B( N+1 ), 1 ) + CALL DSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL DLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from DLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'DLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'N', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) + CALL DTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL DSCAL( N, BIGNUM, X, 1 ) + CALL DTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RES ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -556,7 +583,14 @@ SUBROUTINE DCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'DLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE @@ -569,8 +603,8 @@ SUBROUTINE DCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, 9999 FORMAT( ' UPLO=''', A1, ''', DIAG=''', A1, ''', N=', I5, ', NB=', $ I4, ', type ', I2, ', test(', I2, ')= ', G12.5 ) 9998 FORMAT( ' UPLO=''', A1, ''', TRANS=''', A1, ''', DIAG=''', A1, - $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', - $ test(', I2, ')= ', G12.5 ) + $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', test(', + $ I2, ')= ', G12.5 ) 9997 FORMAT( ' NORM=''', A1, ''', UPLO =''', A1, ''', N=', I5, ',', $ 11X, ' type ', I2, ', test(', I2, ')=', G12.5 ) 9996 FORMAT( 1X, A, '( ''', A1, ''', ''', A1, ''', ''', A1, ''', ''', diff --git a/lapack-netlib/TESTING/LIN/ddrvls.f b/lapack-netlib/TESTING/LIN/ddrvls.f index b64930c10c..b3d07d67f2 100644 --- a/lapack-netlib/TESTING/LIN/ddrvls.f +++ b/lapack-netlib/TESTING/LIN/ddrvls.f @@ -31,8 +31,8 @@ *> *> \verbatim *> -*> DDRVLS tests the least squares driver routines DGELS, DGETSLS, DGELSS, DGELSY, -*> and DGELSD. +*> DDRVLS tests the least squares driver routines DGELS, DGELST, +*> DGETSLS, DGELSS, DGELSY, and DGELSD. *> \endverbatim * * Arguments: @@ -211,7 +211,7 @@ SUBROUTINE DDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * * .. Parameters .. INTEGER NTESTS - PARAMETER ( NTESTS = 16 ) + PARAMETER ( NTESTS = 18 ) INTEGER SMLSIZ PARAMETER ( SMLSIZ = 25 ) DOUBLE PRECISION ONE, TWO, ZERO @@ -225,8 +225,8 @@ SUBROUTINE DDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, $ LWLSY, LWORK, M, MNMIN, N, NB, NCOLS, NERRS, $ NFAIL, NRHS, NROWS, NRUN, RANK, MB, $ MMAX, NMAX, NSMAX, LIWORK, - $ LWORK_DGELS, LWORK_DGETSLS, LWORK_DGELSS, - $ LWORK_DGELSY, LWORK_DGELSD + $ LWORK_DGELS, LWORK_DGELST, LWORK_DGETSLS, + $ LWORK_DGELSS, LWORK_DGELSY, LWORK_DGELSD DOUBLE PRECISION EPS, NORMA, NORMB, RCOND * .. * .. Local Arrays .. 
@@ -243,12 +243,12 @@ SUBROUTINE DDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASVM, DAXPY, DERRLS, DGELS, - $ DGELSD, DGELSS, DGELSY, DGEMM, DLACPY, - $ DLARNV, DLASRT, DQRT13, DQRT15, DQRT16, DSCAL, - $ XLAENV + $ DGELSD, DGELSS, DGELST, DGELSY, DGEMM, + $ DGETSLS, DLACPY, DLARNV, DQRT13, DQRT15, + $ DQRT16, DSCAL, XLAENV * .. * .. Intrinsic Functions .. - INTRINSIC DBLE, INT, LOG, MAX, MIN, SQRT + INTRINSIC DBLE, INT, MAX, MIN, SQRT * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -330,7 +330,8 @@ SUBROUTINE DDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, LIWORK = 1 * * Iterate through all test cases and compute necessary workspace -* sizes for ?GELS, ?GETSLS, ?GELSY, ?GELSS and ?GELSD routines. +* sizes for ?GELS, ?GELST, ?GETSLS, ?GELSY, ?GELSS and ?GELSD +* routines. * DO IM = 1, NM M = MVAL( IM ) @@ -357,6 +358,10 @@ SUBROUTINE DDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, CALL DGELS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) LWORK_DGELS = INT ( WQ ( 1 ) ) +* Compute workspace needed for DGELST + CALL DGELST( TRANS, M, N, NRHS, A, LDA, + $ B, LDB, WQ, -1, INFO ) + LWORK_DGELST = INT ( WQ ( 1 ) ) * Compute workspace needed for DGETSLS CALL DGETSLS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) @@ -378,9 +383,9 @@ SUBROUTINE DDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * Compute LIWORK workspace needed for DGELSY and DGELSD LIWORK = MAX( LIWORK, N, IWQ( 1 ) ) * Compute LWORK workspace needed for all functions - LWORK = MAX( LWORK, LWORK_DGELS, LWORK_DGETSLS, - $ LWORK_DGELSY, LWORK_DGELSS, - $ LWORK_DGELSD ) + LWORK = MAX( LWORK, LWORK_DGELS, LWORK_DGELST, + $ LWORK_DGETSLS, LWORK_DGELSY, + $ LWORK_DGELSS, LWORK_DGELSD ) END IF ENDDO ENDDO @@ -411,21 +416,26 @@ SUBROUTINE DDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, ITYPE = ( IRANK-1 )*3 + ISCALE IF( .NOT.DOTYPE( ITYPE ) ) $ GO TO 110 -* +* ===================================================== +* 
Begin test DGELS +* ===================================================== IF( IRANK.EQ.1 ) THEN * -* Test DGELS -* * Generate a matrix of scaling type ISCALE * CALL DQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 40 INB = 1, NNB +* +* Loop for testing different block sizes. +* + DO INB = 1, NNB NB = NBVAL( INB ) CALL XLAENV( 1, NB ) CALL XLAENV( 3, NXVAL( INB ) ) * - DO 30 ITRAN = 1, 2 +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -469,20 +479,27 @@ SUBROUTINE DDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 1: Check correctness of results +* for DGELS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL DLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL DQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, WORK, $ RESULT( 1 ) ) +* +* Test 2: Check correctness of results +* for DGELS. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN * -* Solving LS system +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) * RESULT( 2 ) = DQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, @@ -500,35 +517,42 @@ SUBROUTINE DDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * Print information about the tests that * did not pass the threshold. * - DO 20 K = 1, 2 + DO K = 1, 2 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) - WRITE( NOUT, FMT = 9999 )TRANS, M, + WRITE( NOUT, FMT = 9999 ) TRANS, M, $ N, NRHS, NB, ITYPE, K, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 20 CONTINUE + END DO NRUN = NRUN + 2 - 30 CONTINUE - 40 CONTINUE -* -* -* Test DGETSLS + END DO + END DO + END IF +* ===================================================== +* End test DGELS +* ===================================================== +* ===================================================== +* Begin test DGELST +* ===================================================== + IF( IRANK.EQ.1 ) THEN * * Generate a matrix of scaling type ISCALE * CALL DQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 65 INB = 1, NNB - MB = NBVAL( INB ) - CALL XLAENV( 1, MB ) - DO 62 IMB = 1, NNB - NB = NBVAL( IMB ) - CALL XLAENV( 2, NB ) * - DO 60 ITRAN = 1, 2 +* Loop for testing different block sizes. +* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) +* +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -563,31 +587,38 @@ SUBROUTINE DDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, CALL DLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, B, LDB ) END IF - SRNAMT = 'DGETSLS ' - CALL DGETSLS( TRANS, M, N, NRHS, A, - $ LDA, B, LDB, WORK, LWORK, INFO ) + SRNAMT = 'DGELST' + CALL DGELST( TRANS, M, N, NRHS, A, LDA, B, + $ LDB, WORK, LWORK, INFO ) IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'DGETSLS ', INFO, 0, + $ CALL ALAERH( PATH, 'DGELST', INFO, 0, $ TRANS, M, N, NRHS, -1, NB, $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 3: Check correctness of results +* for DGELST, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. 
NRHS.GT.0 ) $ CALL DLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL DQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, WORK, - $ RESULT( 15 ) ) + $ RESULT( 3 ) ) +* +* Test 4: Check correctness of results +* for DGELST. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN * -* Solving LS system +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) * - RESULT( 16 ) = DQRT17( TRANS, 1, M, N, + RESULT( 4 ) = DQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, $ LWORK ) @@ -595,7 +626,7 @@ SUBROUTINE DDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * * Solving overdetermined system * - RESULT( 16 ) = DQRT14( TRANS, M, N, + RESULT( 4 ) = DQRT14( TRANS, M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) END IF @@ -603,21 +634,151 @@ SUBROUTINE DDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * Print information about the tests that * did not pass the threshold. * - DO 50 K = 15, 16 + DO K = 3, 4 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) - WRITE( NOUT, FMT = 9997 )TRANS, M, - $ N, NRHS, MB, NB, ITYPE, K, + WRITE( NOUT, FMT = 9999 ) TRANS, M, + $ N, NRHS, NB, ITYPE, K, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 50 CONTINUE + END DO NRUN = NRUN + 2 - 60 CONTINUE - 62 CONTINUE - 65 CONTINUE + END DO + END DO + END IF +* ===================================================== +* End test DGELST +* ===================================================== +* ===================================================== +* Begin test DGETSLS +* ===================================================== + IF( IRANK.EQ.1 ) THEN +* +* Generate a matrix of scaling type ISCALE +* + CALL DQRT13( ISCALE, M, N, COPYA, LDA, NORMA, + $ ISEED ) +* +* Loop for testing different block sizes MB. +* + DO IMB = 1, NNB + MB = NBVAL( IMB ) + CALL XLAENV( 1, MB ) +* +* Loop for testing different block sizes NB. 
+* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 2, NB ) +* +* Loop for testing non-transposed +* and transposed. +* + DO ITRAN = 1, 2 + IF( ITRAN.EQ.1 ) THEN + TRANS = 'N' + NROWS = M + NCOLS = N + ELSE + TRANS = 'T' + NROWS = N + NCOLS = M + END IF + LDWORK = MAX( 1, NCOLS ) +* +* Set up a consistent rhs +* + IF( NCOLS.GT.0 ) THEN + CALL DLARNV( 2, ISEED, NCOLS*NRHS, + $ WORK ) + CALL DSCAL( NCOLS*NRHS, + $ ONE / DBLE( NCOLS ), + $ WORK, 1 ) + END IF + CALL DGEMM( TRANS, 'No transpose', + $ NROWS, NRHS, NCOLS, ONE, + $ COPYA, LDA, WORK, LDWORK, + $ ZERO, B, LDB ) + CALL DLACPY( 'Full', NROWS, NRHS, + $ B, LDB, COPYB, LDB ) +* +* Solve LS or overdetermined system +* + IF( M.GT.0 .AND. N.GT.0 ) THEN + CALL DLACPY( 'Full', M, N, + $ COPYA, LDA, A, LDA ) + CALL DLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, B, LDB ) + END IF + SRNAMT = 'DGETSLS' + CALL DGETSLS( TRANS, M, N, NRHS, + $ A, LDA, B, LDB, WORK, LWORK, + $ INFO ) + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'DGETSLS', INFO, + $ 0, TRANS, M, N, NRHS, + $ -1, NB, ITYPE, NFAIL, + $ NERRS, NOUT ) +* +* Test 5: Check correctness of results +* for DGETSLS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) +* + IF( NROWS.GT.0 .AND. NRHS.GT.0 ) + $ CALL DLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, C, LDB ) + CALL DQRT16( TRANS, M, N, NRHS, + $ COPYA, LDA, B, LDB, + $ C, LDB, WORK, + $ RESULT( 5 ) ) +* +* Test 6: Check correctness of results +* for DGETSLS. +* + IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. + $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN +* +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) +* + RESULT( 6 ) = DQRT17( TRANS, 1, M, + $ N, NRHS, COPYA, LDA, + $ B, LDB, COPYB, LDB, + $ C, WORK, LWORK ) + ELSE +* +* Solving overdetermined system +* + RESULT( 6 ) = DQRT14( TRANS, M, N, + $ NRHS, COPYA, LDA, + $ B, LDB, WORK, LWORK ) + END IF +* +* Print information about the tests that +* did not pass the threshold. 
+* + DO K = 5, 6 + IF( RESULT( K ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9997 ) TRANS, + $ M, N, NRHS, MB, NB, ITYPE, + $ K, RESULT( K ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 + END DO + END DO + END DO END IF +* ===================================================== +* End test DGETSLS +* ===================================================== * * Generate a matrix of scaling type ISCALE and rank * type IRANK. @@ -662,37 +823,37 @@ SUBROUTINE DDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 3: Compute relative error in svd +* Test 7: Compute relative error in svd * workspace: M*N + 4*MIN(M,N) + MAX(M,N) * - RESULT( 3 ) = DQRT12( CRANK, CRANK, A, LDA, + RESULT( 7 ) = DQRT12( CRANK, CRANK, A, LDA, $ COPYS, WORK, LWORK ) * -* Test 4: Compute error in solution +* Test 8: Compute error in solution * workspace: M*NRHS + M * CALL DLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL DQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 4 ) ) + $ WORK( M*NRHS+1 ), RESULT( 8 ) ) * -* Test 5: Check norm of r'*A +* Test 9: Check norm of r'*A * workspace: NRHS*(M+N) * - RESULT( 5 ) = ZERO + RESULT( 9 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 5 ) = DQRT17( 'No transpose', 1, M, + $ RESULT( 9 ) = DQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 6: Check if x is in the rowspace of A +* Test 10: Check if x is in the rowspace of A * workspace: (M+NRHS)*(N+2) * - RESULT( 6 ) = ZERO + RESULT( 10 ) = ZERO * IF( N.GT.CRANK ) - $ RESULT( 6 ) = DQRT14( 'No transpose', M, N, + $ RESULT( 10 ) = DQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -716,38 +877,38 @@ SUBROUTINE DDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * workspace used: 3*min(m,n) + * max(2*min(m,n),nrhs,max(m,n)) * -* Test 7: Compute relative 
error in svd +* Test 11: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL DAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 7 ) = DASUM( MNMIN, S, 1 ) / + RESULT( 11 ) = DASUM( MNMIN, S, 1 ) / $ DASUM( MNMIN, COPYS, 1 ) / $ ( EPS*DBLE( MNMIN ) ) ELSE - RESULT( 7 ) = ZERO + RESULT( 11 ) = ZERO END IF * -* Test 8: Compute error in solution +* Test 12: Compute error in solution * CALL DLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL DQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 8 ) ) + $ WORK( M*NRHS+1 ), RESULT( 12 ) ) * -* Test 9: Check norm of r'*A +* Test 13: Check norm of r'*A * - RESULT( 9 ) = ZERO + RESULT( 13 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 9 ) = DQRT17( 'No transpose', 1, M, + $ RESULT( 13 ) = DQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 10: Check if x is in the rowspace of A +* Test 14: Check if x is in the rowspace of A * - RESULT( 10 ) = ZERO + RESULT( 14 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 10 ) = DQRT14( 'No transpose', M, N, + $ RESULT( 14 ) = DQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -776,45 +937,45 @@ SUBROUTINE DDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 11: Compute relative error in svd +* Test 15: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL DAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 11 ) = DASUM( MNMIN, S, 1 ) / + RESULT( 15 ) = DASUM( MNMIN, S, 1 ) / $ DASUM( MNMIN, COPYS, 1 ) / $ ( EPS*DBLE( MNMIN ) ) ELSE - RESULT( 11 ) = ZERO + RESULT( 15 ) = ZERO END IF * -* Test 12: Compute error in solution +* Test 16: Compute error in solution * CALL DLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL DQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 12 ) ) + $ WORK( M*NRHS+1 ), RESULT( 16 ) ) * -* Test 13: Check norm of r'*A +* Test 17: Check 
norm of r'*A * - RESULT( 13 ) = ZERO + RESULT( 17 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 13 ) = DQRT17( 'No transpose', 1, M, + $ RESULT( 17 ) = DQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 14: Check if x is in the rowspace of A +* Test 18: Check if x is in the rowspace of A * - RESULT( 14 ) = ZERO + RESULT( 18 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 14 ) = DQRT14( 'No transpose', M, N, + $ RESULT( 18 ) = DQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * * Print information about the tests that did not * pass the threshold. * - DO 90 K = 3, 14 + DO 90 K = 7, 18 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) @@ -826,6 +987,12 @@ SUBROUTINE DDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, NRUN = NRUN + 12 * 100 CONTINUE + + + + + + 110 CONTINUE 120 CONTINUE 130 CONTINUE diff --git a/lapack-netlib/TESTING/LIN/ddrvrf3.f b/lapack-netlib/TESTING/LIN/ddrvrf3.f index 1c5d74aea0..ef823c2e75 100644 --- a/lapack-netlib/TESTING/LIN/ddrvrf3.f +++ b/lapack-netlib/TESTING/LIN/ddrvrf3.f @@ -153,8 +153,9 @@ SUBROUTINE DDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, DOUBLE PRECISION RESULT( NTESTS ) * .. * .. External Functions .. + LOGICAL LSAME DOUBLE PRECISION DLAMCH, DLANGE, DLARND - EXTERNAL DLAMCH, DLANGE, DLARND + EXTERNAL DLAMCH, DLANGE, DLARND, LSAME * .. * .. External Subroutines .. EXTERNAL DTRTTF, DGEQRF, DGEQLF, DTFSM, DTRSM @@ -218,9 +219,9 @@ SUBROUTINE DDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, * DO 100 IALPHA = 1, 3 * - IF ( IALPHA.EQ. 1) THEN + IF ( IALPHA.EQ.1 ) THEN ALPHA = ZERO - ELSE IF ( IALPHA.EQ. 
2) THEN + ELSE IF ( IALPHA.EQ.2 ) THEN ALPHA = ONE ELSE ALPHA = DLARND( 2, ISEED ) @@ -259,7 +260,7 @@ SUBROUTINE DDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, * DO J = 1, NA DO I = 1, NA - A( I, J) = DLARND( 2, ISEED ) + A( I, J ) = DLARND( 2, ISEED ) END DO END DO * @@ -272,6 +273,20 @@ SUBROUTINE DDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, CALL DGEQRF( NA, NA, A, LDA, TAU, + D_WORK_DGEQRF, LDA, + INFO ) +* +* Forcing main diagonal of test matrix to +* be unit makes it ill-conditioned for +* some test cases +* + IF ( LSAME( DIAG, 'U' ) ) THEN + DO J = 1, NA + DO I = 1, J + A( I, J ) = A( I, J ) / + + ( 2.0 * A( J, J ) ) + END DO + END DO + END IF +* ELSE * * The case IUPLO.EQ.2 is when SIDE.EQ.'L' @@ -281,6 +296,20 @@ SUBROUTINE DDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, CALL DGELQF( NA, NA, A, LDA, TAU, + D_WORK_DGEQRF, LDA, + INFO ) +* +* Forcing main diagonal of test matrix to +* be unit makes it ill-conditioned for +* some test cases +* + IF ( LSAME( DIAG, 'U' ) ) THEN + DO I = 1, NA + DO J = 1, I + A( I, J ) = A( I, J ) / + + ( 2.0 * A( I, I ) ) + END DO + END DO + END IF +* END IF * * Store a copy of A in RFP format (in ARF). 
@@ -294,8 +323,8 @@ SUBROUTINE DDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, * DO J = 1, N DO I = 1, M - B1( I, J) = DLARND( 2, ISEED ) - B2( I, J) = B1( I, J) + B1( I, J ) = DLARND( 2, ISEED ) + B2( I, J ) = B1( I, J ) END DO END DO * @@ -318,24 +347,24 @@ SUBROUTINE DDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, * DO J = 1, N DO I = 1, M - B1( I, J) = B2( I, J ) - B1( I, J ) + B1( I, J ) = B2( I, J ) - B1( I, J ) END DO END DO * - RESULT(1) = DLANGE( 'I', M, N, B1, LDA, + RESULT( 1 ) = DLANGE( 'I', M, N, B1, LDA, + D_WORK_DLANGE ) * - RESULT(1) = RESULT(1) / SQRT( EPS ) - + / MAX ( MAX( M, N), 1 ) + RESULT( 1 ) = RESULT( 1 ) / SQRT( EPS ) + + / MAX ( MAX( M, N ), 1 ) * - IF( RESULT(1).GE.THRESH ) THEN + IF( RESULT( 1 ).GE.THRESH ) THEN IF( NFAIL.EQ.0 ) THEN WRITE( NOUT, * ) WRITE( NOUT, FMT = 9999 ) END IF WRITE( NOUT, FMT = 9997 ) 'DTFSM', + CFORM, SIDE, UPLO, TRANS, DIAG, M, - + N, RESULT(1) + + N, RESULT( 1 ) NFAIL = NFAIL + 1 END IF * diff --git a/lapack-netlib/TESTING/LIN/derrls.f b/lapack-netlib/TESTING/LIN/derrls.f index a1f74dec23..09d745238e 100644 --- a/lapack-netlib/TESTING/LIN/derrls.f +++ b/lapack-netlib/TESTING/LIN/derrls.f @@ -22,7 +22,7 @@ *> \verbatim *> *> DERRLS tests the error exits for the DOUBLE PRECISION least squares -*> driver routines (DGELS, SGELSS, SGELSY, SGELSD). +*> driver routines (DGELS, DGELST, DGETSLS, SGELSS, SGELSY, SGELSD). *> \endverbatim * * Arguments: @@ -83,7 +83,8 @@ SUBROUTINE DERRLS( PATH, NUNIT ) EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, DGELS, DGELSD, DGELSS, DGELSY + EXTERNAL ALAESM, CHKXER, DGELS, DGELSD, DGELSS, DGELST, + $ DGELSY, DGETSLS * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -130,10 +131,66 @@ SUBROUTINE DERRLS( PATH, NUNIT ) INFOT = 8 CALL DGELS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) CALL CHKXER( 'DGELS ', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGELS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGELS', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL DGELS( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) CALL CHKXER( 'DGELS ', INFOT, NOUT, LERR, OK ) * +* DGELST +* + SRNAMT = 'DGELST' + INFOT = 1 + CALL DGELST( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGELST( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGELST( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DGELST( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DGELST( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGELST( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGELST( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DGELST( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) +* +* DGETSLS +* + SRNAMT = 'DGETSLS' + INFOT = 1 + CALL DGETSLS( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGETSLS( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGETSLS( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DGETSLS( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DGETSLS( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 
+ CALL DGETSLS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGETSLS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) +* * DGELSS * SRNAMT = 'DGELSS' diff --git a/lapack-netlib/TESTING/LIN/derrtr.f b/lapack-netlib/TESTING/LIN/derrtr.f index a667f0d2b8..d0580497da 100644 --- a/lapack-netlib/TESTING/LIN/derrtr.f +++ b/lapack-netlib/TESTING/LIN/derrtr.f @@ -83,9 +83,10 @@ SUBROUTINE DERRTR( PATH, NUNIT ) EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, DLATBS, DLATPS, DLATRS, DTBCON, - $ DTBRFS, DTBTRS, DTPCON, DTPRFS, DTPTRI, DTPTRS, - $ DTRCON, DTRRFS, DTRTI2, DTRTRI, DTRTRS + EXTERNAL ALAESM, CHKXER, DLATBS, DLATPS, DLATRS, + $ DLATRS3, DTBCON, DTBRFS, DTBTRS, DTPCON, + $ DTPRFS, DTPTRI, DTPTRS, DTRCON, DTRRFS, + $ DTRTI2, DTRTRI, DTRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -244,6 +245,46 @@ SUBROUTINE DERRTR( PATH, NUNIT ) INFOT = 7 CALL DLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, W, INFO ) CALL CHKXER( 'DLATRS', INFOT, NOUT, LERR, OK ) +* +* DLATRS3 +* + SRNAMT = 'DLATRS3' + INFOT = 1 + CALL DLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 
'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL DLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 0, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN * diff --git a/lapack-netlib/TESTING/LIN/schktr.f b/lapack-netlib/TESTING/LIN/schktr.f index 66fa0bee7f..92d8761087 100644 --- a/lapack-netlib/TESTING/LIN/schktr.f +++ b/lapack-netlib/TESTING/LIN/schktr.f @@ -30,7 +30,7 @@ *> *> \verbatim *> -*> SCHKTR tests STRTRI, -TRS, -RFS, and -CON, and SLATRS +*> SCHKTR tests STRTRI, -TRS, -RFS, and -CON, and SLATRS(3) *> \endverbatim * * Arguments: @@ -187,7 +187,7 @@ SUBROUTINE SCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) REAL ONE, ZERO @@ -198,13 +198,13 @@ SUBROUTINE SCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - REAL AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + REAL AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, SLAMCH * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - REAL RESULT( NTESTS ) + REAL RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -213,9 +213,9 @@ SUBROUTINE SCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, * .. * .. External Subroutines .. 
EXTERNAL ALAERH, ALAHD, ALASUM, SCOPY, SERRTR, SGET04, - $ SLACPY, SLARHS, SLATRS, SLATTR, STRCON, STRRFS, - $ STRT01, STRT02, STRT03, STRT05, STRT06, STRTRI, - $ STRTRS, XLAENV + $ SLACPY, SLARHS, SLATRS, SLATRS3, SLATTR, SSCAL, + $ STRCON, STRRFS, STRT01, STRT02, STRT03, STRT05, + $ STRT06, STRTRI, STRTRS, XLAENV, SLAMCH * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -239,6 +239,7 @@ SUBROUTINE SCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, * PATH( 1: 1 ) = 'Single precision' PATH( 2: 3 ) = 'TR' + BIGNUM = SLAMCH('Overflow') / SLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -539,6 +540,33 @@ SUBROUTINE SCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B +* + SRNAMT = 'SLATRS3' + CALL SCOPY( N, X, 1, B, 1 ) + CALL SCOPY( N, X, 1, B( N+1 ), 1 ) + CALL SSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL SLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from SLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'SLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'N', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) +* + CALL STRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL SSCAL( N, BIGNUM, X, 1 ) + CALL STRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RES ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -556,7 +584,14 @@ SUBROUTINE SCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'SLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE @@ -569,8 +604,8 @@ SUBROUTINE SCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, 9999 FORMAT( ' UPLO=''', A1, ''', DIAG=''', A1, ''', N=', I5, ', NB=', $ I4, ', type ', I2, ', test(', I2, ')= ', G12.5 ) 9998 FORMAT( ' UPLO=''', A1, ''', TRANS=''', A1, ''', DIAG=''', A1, - $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', - $ test(', I2, ')= ', G12.5 ) + $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', test(', + $ I2, ')= ', G12.5 ) 9997 FORMAT( ' NORM=''', A1, ''', UPLO =''', A1, ''', N=', I5, ',', $ 11X, ' type ', I2, ', test(', I2, ')=', G12.5 ) 9996 FORMAT( 1X, A, '( ''', A1, ''', ''', A1, ''', ''', A1, ''', ''', diff --git a/lapack-netlib/TESTING/LIN/sdrvls.f b/lapack-netlib/TESTING/LIN/sdrvls.f index b964515037..2baf9a3fb1 100644 --- a/lapack-netlib/TESTING/LIN/sdrvls.f +++ b/lapack-netlib/TESTING/LIN/sdrvls.f @@ -31,8 +31,8 @@ *> *> \verbatim *> -*> SDRVLS tests the least squares driver routines SGELS, SGETSLS, SGELSS, SGELSY, -*> and SGELSD. +*> SDRVLS tests the least squares driver routines SGELS, SGELST, +*> SGETSLS, SGELSS, SGELSY and SGELSD. *> \endverbatim * * Arguments: @@ -211,7 +211,7 @@ SUBROUTINE SDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * * .. Parameters .. INTEGER NTESTS - PARAMETER ( NTESTS = 16 ) + PARAMETER ( NTESTS = 18 ) INTEGER SMLSIZ PARAMETER ( SMLSIZ = 25 ) REAL ONE, TWO, ZERO @@ -225,8 +225,8 @@ SUBROUTINE SDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, $ LWLSY, LWORK, M, MNMIN, N, NB, NCOLS, NERRS, $ NFAIL, NRHS, NROWS, NRUN, RANK, MB, $ MMAX, NMAX, NSMAX, LIWORK, - $ LWORK_SGELS, LWORK_SGETSLS, LWORK_SGELSS, - $ LWORK_SGELSY, LWORK_SGELSD + $ LWORK_SGELS, LWORK_SGELST, LWORK_SGETSLS, + $ LWORK_SGELSS, LWORK_SGELSY, LWORK_SGELSD REAL EPS, NORMA, NORMB, RCOND * .. * .. Local Arrays .. 
@@ -243,12 +243,12 @@ SUBROUTINE SDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASVM, SAXPY, SERRLS, SGELS, - $ SGELSD, SGELSS, SGELSY, SGEMM, SLACPY, - $ SLARNV, SQRT13, SQRT15, SQRT16, SSCAL, - $ XLAENV, SGETSLS + $ SGELSD, SGELSS, SGELST, SGELSY, SGEMM, + $ SGETSLS, SLACPY, SLARNV, SQRT13, SQRT15, + $ SQRT16, SSCAL, XLAENV * .. * .. Intrinsic Functions .. - INTRINSIC INT, LOG, MAX, MIN, REAL, SQRT + INTRINSIC INT, MAX, MIN, REAL, SQRT * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -330,7 +330,8 @@ SUBROUTINE SDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, LIWORK = 1 * * Iterate through all test cases and compute necessary workspace -* sizes for ?GELS, ?GETSLS, ?GELSY, ?GELSS and ?GELSD routines. +* sizes for ?GELS, ?GELST, ?GETSLS, ?GELSY, ?GELSS and ?GELSD +* routines. * DO IM = 1, NM M = MVAL( IM ) @@ -357,6 +358,10 @@ SUBROUTINE SDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, CALL SGELS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ( 1 ), -1, INFO ) LWORK_SGELS = INT ( WQ( 1 ) ) +* Compute workspace needed for SGELST + CALL SGELST( TRANS, M, N, NRHS, A, LDA, + $ B, LDB, WQ, -1, INFO ) + LWORK_SGELST = INT ( WQ ( 1 ) ) * Compute workspace needed for SGETSLS CALL SGETSLS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ( 1 ), -1, INFO ) @@ -378,9 +383,9 @@ SUBROUTINE SDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * Compute LIWORK workspace needed for SGELSY and SGELSD LIWORK = MAX( LIWORK, N, IWQ( 1 ) ) * Compute LWORK workspace needed for all functions - LWORK = MAX( LWORK, LWORK_SGELS, LWORK_SGETSLS, - $ LWORK_SGELSY, LWORK_SGELSS, - $ LWORK_SGELSD ) + LWORK = MAX( LWORK, LWORK_SGELS, LWORK_SGELST, + $ LWORK_SGETSLS, LWORK_SGELSY, + $ LWORK_SGELSS, LWORK_SGELSD ) END IF ENDDO ENDDO @@ -411,21 +416,26 @@ SUBROUTINE SDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, ITYPE = ( IRANK-1 )*3 + ISCALE IF( .NOT.DOTYPE( ITYPE ) ) $ GO TO 110 -* +* 
===================================================== +* Begin test SGELS +* ===================================================== IF( IRANK.EQ.1 ) THEN * -* Test SGELS -* * Generate a matrix of scaling type ISCALE * CALL SQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 40 INB = 1, NNB +* +* Loop for testing different block sizes. +* + DO INB = 1, NNB NB = NBVAL( INB ) CALL XLAENV( 1, NB ) CALL XLAENV( 3, NXVAL( INB ) ) * - DO 30 ITRAN = 1, 2 +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -469,20 +479,27 @@ SUBROUTINE SDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 1: Check correctness of results +* for SGELS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL SLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL SQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, WORK, $ RESULT( 1 ) ) +* +* Test 2: Check correctness of results +* for SGELS. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN * -* Solving LS system +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) * RESULT( 2 ) = SQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, @@ -500,7 +517,7 @@ SUBROUTINE SDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * Print information about the tests that * did not pass the threshold. * - DO 20 K = 1, 2 + DO K = 1, 2 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) @@ -509,26 +526,33 @@ SUBROUTINE SDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 20 CONTINUE + END DO NRUN = NRUN + 2 - 30 CONTINUE - 40 CONTINUE -* -* -* Test SGETSLS + END DO + END DO + END IF +* ===================================================== +* End test SGELS +* ===================================================== +* ===================================================== +* Begin test SGELST +* ===================================================== + IF( IRANK.EQ.1 ) THEN * * Generate a matrix of scaling type ISCALE * CALL SQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 65 INB = 1, NNB - MB = NBVAL( INB ) - CALL XLAENV( 1, MB ) - DO 62 IMB = 1, NNB - NB = NBVAL( IMB ) - CALL XLAENV( 2, NB ) -* - DO 60 ITRAN = 1, 2 +* +* Loop for testing different block sizes. +* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) +* +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -563,31 +587,38 @@ SUBROUTINE SDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, CALL SLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, B, LDB ) END IF - SRNAMT = 'SGETSLS ' - CALL SGETSLS( TRANS, M, N, NRHS, A, - $ LDA, B, LDB, WORK, LWORK, INFO ) + SRNAMT = 'SGELST' + CALL SGELST( TRANS, M, N, NRHS, A, LDA, B, + $ LDB, WORK, LWORK, INFO ) IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'SGETSLS ', INFO, 0, + $ CALL ALAERH( PATH, 'SGELST', INFO, 0, $ TRANS, M, N, NRHS, -1, NB, $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 3: Check correctness of results +* for SGELST, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. 
NRHS.GT.0 ) $ CALL SLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL SQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, WORK, - $ RESULT( 15 ) ) + $ RESULT( 3 ) ) +* +* Test 4: Check correctness of results +* for SGELST. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN * -* Solving LS system +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) * - RESULT( 16 ) = SQRT17( TRANS, 1, M, N, + RESULT( 4 ) = SQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, $ LWORK ) @@ -595,7 +626,7 @@ SUBROUTINE SDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * * Solving overdetermined system * - RESULT( 16 ) = SQRT14( TRANS, M, N, + RESULT( 4 ) = SQRT14( TRANS, M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) END IF @@ -603,21 +634,151 @@ SUBROUTINE SDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * Print information about the tests that * did not pass the threshold. * - DO 50 K = 15, 16 + DO K = 3, 4 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) - WRITE( NOUT, FMT = 9997 )TRANS, M, - $ N, NRHS, MB, NB, ITYPE, K, + WRITE( NOUT, FMT = 9999 ) TRANS, M, + $ N, NRHS, NB, ITYPE, K, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 50 CONTINUE + END DO NRUN = NRUN + 2 - 60 CONTINUE - 62 CONTINUE - 65 CONTINUE + END DO + END DO END IF +* ===================================================== +* End test SGELST +* ===================================================== +* ===================================================== +* Begin test SGETSLS +* ===================================================== + IF( IRANK.EQ.1 ) THEN +* +* Generate a matrix of scaling type ISCALE +* + CALL SQRT13( ISCALE, M, N, COPYA, LDA, NORMA, + $ ISEED ) +* +* Loop for testing different block sizes MB. +* + DO IMB = 1, NNB + MB = NBVAL( IMB ) + CALL XLAENV( 1, MB ) +* +* Loop for testing different block sizes NB. 
+* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 2, NB ) +* +* Loop for testing non-transposed +* and transposed. +* + DO ITRAN = 1, 2 + IF( ITRAN.EQ.1 ) THEN + TRANS = 'N' + NROWS = M + NCOLS = N + ELSE + TRANS = 'T' + NROWS = N + NCOLS = M + END IF + LDWORK = MAX( 1, NCOLS ) +* +* Set up a consistent rhs +* + IF( NCOLS.GT.0 ) THEN + CALL SLARNV( 2, ISEED, NCOLS*NRHS, + $ WORK ) + CALL SSCAL( NCOLS*NRHS, + $ ONE / REAL( NCOLS ), + $ WORK, 1 ) + END IF + CALL SGEMM( TRANS, 'No transpose', + $ NROWS, NRHS, NCOLS, ONE, + $ COPYA, LDA, WORK, LDWORK, + $ ZERO, B, LDB ) + CALL SLACPY( 'Full', NROWS, NRHS, + $ B, LDB, COPYB, LDB ) +* +* Solve LS or overdetermined system +* + IF( M.GT.0 .AND. N.GT.0 ) THEN + CALL SLACPY( 'Full', M, N, + $ COPYA, LDA, A, LDA ) + CALL SLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, B, LDB ) + END IF + SRNAMT = 'SGETSLS' + CALL SGETSLS( TRANS, M, N, NRHS, + $ A, LDA, B, LDB, WORK, LWORK, + $ INFO ) + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'SGETSLS', INFO, + $ 0, TRANS, M, N, NRHS, + $ -1, NB, ITYPE, NFAIL, + $ NERRS, NOUT ) +* +* Test 5: Check correctness of results +* for SGETSLS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) +* + IF( NROWS.GT.0 .AND. NRHS.GT.0 ) + $ CALL SLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, C, LDB ) + CALL SQRT16( TRANS, M, N, NRHS, + $ COPYA, LDA, B, LDB, + $ C, LDB, WORK, + $ RESULT( 5 ) ) +* +* Test 6: Check correctness of results +* for SGETSLS. +* + IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. + $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN +* +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) +* + RESULT( 6 ) = SQRT17( TRANS, 1, M, + $ N, NRHS, COPYA, LDA, + $ B, LDB, COPYB, LDB, + $ C, WORK, LWORK ) + ELSE +* +* Solving overdetermined system +* + RESULT( 6 ) = SQRT14( TRANS, M, N, + $ NRHS, COPYA, LDA, + $ B, LDB, WORK, LWORK ) + END IF +* +* Print information about the tests that +* did not pass the threshold. 
+* + DO K = 5, 6 + IF( RESULT( K ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9997 ) TRANS, + $ M, N, NRHS, MB, NB, ITYPE, + $ K, RESULT( K ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 + END DO + END DO + END DO + END IF +* ===================================================== +* End test SGETSLS +* ===================================================== * * Generate a matrix of scaling type ISCALE and rank * type IRANK. @@ -662,37 +823,37 @@ SUBROUTINE SDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 3: Compute relative error in svd +* Test 7: Compute relative error in svd * workspace: M*N + 4*MIN(M,N) + MAX(M,N) * - RESULT( 3 ) = SQRT12( CRANK, CRANK, A, LDA, + RESULT( 7 ) = SQRT12( CRANK, CRANK, A, LDA, $ COPYS, WORK, LWORK ) * -* Test 4: Compute error in solution +* Test 8: Compute error in solution * workspace: M*NRHS + M * CALL SLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL SQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 4 ) ) + $ WORK( M*NRHS+1 ), RESULT( 8 ) ) * -* Test 5: Check norm of r'*A +* Test 9: Check norm of r'*A * workspace: NRHS*(M+N) * - RESULT( 5 ) = ZERO + RESULT( 9 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 5 ) = SQRT17( 'No transpose', 1, M, + $ RESULT( 9 ) = SQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 6: Check if x is in the rowspace of A +* Test 10: Check if x is in the rowspace of A * workspace: (M+NRHS)*(N+2) * - RESULT( 6 ) = ZERO + RESULT( 10 ) = ZERO * IF( N.GT.CRANK ) - $ RESULT( 6 ) = SQRT14( 'No transpose', M, N, + $ RESULT( 10 ) = SQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -716,38 +877,38 @@ SUBROUTINE SDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * workspace used: 3*min(m,n) + * max(2*min(m,n),nrhs,max(m,n)) * -* Test 7: Compute relative 
error in svd +* Test 11: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL SAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 7 ) = SASUM( MNMIN, S, 1 ) / + RESULT( 11 ) = SASUM( MNMIN, S, 1 ) / $ SASUM( MNMIN, COPYS, 1 ) / $ ( EPS*REAL( MNMIN ) ) ELSE - RESULT( 7 ) = ZERO + RESULT( 11 ) = ZERO END IF * -* Test 8: Compute error in solution +* Test 12: Compute error in solution * CALL SLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL SQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 8 ) ) + $ WORK( M*NRHS+1 ), RESULT( 12 ) ) * -* Test 9: Check norm of r'*A +* Test 13: Check norm of r'*A * - RESULT( 9 ) = ZERO + RESULT( 13 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 9 ) = SQRT17( 'No transpose', 1, M, + $ RESULT( 13 ) = SQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 10: Check if x is in the rowspace of A +* Test 14: Check if x is in the rowspace of A * - RESULT( 10 ) = ZERO + RESULT( 14 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 10 ) = SQRT14( 'No transpose', M, N, + $ RESULT( 14 ) = SQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -776,45 +937,45 @@ SUBROUTINE SDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 11: Compute relative error in svd +* Test 15: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL SAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 11 ) = SASUM( MNMIN, S, 1 ) / + RESULT( 15 ) = SASUM( MNMIN, S, 1 ) / $ SASUM( MNMIN, COPYS, 1 ) / $ ( EPS*REAL( MNMIN ) ) ELSE - RESULT( 11 ) = ZERO + RESULT( 15 ) = ZERO END IF * -* Test 12: Compute error in solution +* Test 16: Compute error in solution * CALL SLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL SQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 12 ) ) + $ WORK( M*NRHS+1 ), RESULT( 16 ) ) * -* Test 13: Check norm of r'*A +* Test 17: Check 
norm of r'*A * - RESULT( 13 ) = ZERO + RESULT( 17 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 13 ) = SQRT17( 'No transpose', 1, M, + $ RESULT( 17 ) = SQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 14: Check if x is in the rowspace of A +* Test 18: Check if x is in the rowspace of A * - RESULT( 14 ) = ZERO + RESULT( 18 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 14 ) = SQRT14( 'No transpose', M, N, + $ RESULT( 18 ) = SQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * * Print information about the tests that did not * pass the threshold. * - DO 90 K = 3, 14 + DO 90 K = 7, 18 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) diff --git a/lapack-netlib/TESTING/LIN/sdrvrf3.f b/lapack-netlib/TESTING/LIN/sdrvrf3.f index 5faae27337..bc01d8473b 100644 --- a/lapack-netlib/TESTING/LIN/sdrvrf3.f +++ b/lapack-netlib/TESTING/LIN/sdrvrf3.f @@ -153,8 +153,9 @@ SUBROUTINE SDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, REAL RESULT( NTESTS ) * .. * .. External Functions .. + LOGICAL LSAME REAL SLAMCH, SLANGE, SLARND - EXTERNAL SLAMCH, SLANGE, SLARND + EXTERNAL SLAMCH, SLANGE, SLARND, LSAME * .. * .. External Subroutines .. EXTERNAL STRTTF, SGEQRF, SGEQLF, STFSM, STRSM @@ -218,9 +219,9 @@ SUBROUTINE SDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, * DO 100 IALPHA = 1, 3 * - IF ( IALPHA.EQ. 1) THEN + IF ( IALPHA.EQ.1 ) THEN ALPHA = ZERO - ELSE IF ( IALPHA.EQ. 
2) THEN + ELSE IF ( IALPHA.EQ.2 ) THEN ALPHA = ONE ELSE ALPHA = SLARND( 2, ISEED ) @@ -259,7 +260,7 @@ SUBROUTINE SDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, * DO J = 1, NA DO I = 1, NA - A( I, J) = SLARND( 2, ISEED ) + A( I, J ) = SLARND( 2, ISEED ) END DO END DO * @@ -272,6 +273,20 @@ SUBROUTINE SDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, CALL SGEQRF( NA, NA, A, LDA, TAU, + S_WORK_SGEQRF, LDA, + INFO ) +* +* Forcing main diagonal of test matrix to +* be unit makes it ill-conditioned for +* some test cases +* + IF ( LSAME( DIAG, 'U' ) ) THEN + DO J = 1, NA + DO I = 1, J + A( I, J ) = A( I, J ) / + + ( 2.0 * A( J, J ) ) + END DO + END DO + END IF +* ELSE * * The case IUPLO.EQ.2 is when SIDE.EQ.'L' @@ -281,6 +296,20 @@ SUBROUTINE SDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, CALL SGELQF( NA, NA, A, LDA, TAU, + S_WORK_SGEQRF, LDA, + INFO ) +* +* Forcing main diagonal of test matrix to +* be unit makes it ill-conditioned for +* some test cases +* + IF ( LSAME( DIAG, 'U' ) ) THEN + DO I = 1, NA + DO J = 1, I + A( I, J ) = A( I, J ) / + + ( 2.0 * A( I, I ) ) + END DO + END DO + END IF +* END IF * * Store a copy of A in RFP format (in ARF). 
@@ -294,8 +323,8 @@ SUBROUTINE SDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, * DO J = 1, N DO I = 1, M - B1( I, J) = SLARND( 2, ISEED ) - B2( I, J) = B1( I, J) + B1( I, J ) = SLARND( 2, ISEED ) + B2( I, J ) = B1( I, J ) END DO END DO * @@ -318,24 +347,24 @@ SUBROUTINE SDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, * DO J = 1, N DO I = 1, M - B1( I, J) = B2( I, J ) - B1( I, J ) + B1( I, J ) = B2( I, J ) - B1( I, J ) END DO END DO * - RESULT(1) = SLANGE( 'I', M, N, B1, LDA, + RESULT( 1 ) = SLANGE( 'I', M, N, B1, LDA, + S_WORK_SLANGE ) * - RESULT(1) = RESULT(1) / SQRT( EPS ) - + / MAX ( MAX( M, N), 1 ) + RESULT( 1 ) = RESULT( 1 ) / SQRT( EPS ) + + / MAX ( MAX( M, N ), 1 ) * - IF( RESULT(1).GE.THRESH ) THEN + IF( RESULT( 1 ).GE.THRESH ) THEN IF( NFAIL.EQ.0 ) THEN WRITE( NOUT, * ) WRITE( NOUT, FMT = 9999 ) END IF WRITE( NOUT, FMT = 9997 ) 'STFSM', + CFORM, SIDE, UPLO, TRANS, DIAG, M, - + N, RESULT(1) + + N, RESULT( 1 ) NFAIL = NFAIL + 1 END IF * diff --git a/lapack-netlib/TESTING/LIN/serrls.f b/lapack-netlib/TESTING/LIN/serrls.f index e6ee4360f9..6c4820066a 100644 --- a/lapack-netlib/TESTING/LIN/serrls.f +++ b/lapack-netlib/TESTING/LIN/serrls.f @@ -22,7 +22,7 @@ *> \verbatim *> *> SERRLS tests the error exits for the REAL least squares -*> driver routines (SGELS, SGELSS, SGELSY, SGELSD). +*> driver routines (SGELS, SGELST, SGETSLS, SGELSS, SGELSY, SGELSD). *> \endverbatim * * Arguments: @@ -83,7 +83,8 @@ SUBROUTINE SERRLS( PATH, NUNIT ) EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, SGELS, SGELSD, SGELSS, SGELSY + EXTERNAL ALAESM, CHKXER, SGELS, SGELSD, SGELSS, SGELST, + $ SGELSY, SGETSLS * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -130,10 +131,66 @@ SUBROUTINE SERRLS( PATH, NUNIT ) INFOT = 8 CALL SGELS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) CALL CHKXER( 'SGELS ', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGELS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGELS', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL SGELS( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) CALL CHKXER( 'SGELS ', INFOT, NOUT, LERR, OK ) * +* SGELST +* + SRNAMT = 'SGELST' + INFOT = 1 + CALL SGELST( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGELST( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGELST( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SGELST( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL SGELST( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGELST( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGELST( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SGELST( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) +* +* SGETSLS +* + SRNAMT = 'SGETSLS' + INFOT = 1 + CALL SGETSLS( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGETSLS( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGETSLS( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SGETSLS( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL SGETSLS( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 
+ CALL SGETSLS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGETSLS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) +* * SGELSS * SRNAMT = 'SGELSS' diff --git a/lapack-netlib/TESTING/LIN/serrtr.f b/lapack-netlib/TESTING/LIN/serrtr.f index f0d0a0ef21..af1ce0a8e3 100644 --- a/lapack-netlib/TESTING/LIN/serrtr.f +++ b/lapack-netlib/TESTING/LIN/serrtr.f @@ -83,9 +83,10 @@ SUBROUTINE SERRTR( PATH, NUNIT ) EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, SLATBS, SLATPS, SLATRS, STBCON, - $ STBRFS, STBTRS, STPCON, STPRFS, STPTRI, STPTRS, - $ STRCON, STRRFS, STRTI2, STRTRI, STRTRS + EXTERNAL ALAESM, CHKXER, SLATBS, SLATPS, SLATRS, + $ SLATRS3, STBCON, STBRFS, STBTRS, STPCON, + $ STPRFS, STPTRI, STPTRS, STRCON, STRRFS, STRTI2, + $ STRTRI, STRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -244,6 +245,46 @@ SUBROUTINE SERRTR( PATH, NUNIT ) INFOT = 7 CALL SLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, W, INFO ) CALL CHKXER( 'SLATRS', INFOT, NOUT, LERR, OK ) +* +* SLATRS3 +* + SRNAMT = 'SLATRS3' + INFOT = 1 + CALL SLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL SLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 
'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL SLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 0, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN * diff --git a/lapack-netlib/TESTING/LIN/zchkpt.f b/lapack-netlib/TESTING/LIN/zchkpt.f index 80e1690a7e..11089d2a1a 100644 --- a/lapack-netlib/TESTING/LIN/zchkpt.f +++ b/lapack-netlib/TESTING/LIN/zchkpt.f @@ -319,15 +319,15 @@ SUBROUTINE ZCHKPT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, * elements. * IF( IZERO.EQ.1 ) THEN - D( 1 ) = Z( 2 ) + D( 1 ) = DBLE( Z( 2 ) ) IF( N.GT.1 ) $ E( 1 ) = Z( 3 ) ELSE IF( IZERO.EQ.N ) THEN E( N-1 ) = Z( 1 ) - D( N ) = Z( 2 ) + D( N ) = DBLE( Z( 2 ) ) ELSE E( IZERO-1 ) = Z( 1 ) - D( IZERO ) = Z( 2 ) + D( IZERO ) = DBLE( Z( 2 ) ) E( IZERO ) = Z( 3 ) END IF END IF diff --git a/lapack-netlib/TESTING/LIN/zchktr.f b/lapack-netlib/TESTING/LIN/zchktr.f index 0a6f47b1ea..275ca28570 100644 --- a/lapack-netlib/TESTING/LIN/zchktr.f +++ b/lapack-netlib/TESTING/LIN/zchktr.f @@ -31,7 +31,7 @@ *> *> \verbatim *> -*> ZCHKTR tests ZTRTRI, -TRS, -RFS, and -CON, and ZLATRS +*> ZCHKTR tests ZTRTRI, -TRS, -RFS, and -CON, and ZLATRS(3) *> \endverbatim * * Arguments: @@ -184,7 +184,7 @@ SUBROUTINE ZCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) DOUBLE PRECISION ONE, ZERO @@ -195,13 +195,13 @@ SUBROUTINE ZCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - 
DOUBLE PRECISION AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + DOUBLE PRECISION AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, DLAMCH * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - DOUBLE PRECISION RESULT( NTESTS ) + DOUBLE PRECISION RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -209,10 +209,10 @@ SUBROUTINE ZCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, EXTERNAL LSAME, ZLANTR * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, XLAENV, ZCOPY, ZERRTR, - $ ZGET04, ZLACPY, ZLARHS, ZLATRS, ZLATTR, ZTRCON, - $ ZTRRFS, ZTRT01, ZTRT02, ZTRT03, ZTRT05, ZTRT06, - $ ZTRTRI, ZTRTRS + EXTERNAL ALAERH, ALAHD, ALASUM, DLAMCH, XLAENV, ZCOPY, + $ ZDSCAL, ZERRTR, ZGET04, ZLACPY, ZLARHS, ZLATRS, + $ ZLATRS3, ZLATTR, ZTRCON, ZTRRFS, ZTRT01, + $ ZTRT02, ZTRT03, ZTRT05, ZTRT06, ZTRTRI, ZTRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -236,6 +236,7 @@ SUBROUTINE ZCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, * PATH( 1: 1 ) = 'Zomplex precision' PATH( 2: 3 ) = 'TR' + BIGNUM = DLAMCH('Overflow') / DLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -380,7 +381,7 @@ SUBROUTINE ZCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, * This line is needed on a Sun SPARCstation. * IF( N.GT.0 ) - $ DUMMY = A( 1 ) + $ DUMMY = DBLE( A( 1 ) ) * CALL ZTRT02( UPLO, TRANS, DIAG, N, NRHS, A, LDA, $ X, LDA, B, LDA, WORK, RWORK, @@ -535,6 +536,32 @@ SUBROUTINE ZCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B +* + SRNAMT = 'ZLATRS3' + CALL ZCOPY( N, X, 1, B, 1 ) + CALL ZCOPY( N, X, 1, B( N+1 ), 1 ) + CALL ZDSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL ZLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from ZLATRS3. 
+* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'ZLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'N', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) + CALL ZTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL ZDSCAL( N, BIGNUM, X, 1 ) + CALL ZTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RES ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -552,7 +579,14 @@ SUBROUTINE ZCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'ZLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE @@ -565,8 +599,8 @@ SUBROUTINE ZCHKTR( DOTYPE, NN, NVAL, NNB, NBVAL, NNS, NSVAL, 9999 FORMAT( ' UPLO=''', A1, ''', DIAG=''', A1, ''', N=', I5, ', NB=', $ I4, ', type ', I2, ', test(', I2, ')= ', G12.5 ) 9998 FORMAT( ' UPLO=''', A1, ''', TRANS=''', A1, ''', DIAG=''', A1, - $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', - $ test(', I2, ')= ', G12.5 ) + $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', test(', + $ I2, ')= ', G12.5 ) 9997 FORMAT( ' NORM=''', A1, ''', UPLO =''', A1, ''', N=', I5, ',', $ 11X, ' type ', I2, ', test(', I2, ')=', G12.5 ) 9996 FORMAT( 1X, A, '( ''', A1, ''', ''', A1, ''', ''', A1, ''', ''', diff --git a/lapack-netlib/TESTING/LIN/zdrvgt.f b/lapack-netlib/TESTING/LIN/zdrvgt.f index d055e4bdb2..b2e0f66b12 100644 --- a/lapack-netlib/TESTING/LIN/zdrvgt.f +++ b/lapack-netlib/TESTING/LIN/zdrvgt.f @@ -307,16 +307,16 @@ SUBROUTINE ZDRVGT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, A, AF, IZERO = 0 ELSE IF( IMAT.EQ.8 ) THEN IZERO = 1 - Z( 2 ) = A( N ) + Z( 2 ) = DBLE( A( N ) ) A( N ) = ZERO IF( 
N.GT.1 ) THEN - Z( 3 ) = A( 1 ) + Z( 3 ) = DBLE( A( 1 ) ) A( 1 ) = ZERO END IF ELSE IF( IMAT.EQ.9 ) THEN IZERO = N - Z( 1 ) = A( 3*N-2 ) - Z( 2 ) = A( 2*N-1 ) + Z( 1 ) = DBLE( A( 3*N-2 ) ) + Z( 2 ) = DBLE( A( 2*N-1 ) ) A( 3*N-2 ) = ZERO A( 2*N-1 ) = ZERO ELSE diff --git a/lapack-netlib/TESTING/LIN/zdrvls.f b/lapack-netlib/TESTING/LIN/zdrvls.f index 2eab979054..b21345d302 100644 --- a/lapack-netlib/TESTING/LIN/zdrvls.f +++ b/lapack-netlib/TESTING/LIN/zdrvls.f @@ -31,8 +31,8 @@ *> *> \verbatim *> -*> ZDRVLS tests the least squares driver routines ZGELS, ZGETSLS, ZGELSS, ZGELSY -*> and ZGELSD. +*> ZDRVLS tests the least squares driver routines ZGELS, ZGELST, +*> ZGETSLS, ZGELSS, ZGELSY and ZGELSD. *> \endverbatim * * Arguments: @@ -211,7 +211,7 @@ SUBROUTINE ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * * .. Parameters .. INTEGER NTESTS - PARAMETER ( NTESTS = 16 ) + PARAMETER ( NTESTS = 18 ) INTEGER SMLSIZ PARAMETER ( SMLSIZ = 25 ) DOUBLE PRECISION ONE, ZERO @@ -228,8 +228,8 @@ SUBROUTINE ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, $ LWLSY, LWORK, M, MNMIN, N, NB, NCOLS, NERRS, $ NFAIL, NRHS, NROWS, NRUN, RANK, MB, $ MMAX, NMAX, NSMAX, LIWORK, LRWORK, - $ LWORK_ZGELS, LWORK_ZGETSLS, LWORK_ZGELSS, - $ LWORK_ZGELSY, LWORK_ZGELSD, + $ LWORK_ZGELS, LWORK_ZGELST, LWORK_ZGETSLS, + $ LWORK_ZGELSS, LWORK_ZGELSY, LWORK_ZGELSD, $ LRWORK_ZGELSY, LRWORK_ZGELSS, LRWORK_ZGELSD DOUBLE PRECISION EPS, NORMA, NORMB, RCOND * .. @@ -248,10 +248,10 @@ SUBROUTINE ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, EXTERNAL DASUM, DLAMCH, ZQRT12, ZQRT14, ZQRT17 * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASVM, DAXPY, DLASRT, XLAENV, - $ ZDSCAL, ZERRLS, ZGELS, ZGELSD, ZGELSS, - $ ZGELSY, ZGEMM, ZLACPY, ZLARNV, ZQRT13, ZQRT15, - $ ZQRT16, ZGETSLS + EXTERNAL ALAERH, ALAHD, ALASVM, DAXPY, ZERRLS, ZGELS, + $ ZGELSD, ZGELSS, ZGELST, ZGELSY, ZGEMM, + $ ZGETSLS, ZLACPY, ZLARNV, ZQRT13, ZQRT15, + $ ZQRT16, ZDSCAL, XLAENV * .. * .. Intrinsic Functions .. 
INTRINSIC DBLE, MAX, MIN, INT, SQRT @@ -334,7 +334,8 @@ SUBROUTINE ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, LIWORK = 1 * * Iterate through all test cases and compute necessary workspace -* sizes for ?GELS, ?GETSLS, ?GELSY, ?GELSS and ?GELSD routines. +* sizes for ?GELS, ?GELST, ?GETSLS, ?GELSY, ?GELSS and ?GELSD +* routines. * DO IM = 1, NM M = MVAL( IM ) @@ -361,6 +362,10 @@ SUBROUTINE ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, CALL ZGELS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) LWORK_ZGELS = INT ( WQ( 1 ) ) +* Compute workspace needed for ZGELST + CALL ZGELST( TRANS, M, N, NRHS, A, LDA, + $ B, LDB, WQ, -1, INFO ) + LWORK_ZGELST = INT ( WQ ( 1 ) ) * Compute workspace needed for ZGETSLS CALL ZGETSLS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) @@ -390,9 +395,9 @@ SUBROUTINE ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, LRWORK = MAX( LRWORK, LRWORK_ZGELSY, $ LRWORK_ZGELSS, LRWORK_ZGELSD ) * Compute LWORK workspace needed for all functions - LWORK = MAX( LWORK, LWORK_ZGELS, LWORK_ZGETSLS, - $ LWORK_ZGELSY, LWORK_ZGELSS, - $ LWORK_ZGELSD ) + LWORK = MAX( LWORK, LWORK_ZGELS, LWORK_ZGELST, + $ LWORK_ZGETSLS, LWORK_ZGELSY, + $ LWORK_ZGELSS, LWORK_ZGELSD ) END IF ENDDO ENDDO @@ -425,21 +430,26 @@ SUBROUTINE ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, ITYPE = ( IRANK-1 )*3 + ISCALE IF( .NOT.DOTYPE( ITYPE ) ) $ GO TO 100 -* +* ===================================================== +* Begin test ZGELS +* ===================================================== IF( IRANK.EQ.1 ) THEN * -* Test ZGELS -* * Generate a matrix of scaling type ISCALE * CALL ZQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 40 INB = 1, NNB +* +* Loop for testing different block sizes. +* + DO INB = 1, NNB NB = NBVAL( INB ) CALL XLAENV( 1, NB ) CALL XLAENV( 3, NXVAL( INB ) ) * - DO 30 ITRAN = 1, 2 +* Loop for testing non-transposed and transposed. 
+* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -484,15 +494,20 @@ SUBROUTINE ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 1: Check correctness of results +* for ZGELS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL ZLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL ZQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, RWORK, $ RESULT( 1 ) ) +* +* Test 2: Check correctness of results +* for ZGELS. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN @@ -515,7 +530,7 @@ SUBROUTINE ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * Print information about the tests that * did not pass the threshold. * - DO 20 K = 1, 2 + DO K = 1, 2 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) @@ -524,26 +539,34 @@ SUBROUTINE ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 20 CONTINUE + END DO NRUN = NRUN + 2 - 30 CONTINUE - 40 CONTINUE -* -* -* Test ZGETSLS + END DO + END DO + END IF +* ===================================================== +* End test ZGELS +* ===================================================== +* ===================================================== +* Begin test ZGELST +* ===================================================== + IF( IRANK.EQ.1 ) THEN * * Generate a matrix of scaling type ISCALE * CALL ZQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 65 INB = 1, NNB - MB = NBVAL( INB ) - CALL XLAENV( 1, MB ) - DO 62 IMB = 1, NNB - NB = NBVAL( IMB ) - CALL XLAENV( 2, NB ) * - DO 60 ITRAN = 1, 2 +* Loop for testing different block sizes. +* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) + CALL XLAENV( 3, NXVAL( INB ) ) +* +* Loop for testing non-transposed and transposed. 
+* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -560,9 +583,9 @@ SUBROUTINE ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, IF( NCOLS.GT.0 ) THEN CALL ZLARNV( 2, ISEED, NCOLS*NRHS, $ WORK ) - CALL ZSCAL( NCOLS*NRHS, - $ CONE / DBLE( NCOLS ), WORK, - $ 1 ) + CALL ZDSCAL( NCOLS*NRHS, + $ ONE / DBLE( NCOLS ), WORK, + $ 1 ) END IF CALL ZGEMM( TRANS, 'No transpose', NROWS, $ NRHS, NCOLS, CONE, COPYA, LDA, @@ -578,31 +601,37 @@ SUBROUTINE ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, CALL ZLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, B, LDB ) END IF - SRNAMT = 'ZGETSLS ' - CALL ZGETSLS( TRANS, M, N, NRHS, A, - $ LDA, B, LDB, WORK, LWORK, INFO ) + SRNAMT = 'ZGELST' + CALL ZGELST( TRANS, M, N, NRHS, A, LDA, B, + $ LDB, WORK, LWORK, INFO ) +* IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'ZGETSLS ', INFO, 0, + $ CALL ALAERH( PATH, 'ZGELST', INFO, 0, $ TRANS, M, N, NRHS, -1, NB, $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 3: Check correctness of results +* for ZGELST, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL ZLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL ZQRT16( TRANS, M, N, NRHS, COPYA, - $ LDA, B, LDB, C, LDB, WORK2, - $ RESULT( 15 ) ) + $ LDA, B, LDB, C, LDB, RWORK, + $ RESULT( 3 ) ) +* +* Test 4: Check correctness of results +* for ZGELST. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. 
M.LT.N ) ) THEN * * Solving LS system * - RESULT( 16 ) = ZQRT17( TRANS, 1, M, N, + RESULT( 4 ) = ZQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, $ LWORK ) @@ -610,7 +639,7 @@ SUBROUTINE ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * * Solving overdetermined system * - RESULT( 16 ) = ZQRT14( TRANS, M, N, + RESULT( 4 ) = ZQRT14( TRANS, M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) END IF @@ -618,21 +647,151 @@ SUBROUTINE ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * Print information about the tests that * did not pass the threshold. * - DO 50 K = 15, 16 + DO K = 3, 4 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) - WRITE( NOUT, FMT = 9997 )TRANS, M, - $ N, NRHS, MB, NB, ITYPE, K, + WRITE( NOUT, FMT = 9999 )TRANS, M, + $ N, NRHS, NB, ITYPE, K, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 50 CONTINUE + END DO NRUN = NRUN + 2 - 60 CONTINUE - 62 CONTINUE - 65 CONTINUE + END DO + END DO + END IF +* ===================================================== +* End test ZGELST +* ===================================================== +* ===================================================== +* Begin test ZGELSTSLS +* ===================================================== + IF( IRANK.EQ.1 ) THEN +* +* Generate a matrix of scaling type ISCALE +* + CALL ZQRT13( ISCALE, M, N, COPYA, LDA, NORMA, + $ ISEED ) +* +* Loop for testing different block sizes MB. +* + DO INB = 1, NNB + MB = NBVAL( INB ) + CALL XLAENV( 1, MB ) +* +* Loop for testing different block sizes NB. +* + DO IMB = 1, NNB + NB = NBVAL( IMB ) + CALL XLAENV( 2, NB ) +* +* Loop for testing non-transposed +* and transposed. 
+* + DO ITRAN = 1, 2 + IF( ITRAN.EQ.1 ) THEN + TRANS = 'N' + NROWS = M + NCOLS = N + ELSE + TRANS = 'C' + NROWS = N + NCOLS = M + END IF + LDWORK = MAX( 1, NCOLS ) +* +* Set up a consistent rhs +* + IF( NCOLS.GT.0 ) THEN + CALL ZLARNV( 2, ISEED, NCOLS*NRHS, + $ WORK ) + CALL ZSCAL( NCOLS*NRHS, + $ CONE / DBLE( NCOLS ), + $ WORK, 1 ) + END IF + CALL ZGEMM( TRANS, 'No transpose', + $ NROWS, NRHS, NCOLS, CONE, + $ COPYA, LDA, WORK, LDWORK, + $ CZERO, B, LDB ) + CALL ZLACPY( 'Full', NROWS, NRHS, + $ B, LDB, COPYB, LDB ) +* +* Solve LS or overdetermined system +* + IF( M.GT.0 .AND. N.GT.0 ) THEN + CALL ZLACPY( 'Full', M, N, + $ COPYA, LDA, A, LDA ) + CALL ZLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, B, LDB ) + END IF + SRNAMT = 'ZGETSLS ' + CALL ZGETSLS( TRANS, M, N, NRHS, A, + $ LDA, B, LDB, WORK, LWORK, + $ INFO ) + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'ZGETSLS ', INFO, + $ 0, TRANS, M, N, NRHS, + $ -1, NB, ITYPE, NFAIL, + $ NERRS, NOUT ) +* +* Test 5: Check correctness of results +* for ZGETSLS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) +* + IF( NROWS.GT.0 .AND. NRHS.GT.0 ) + $ CALL ZLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, C, LDB ) + CALL ZQRT16( TRANS, M, N, NRHS, + $ COPYA, LDA, B, LDB, + $ C, LDB, WORK2, + $ RESULT( 5 ) ) +* +* Test 6: Check correctness of results +* for ZGETSLS. +* + IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. + $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN +* +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) +* + RESULT( 6 ) = ZQRT17( TRANS, 1, M, + $ N, NRHS, COPYA, LDA, + $ B, LDB, COPYB, LDB, + $ C, WORK, LWORK ) + ELSE +* +* Solving overdetermined system +* + RESULT( 6 ) = ZQRT14( TRANS, M, N, + $ NRHS, COPYA, LDA, B, + $ LDB, WORK, LWORK ) + END IF +* +* Print information about the tests that +* did not pass the threshold. +* + DO K = 5, 6 + IF( RESULT( K ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9997 )TRANS, + $ M, N, NRHS, MB, NB, ITYPE, K, + $ RESULT( K ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 + END DO + END DO + END DO END IF +* ===================================================== +* End test ZGELSTSLS +* ===================================================== * * Generate a matrix of scaling type ISCALE and rank * type IRANK. @@ -680,37 +839,37 @@ SUBROUTINE ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * * workspace used: 2*MNMIN+NB*NB+NB*MAX(N,NRHS) * -* Test 3: Compute relative error in svd +* Test 7: Compute relative error in svd * workspace: M*N + 4*MIN(M,N) + MAX(M,N) * - RESULT( 3 ) = ZQRT12( CRANK, CRANK, A, LDA, + RESULT( 7 ) = ZQRT12( CRANK, CRANK, A, LDA, $ COPYS, WORK, LWORK, RWORK ) * -* Test 4: Compute error in solution +* Test 8: Compute error in solution * workspace: M*NRHS + M * CALL ZLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL ZQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 4 ) ) + $ RESULT( 8 ) ) * -* Test 5: Check norm of r'*A +* Test 9: Check norm of r'*A * workspace: NRHS*(M+N) * - RESULT( 5 ) = ZERO + RESULT( 9 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 5 ) = ZQRT17( 'No transpose', 1, M, + $ RESULT( 9 ) = ZQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 6: Check if x is in the rowspace of A +* Test 10: Check if x is in the rowspace of A * workspace: (M+NRHS)*(N+2) * - RESULT( 6 ) = ZERO + RESULT( 10 ) = ZERO * IF( N.GT.CRANK ) - $ RESULT( 6 ) = ZQRT14( 'No transpose', M, N, + $ RESULT( 10 ) = ZQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -736,38 +895,38 @@ SUBROUTINE ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, * workspace used: 3*min(m,n) + * max(2*min(m,n),nrhs,max(m,n)) * -* Test 7: Compute relative error in svd +* Test 11: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL DAXPY( MNMIN, 
-ONE, COPYS, 1, S, 1 ) - RESULT( 7 ) = DASUM( MNMIN, S, 1 ) / + RESULT( 11 ) = DASUM( MNMIN, S, 1 ) / $ DASUM( MNMIN, COPYS, 1 ) / $ ( EPS*DBLE( MNMIN ) ) ELSE - RESULT( 7 ) = ZERO + RESULT( 11 ) = ZERO END IF * -* Test 8: Compute error in solution +* Test 12: Compute error in solution * CALL ZLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL ZQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 8 ) ) + $ RESULT( 12 ) ) * -* Test 9: Check norm of r'*A +* Test 13: Check norm of r'*A * - RESULT( 9 ) = ZERO + RESULT( 13 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 9 ) = ZQRT17( 'No transpose', 1, M, + $ RESULT( 13 ) = ZQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 10: Check if x is in the rowspace of A +* Test 14: Check if x is in the rowspace of A * - RESULT( 10 ) = ZERO + RESULT( 14 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 10 ) = ZQRT14( 'No transpose', M, N, + $ RESULT( 14 ) = ZQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -792,45 +951,45 @@ SUBROUTINE ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 11: Compute relative error in svd +* Test 15: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL DAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 11 ) = DASUM( MNMIN, S, 1 ) / + RESULT( 15 ) = DASUM( MNMIN, S, 1 ) / $ DASUM( MNMIN, COPYS, 1 ) / $ ( EPS*DBLE( MNMIN ) ) ELSE - RESULT( 11 ) = ZERO + RESULT( 15 ) = ZERO END IF * -* Test 12: Compute error in solution +* Test 16: Compute error in solution * CALL ZLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL ZQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 12 ) ) + $ RESULT( 16 ) ) * -* Test 13: Check norm of r'*A +* Test 17: Check norm of r'*A * - RESULT( 13 ) = ZERO + RESULT( 17 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 13 ) = ZQRT17( 'No transpose', 1, M, + $ RESULT( 17 ) = ZQRT17( 'No 
transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 14: Check if x is in the rowspace of A +* Test 18: Check if x is in the rowspace of A * - RESULT( 14 ) = ZERO + RESULT( 18 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 14 ) = ZQRT14( 'No transpose', M, N, + $ RESULT( 18 ) = ZQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * * Print information about the tests that did not * pass the threshold. * - DO 80 K = 3, 14 + DO 80 K = 7, 18 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) diff --git a/lapack-netlib/TESTING/LIN/zdrvpt.f b/lapack-netlib/TESTING/LIN/zdrvpt.f index 14a9f76ba0..75f4d57380 100644 --- a/lapack-netlib/TESTING/LIN/zdrvpt.f +++ b/lapack-netlib/TESTING/LIN/zdrvpt.f @@ -266,12 +266,12 @@ SUBROUTINE ZDRVPT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, A, D, * IA = 1 DO 20 I = 1, N - 1 - D( I ) = A( IA ) + D( I ) = DBLE( A( IA ) ) E( I ) = A( IA+1 ) IA = IA + 2 20 CONTINUE IF( N.GT.0 ) - $ D( N ) = A( IA ) + $ D( N ) = DBLE( A( IA ) ) ELSE * * Type 7-12: generate a diagonally dominant matrix with @@ -333,13 +333,13 @@ SUBROUTINE ZDRVPT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, A, D, Z( 2 ) = D( 1 ) D( 1 ) = ZERO IF( N.GT.1 ) THEN - Z( 3 ) = E( 1 ) + Z( 3 ) = DBLE( E( 1 ) ) E( 1 ) = ZERO END IF ELSE IF( IMAT.EQ.9 ) THEN IZERO = N IF( N.GT.1 ) THEN - Z( 1 ) = E( N-1 ) + Z( 1 ) = DBLE( E( N-1 ) ) E( N-1 ) = ZERO END IF Z( 2 ) = D( N ) @@ -347,9 +347,9 @@ SUBROUTINE ZDRVPT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, A, D, ELSE IF( IMAT.EQ.10 ) THEN IZERO = ( N+1 ) / 2 IF( IZERO.GT.1 ) THEN - Z( 1 ) = E( IZERO-1 ) + Z( 1 ) = DBLE( E( IZERO-1 ) ) E( IZERO-1 ) = ZERO - Z( 3 ) = E( IZERO ) + Z( 3 ) = DBLE( E( IZERO ) ) E( IZERO ) = ZERO END IF Z( 2 ) = D( IZERO ) diff --git a/lapack-netlib/TESTING/LIN/zdrvrf3.f b/lapack-netlib/TESTING/LIN/zdrvrf3.f index 7a44dba29f..4e55b03ef7 100644 --- a/lapack-netlib/TESTING/LIN/zdrvrf3.f +++ b/lapack-netlib/TESTING/LIN/zdrvrf3.f @@ 
-156,9 +156,10 @@ SUBROUTINE ZDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, DOUBLE PRECISION RESULT( NTESTS ) * .. * .. External Functions .. + LOGICAL LSAME DOUBLE PRECISION DLAMCH, ZLANGE COMPLEX*16 ZLARND - EXTERNAL DLAMCH, ZLARND, ZLANGE + EXTERNAL DLAMCH, ZLARND, ZLANGE, LSAME * .. * .. External Subroutines .. EXTERNAL ZTRTTF, ZGEQRF, ZGEQLF, ZTFSM, ZTRSM @@ -222,9 +223,9 @@ SUBROUTINE ZDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, * DO 100 IALPHA = 1, 3 * - IF ( IALPHA.EQ. 1) THEN + IF ( IALPHA.EQ.1 ) THEN ALPHA = ZERO - ELSE IF ( IALPHA.EQ. 2) THEN + ELSE IF ( IALPHA.EQ.2 ) THEN ALPHA = ONE ELSE ALPHA = ZLARND( 4, ISEED ) @@ -263,7 +264,7 @@ SUBROUTINE ZDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, * DO J = 1, NA DO I = 1, NA - A( I, J) = ZLARND( 4, ISEED ) + A( I, J ) = ZLARND( 4, ISEED ) END DO END DO * @@ -276,6 +277,20 @@ SUBROUTINE ZDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, CALL ZGEQRF( NA, NA, A, LDA, TAU, + Z_WORK_ZGEQRF, LDA, + INFO ) +* +* Forcing main diagonal of test matrix to +* be unit makes it ill-conditioned for +* some test cases +* + IF ( LSAME( DIAG, 'U' ) ) THEN + DO J = 1, NA + DO I = 1, J + A( I, J ) = A( I, J ) / + + ( 2.0 * A( J, J ) ) + END DO + END DO + END IF +* ELSE * * The case IUPLO.EQ.2 is when SIDE.EQ.'L' @@ -285,6 +300,20 @@ SUBROUTINE ZDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, CALL ZGELQF( NA, NA, A, LDA, TAU, + Z_WORK_ZGEQRF, LDA, + INFO ) +* +* Forcing main diagonal of test matrix to +* be unit makes it ill-conditioned for +* some test cases +* + IF ( LSAME( DIAG, 'U' ) ) THEN + DO I = 1, NA + DO J = 1, I + A( I, J ) = A( I, J ) / + + ( 2.0 * A( I, I ) ) + END DO + END DO + END IF +* END IF * * After the QR factorization, the diagonal @@ -293,7 +322,8 @@ SUBROUTINE ZDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, * value 1.0E+00. 
* DO J = 1, NA - A( J, J) = A(J,J) * ZLARND( 5, ISEED ) + A( J, J ) = A( J, J ) * + + ZLARND( 5, ISEED ) END DO * * Store a copy of A in RFP format (in ARF). @@ -307,8 +337,8 @@ SUBROUTINE ZDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, * DO J = 1, N DO I = 1, M - B1( I, J) = ZLARND( 4, ISEED ) - B2( I, J) = B1( I, J) + B1( I, J ) = ZLARND( 4, ISEED ) + B2( I, J ) = B1( I, J ) END DO END DO * @@ -331,24 +361,24 @@ SUBROUTINE ZDRVRF3( NOUT, NN, NVAL, THRESH, A, LDA, ARF, B1, B2, * DO J = 1, N DO I = 1, M - B1( I, J) = B2( I, J ) - B1( I, J ) + B1( I, J ) = B2( I, J ) - B1( I, J ) END DO END DO * - RESULT(1) = ZLANGE( 'I', M, N, B1, LDA, + RESULT( 1 ) = ZLANGE( 'I', M, N, B1, LDA, + D_WORK_ZLANGE ) * - RESULT(1) = RESULT(1) / SQRT( EPS ) - + / MAX ( MAX( M, N), 1 ) + RESULT( 1 ) = RESULT( 1 ) / SQRT( EPS ) + + / MAX ( MAX( M, N ), 1 ) * - IF( RESULT(1).GE.THRESH ) THEN + IF( RESULT( 1 ).GE.THRESH ) THEN IF( NFAIL.EQ.0 ) THEN WRITE( NOUT, * ) WRITE( NOUT, FMT = 9999 ) END IF WRITE( NOUT, FMT = 9997 ) 'ZTFSM', + CFORM, SIDE, UPLO, TRANS, DIAG, M, - + N, RESULT(1) + + N, RESULT( 1 ) NFAIL = NFAIL + 1 END IF * diff --git a/lapack-netlib/TESTING/LIN/zerrls.f b/lapack-netlib/TESTING/LIN/zerrls.f index 66e56c8c6b..22f049ee06 100644 --- a/lapack-netlib/TESTING/LIN/zerrls.f +++ b/lapack-netlib/TESTING/LIN/zerrls.f @@ -22,7 +22,7 @@ *> \verbatim *> *> ZERRLS tests the error exits for the COMPLEX*16 least squares -*> driver routines (ZGELS, CGELSS, CGELSY, CGELSD). +*> driver routines (ZGELS, ZGELST, ZGETSLS, CGELSS, CGELSY, CGELSD). *> \endverbatim * * Arguments: @@ -83,7 +83,8 @@ SUBROUTINE ZERRLS( PATH, NUNIT ) EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, ZGELS, ZGELSD, ZGELSS, ZGELSY + EXTERNAL ALAESM, CHKXER, ZGELS, ZGELSD, ZGELSS, ZGELST, + $ ZGELSY, ZGETSLS * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -130,10 +131,66 @@ SUBROUTINE ZERRLS( PATH, NUNIT ) INFOT = 8 CALL ZGELS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) CALL CHKXER( 'ZGELS ', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGELS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'ZGELS', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGELS( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) CALL CHKXER( 'ZGELS ', INFOT, NOUT, LERR, OK ) * +* ZGELST +* + SRNAMT = 'ZGELST' + INFOT = 1 + CALL ZGELST( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGELST( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGELST( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGELST( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZGELST( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGELST( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGELST( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGELST( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) +* +* ZGETSLS +* + SRNAMT = 'ZGETSLS' + INFOT = 1 + CALL ZGETSLS( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGETSLS( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGETSLS( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGETSLS( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZGETSLS( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 
+ CALL ZGETSLS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGETSLS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) +* * ZGELSS * SRNAMT = 'ZGELSS' diff --git a/lapack-netlib/TESTING/LIN/zerrtr.f b/lapack-netlib/TESTING/LIN/zerrtr.f index 098040ace3..211b921540 100644 --- a/lapack-netlib/TESTING/LIN/zerrtr.f +++ b/lapack-netlib/TESTING/LIN/zerrtr.f @@ -82,9 +82,10 @@ SUBROUTINE ZERRTR( PATH, NUNIT ) EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, ZLATBS, ZLATPS, ZLATRS, ZTBCON, - $ ZTBRFS, ZTBTRS, ZTPCON, ZTPRFS, ZTPTRI, ZTPTRS, - $ ZTRCON, ZTRRFS, ZTRTI2, ZTRTRI, ZTRTRS + EXTERNAL ALAESM, CHKXER, ZLATBS, ZLATPS, ZLATRS, + $ ZLATRS3, ZTBCON, ZTBRFS, ZTBTRS, ZTPCON, + $ ZTPRFS, ZTPTRI, ZTPTRS, ZTRCON, ZTRRFS, ZTRTI2, + $ ZTRTRI, ZTRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -240,6 +241,46 @@ SUBROUTINE ZERRTR( PATH, NUNIT ) CALL ZLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, RW, INFO ) CALL CHKXER( 'ZLATRS', INFOT, NOUT, LERR, OK ) * +* ZLATRS3 +* + SRNAMT = 'ZLATRS3' + INFOT = 1 + CALL ZLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 
'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 0, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) +* * Test error exits for the packed triangular routines. * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN diff --git a/lapack-netlib/TESTING/LIN/zlattp.f b/lapack-netlib/TESTING/LIN/zlattp.f index b728852b5a..e05d9299e8 100644 --- a/lapack-netlib/TESTING/LIN/zlattp.f +++ b/lapack-netlib/TESTING/LIN/zlattp.f @@ -336,7 +336,7 @@ SUBROUTINE ZLATTP( IMAT, UPLO, TRANS, DIAG, ISEED, N, AP, B, WORK, WORK( J+1 ) = PLUS2 WORK( N+J+1 ) = ZERO PLUS1 = STAR1 / PLUS2 - REXP = ZLARND( 2, ISEED ) + REXP = DBLE( ZLARND( 2, ISEED ) ) IF( REXP.LT.ZERO ) THEN STAR1 = -SFAC**( ONE-REXP )*ZLARND( 5, ISEED ) ELSE @@ -790,7 +790,7 @@ SUBROUTINE ZLATTP( IMAT, UPLO, TRANS, DIAG, ISEED, N, AP, B, WORK, DO 460 J = 1, N / 2 JL = JJ DO 450 I = J, N - J - T = AP( JR-I+J ) + T = DBLE( AP( JR-I+J ) ) AP( JR-I+J ) = AP( JL ) AP( JL ) = T JL = JL + I @@ -804,7 +804,7 @@ SUBROUTINE ZLATTP( IMAT, UPLO, TRANS, DIAG, ISEED, N, AP, B, WORK, DO 480 J = 1, N / 2 JR = JJ DO 470 I = J, N - J - T = AP( JL+I-J ) + T = DBLE( AP( JL+I-J ) ) AP( JL+I-J ) = AP( JR ) AP( JR ) = T JR = JR - I diff --git a/lapack-netlib/TESTING/LIN/zpbt01.f b/lapack-netlib/TESTING/LIN/zpbt01.f index fb7881ac7e..1801b66cff 100644 --- a/lapack-netlib/TESTING/LIN/zpbt01.f +++ b/lapack-netlib/TESTING/LIN/zpbt01.f @@ -201,7 +201,8 @@ SUBROUTINE ZPBT01( UPLO, N, KD, A, LDA, AFAC, LDAFAC, RWORK, * * Compute the (K,K) element of the result. 
* - AKK = ZDOTC( KLEN+1, AFAC( KC, K ), 1, AFAC( KC, K ), 1 ) + AKK = DBLE( + $ ZDOTC( KLEN+1, AFAC( KC, K ), 1, AFAC( KC, K ), 1 ) ) AFAC( KD+1, K ) = AKK * * Compute the rest of column K. @@ -228,7 +229,7 @@ SUBROUTINE ZPBT01( UPLO, N, KD, A, LDA, AFAC, LDAFAC, RWORK, * * Scale column K by the diagonal element. * - AKK = AFAC( 1, K ) + AKK = DBLE( AFAC( 1, K ) ) CALL ZDSCAL( KLEN+1, AKK, AFAC( 1, K ), 1 ) * 40 CONTINUE diff --git a/lapack-netlib/TESTING/LIN/zpot01.f b/lapack-netlib/TESTING/LIN/zpot01.f index d71445cd42..de83414c63 100644 --- a/lapack-netlib/TESTING/LIN/zpot01.f +++ b/lapack-netlib/TESTING/LIN/zpot01.f @@ -176,7 +176,7 @@ SUBROUTINE ZPOT01( UPLO, N, A, LDA, AFAC, LDAFAC, RWORK, RESID ) * * Compute the (K,K) element of the result. * - TR = ZDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) + TR = DBLE( ZDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) ) AFAC( K, K ) = TR * * Compute the rest of column K. @@ -224,7 +224,7 @@ SUBROUTINE ZPOT01( UPLO, N, A, LDA, AFAC, LDAFAC, RWORK, RESID ) 70 CONTINUE END IF * -* Compute norm( L*U - A ) / ( N * norm(A) * EPS ) +* Compute norm(L*U - A) / ( N * norm(A) * EPS ) * RESID = ZLANHE( '1', UPLO, N, AFAC, LDAFAC, RWORK ) * diff --git a/lapack-netlib/TESTING/LIN/zppt01.f b/lapack-netlib/TESTING/LIN/zppt01.f index 78ec595af4..acaea50d20 100644 --- a/lapack-netlib/TESTING/LIN/zppt01.f +++ b/lapack-netlib/TESTING/LIN/zppt01.f @@ -178,7 +178,7 @@ SUBROUTINE ZPPT01( UPLO, N, A, AFAC, RWORK, RESID ) * * Compute the (K,K) element of the result. * - TR = ZDOTC( K, AFAC( KC ), 1, AFAC( KC ), 1 ) + TR = DBLE( ZDOTC( K, AFAC( KC ), 1, AFAC( KC ), 1 ) ) AFAC( KC+K-1 ) = TR * * Compute the rest of column K. diff --git a/lapack-netlib/TESTING/LIN/zpst01.f b/lapack-netlib/TESTING/LIN/zpst01.f index 6918572197..bed18c514d 100644 --- a/lapack-netlib/TESTING/LIN/zpst01.f +++ b/lapack-netlib/TESTING/LIN/zpst01.f @@ -219,7 +219,7 @@ SUBROUTINE ZPST01( UPLO, N, A, LDA, AFAC, LDAFAC, PERM, LDPERM, * * Compute the (K,K) element of the result. 
* - TR = ZDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) + TR = DBLE( ZDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) ) AFAC( K, K ) = TR * * Compute the rest of column K. diff --git a/lapack-netlib/TESTING/MATGEN/Makefile b/lapack-netlib/TESTING/MATGEN/Makefile index 62a215b58f..822a1eee07 100644 --- a/lapack-netlib/TESTING/MATGEN/Makefile +++ b/lapack-netlib/TESTING/MATGEN/Makefile @@ -40,27 +40,40 @@ ifneq ($(C_LAPACK), 1) $(FC) $(FFLAGS) -c -o $@ $< endif +ifneq "$(or $(BUILD_SINGLE),$(BUILD_COMPLEX))" "" SCATGEN = slatm1.o slatm7.o slaran.o slarnd.o +endif +ifeq ($(BUILD_SINGLE),1) SMATGEN = slatms.o slatme.o slatmr.o slatmt.o \ slagge.o slagsy.o slakf2.o slarge.o slaror.o slarot.o slatm2.o \ slatm3.o slatm5.o slatm6.o slahilb.o +endif +ifeq ($(BUILD_COMPLEX),1) CMATGEN = clatms.o clatme.o clatmr.o clatmt.o \ clagge.o claghe.o clagsy.o clakf2.o clarge.o claror.o clarot.o \ clatm1.o clarnd.o clatm2.o clatm3.o clatm5.o clatm6.o clahilb.o +endif +ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" DZATGEN = dlatm1.o dlatm7.o dlaran.o dlarnd.o +endif +ifeq ($(BUILD_DOUBLE),1) DMATGEN = dlatms.o dlatme.o dlatmr.o dlatmt.o \ dlagge.o dlagsy.o dlakf2.o dlarge.o dlaror.o dlarot.o dlatm2.o \ dlatm3.o dlatm5.o dlatm6.o dlahilb.o +endif +ifeq ($(BUILD_COMPLEX16),1) ZMATGEN = zlatms.o zlatme.o zlatmr.o zlatmt.o \ zlagge.o zlaghe.o zlagsy.o zlakf2.o zlarge.o zlaror.o zlarot.o \ zlatm1.o zlarnd.o zlatm2.o zlatm3.o zlatm5.o zlatm6.o zlahilb.o +endif .PHONY: all +.NOTPARALLEL: all: $(TMGLIB) ALLOBJ = $(SMATGEN) $(CMATGEN) $(SCATGEN) $(DMATGEN) $(ZMATGEN) \ @@ -107,9 +120,17 @@ cleanlib: rm -f $(TMGLIB) ifneq ($(C_LAPACK), 1) +ifeq ($(filter $(BUILD_SINGLE) $(BUILD_COMPLEX),1),) slaran.o: slaran.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $< +endif +ifeq ($(filter $(BUILD_DOUBLE) $(BUILD_COMPLEX16),1),) dlaran.o: dlaran.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $< +endif else +ifeq ($(filter $(BUILD_SINGLE) $(BUILD_COMPLEX),1),) slaran.o: slaran.c ; $(CC) $(CFLAGS) -O0 -c -o $@ $< +endif +ifeq 
($(filter $(BUILD_DOUBLE) $(BUILD_COMPLEX16),1),) dlaran.o: dlaran.c ; $(CC) $(CFLAGS) -O0 -c -o $@ $< endif +endif diff --git a/lapack-netlib/TESTING/MATGEN/clagge.c b/lapack-netlib/TESTING/MATGEN/clagge.c index 838df0c029..f05905bd72 100644 --- a/lapack-netlib/TESTING/MATGEN/clagge.c +++ b/lapack-netlib/TESTING/MATGEN/clagge.c @@ -627,7 +627,7 @@ static integer c__1 = 1; /* > \ingroup complex_matgen */ /* ===================================================================== */ -/* Subroutine */ int clagge_(integer *m, integer *n, integer *kl, integer *ku, +/* Subroutine */ void clagge_(integer *m, integer *n, integer *kl, integer *ku, real *d__, complex *a, integer *lda, integer *iseed, complex *work, integer *info) { @@ -638,16 +638,17 @@ static integer c__1 = 1; /* Local variables */ integer i__, j; - extern /* Subroutine */ int cgerc_(integer *, integer *, complex *, + extern /* Subroutine */ void cgerc_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *), cscal_(integer *, complex *, complex *, integer *), cgemv_(char * , integer *, integer *, complex *, complex *, integer *, complex * , integer *, complex *, complex *, integer *); extern real scnrm2_(integer *, complex *, integer *); complex wa, wb; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *); real wn; - extern /* Subroutine */ int xerbla_(char *, integer *), clarnv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void clarnv_( integer *, integer *, integer *, complex *); complex tau; @@ -686,8 +687,8 @@ static integer c__1 = 1; } if (*info < 0) { i__1 = -(*info); - xerbla_("CLAGGE", &i__1); - return 0; + xerbla_("CLAGGE", &i__1, 6); + return; } /* initialize A to diagonal matrix */ @@ -713,7 +714,7 @@ static integer c__1 = 1; /* Quick exit if the user wants a diagonal matrix */ if (*kl == 0 && *ku == 0) { - return 0; + return; } /* 
pre- and post-multiply A by random unitary matrices */ @@ -1024,7 +1025,7 @@ static integer c__1 = 1; } /* L70: */ } - return 0; + return; /* End of CLAGGE */ diff --git a/lapack-netlib/TESTING/MATGEN/claghe.c b/lapack-netlib/TESTING/MATGEN/claghe.c index 084a3264e9..77ed949125 100644 --- a/lapack-netlib/TESTING/MATGEN/claghe.c +++ b/lapack-netlib/TESTING/MATGEN/claghe.c @@ -615,7 +615,7 @@ static integer c__1 = 1; /* > \ingroup complex_matgen */ /* ===================================================================== */ -/* Subroutine */ int claghe_(integer *n, integer *k, real *d__, complex *a, +/* Subroutine */ void claghe_(integer *n, integer *k, real *d__, complex *a, integer *lda, integer *iseed, complex *work, integer *info) { /* System generated locals */ @@ -624,17 +624,17 @@ static integer c__1 = 1; complex q__1, q__2, q__3, q__4; /* Local variables */ - extern /* Subroutine */ int cher2_(char *, integer *, complex *, complex * + extern /* Subroutine */ void cher2_(char *, integer *, complex *, complex * , integer *, complex *, integer *, complex *, integer *); integer i__, j; - extern /* Subroutine */ int cgerc_(integer *, integer *, complex *, + extern /* Subroutine */ void cgerc_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *); complex alpha; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer *, complex *, integer *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *), chemv_(char *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, @@ -643,7 +643,8 @@ static integer c__1 = 1; extern real scnrm2_(integer *, 
complex *, integer *); complex wa, wb; real wn; - extern /* Subroutine */ int xerbla_(char *, integer *), clarnv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void clarnv_( integer *, integer *, integer *, complex *); complex tau; @@ -678,8 +679,8 @@ static integer c__1 = 1; } if (*info < 0) { i__1 = -(*info); - xerbla_("CLAGHE", &i__1); - return 0; + xerbla_("CLAGHE", &i__1, 6); + return; } /* initialize lower triangle of A to diagonal matrix */ @@ -857,7 +858,7 @@ static integer c__1 = 1; } /* L80: */ } - return 0; + return; /* End of CLAGHE */ diff --git a/lapack-netlib/TESTING/MATGEN/clagsy.c b/lapack-netlib/TESTING/MATGEN/clagsy.c index 908bdc4a8a..de215bc634 100644 --- a/lapack-netlib/TESTING/MATGEN/clagsy.c +++ b/lapack-netlib/TESTING/MATGEN/clagsy.c @@ -615,7 +615,7 @@ static integer c__1 = 1; /* > \ingroup complex_matgen */ /* ===================================================================== */ -/* Subroutine */ int clagsy_(integer *n, integer *k, real *d__, complex *a, +/* Subroutine */ void clagsy_(integer *n, integer *k, real *d__, complex *a, integer *lda, integer *iseed, complex *work, integer *info) { /* System generated locals */ @@ -626,14 +626,14 @@ static integer c__1 = 1; /* Local variables */ integer i__, j; - extern /* Subroutine */ int cgerc_(integer *, integer *, complex *, + extern /* Subroutine */ void cgerc_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *); complex alpha; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); extern /* Complex */ VOID cdotc_(complex *, integer *, complex *, integer *, complex *, integer *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *), 
caxpy_(integer *, complex *, complex *, integer *, complex *, integer *), csymv_(char *, integer *, @@ -642,9 +642,10 @@ static integer c__1 = 1; extern real scnrm2_(integer *, complex *, integer *); integer ii, jj; complex wa, wb; - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *); real wn; - extern /* Subroutine */ int xerbla_(char *, integer *), clarnv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void clarnv_( integer *, integer *, integer *, complex *); complex tau; @@ -679,8 +680,8 @@ static integer c__1 = 1; } if (*info < 0) { i__1 = -(*info); - xerbla_("CLAGSY", &i__1); - return 0; + xerbla_("CLAGSY", &i__1, 6); + return; } /* initialize lower triangle of A to diagonal matrix */ @@ -910,7 +911,7 @@ static integer c__1 = 1; } /* L120: */ } - return 0; + return; /* End of CLAGSY */ diff --git a/lapack-netlib/TESTING/MATGEN/clahilb.c b/lapack-netlib/TESTING/MATGEN/clahilb.c index 4c4fe1bb92..95b00fb3e5 100644 --- a/lapack-netlib/TESTING/MATGEN/clahilb.c +++ b/lapack-netlib/TESTING/MATGEN/clahilb.c @@ -645,7 +645,7 @@ static complex c_b6 = {0.f,0.f}; /* > \ingroup complex_matgen */ /* ===================================================================== */ -/* Subroutine */ int clahilb_(integer *n, integer *nrhs, complex *a, integer * +/* Subroutine */ void clahilb_(integer *n, integer *nrhs, complex *a, integer * lda, complex *x, integer *ldx, complex *b, integer *ldb, real *work, integer *info, char *path) { @@ -670,9 +670,9 @@ static complex c_b6 = {0.f,0.f}; integer i__, j, m, r__; char c2[2]; integer ti, tm; - extern /* Subroutine */ int claset_(char *, integer *, integer *, complex - *, complex *, complex *, integer *), xerbla_(char *, - integer *); + extern /* Subroutine */ void claset_(char *, integer *, integer *, complex + *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); extern logical 
lsamen_(integer *, char *, char *); complex tmp; @@ -722,8 +722,8 @@ static complex c_b6 = {0.f,0.f}; } if (*info < 0) { i__1 = -(*info); - xerbla_("CLAHILB", &i__1); - return 0; + xerbla_("CLAHILB", &i__1, 7); + return; } if (*n > 6) { *info = 1; @@ -830,6 +830,6 @@ static complex c_b6 = {0.f,0.f}; } } } - return 0; + return; } /* clahilb_ */ diff --git a/lapack-netlib/TESTING/MATGEN/clakf2.c b/lapack-netlib/TESTING/MATGEN/clakf2.c index e24f47f2d0..e15bc58abd 100644 --- a/lapack-netlib/TESTING/MATGEN/clakf2.c +++ b/lapack-netlib/TESTING/MATGEN/clakf2.c @@ -615,7 +615,7 @@ static complex c_b1 = {0.f,0.f}; /* > \ingroup complex_matgen */ /* ===================================================================== */ -/* Subroutine */ int clakf2_(integer *m, integer *n, complex *a, integer *lda, +/* Subroutine */ void clakf2_(integer *m, integer *n, complex *a, integer *lda, complex *b, complex *d__, complex *e, complex *z__, integer *ldz) { /* System generated locals */ @@ -625,7 +625,7 @@ static complex c_b1 = {0.f,0.f}; /* Local variables */ integer i__, j, l, ik, jk, mn; - extern /* Subroutine */ int claset_(char *, integer *, integer *, complex + extern /* Subroutine */ void claset_(char *, integer *, integer *, complex *, complex *, complex *, integer *); integer mn2; @@ -737,7 +737,7 @@ static complex c_b1 = {0.f,0.f}; /* L90: */ } - return 0; + return; /* End of CLAKF2 */ diff --git a/lapack-netlib/TESTING/MATGEN/clarge.c b/lapack-netlib/TESTING/MATGEN/clarge.c index 299ec93087..0215436b71 100644 --- a/lapack-netlib/TESTING/MATGEN/clarge.c +++ b/lapack-netlib/TESTING/MATGEN/clarge.c @@ -600,7 +600,7 @@ static integer c__1 = 1; /* > \ingroup complex_matgen */ /* ===================================================================== */ -/* Subroutine */ int clarge_(integer *n, complex *a, integer *lda, integer * +/* Subroutine */ void clarge_(integer *n, complex *a, integer *lda, integer * iseed, complex *work, integer *info) { /* System generated locals */ @@ 
-610,7 +610,7 @@ static integer c__1 = 1; /* Local variables */ integer i__; - extern /* Subroutine */ int cgerc_(integer *, integer *, complex *, + extern /* Subroutine */ void cgerc_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *), cscal_(integer *, complex *, complex *, integer *), cgemv_(char * , integer *, integer *, complex *, complex *, integer *, complex * @@ -618,7 +618,8 @@ static integer c__1 = 1; extern real scnrm2_(integer *, complex *, integer *); complex wa, wb; real wn; - extern /* Subroutine */ int xerbla_(char *, integer *), clarnv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void clarnv_( integer *, integer *, integer *, complex *); complex tau; @@ -650,8 +651,8 @@ static integer c__1 = 1; } if (*info < 0) { i__1 = -(*info); - xerbla_("CLARGE", &i__1); - return 0; + xerbla_("CLARGE", &i__1, 6); + return; } /* pre- and post-multiply A by random unitary matrix */ @@ -702,7 +703,7 @@ static integer c__1 = 1; * a_dim1 + 1], lda); /* L10: */ } - return 0; + return; /* End of CLARGE */ diff --git a/lapack-netlib/TESTING/MATGEN/claror.c b/lapack-netlib/TESTING/MATGEN/claror.c index 1fb287af32..cd0d15300d 100644 --- a/lapack-netlib/TESTING/MATGEN/claror.c +++ b/lapack-netlib/TESTING/MATGEN/claror.c @@ -672,7 +672,7 @@ t by U and the right by UC> SIDE = 'T' Multiply A on the left by U and t /* > \ingroup complex_matgen */ /* ===================================================================== */ -/* Subroutine */ int claror_(char *side, char *init, integer *m, integer *n, +/* Subroutine */ void claror_(char *side, char *init, integer *m, integer *n, complex *a, integer *lda, integer *iseed, complex *x, integer *info) { /* System generated locals */ @@ -683,23 +683,23 @@ t by U and the right by UC> SIDE = 'T' Multiply A on the left by U and t integer kbeg, jcol; real xabs; integer irow, j; - extern /* Subroutine */ int cgerc_(integer *, integer *, complex *, + extern /* 
Subroutine */ void cgerc_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *), cscal_(integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , integer *); complex csign; integer ixfrm, itype, nxfrm; real xnorm; extern real scnrm2_(integer *, complex *, integer *); - extern /* Subroutine */ int clacgv_(integer *, complex *, integer *); + extern /* Subroutine */ void clacgv_(integer *, complex *, integer *); //extern /* Complex */ VOID clarnd_(complex *, integer *, integer *); extern complex clarnd_(integer *, integer *); - extern /* Subroutine */ int claset_(char *, integer *, integer *, complex - *, complex *, complex *, integer *), xerbla_(char *, - integer *); + extern /* Subroutine */ void claset_(char *, integer *, integer *, complex + *, complex *, complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); real factor; complex xnorms; @@ -723,7 +723,7 @@ t by U and the right by UC> SIDE = 'T' Multiply A on the left by U and t /* Function Body */ *info = 0; if (*n == 0 || *m == 0) { - return 0; + return; } itype = 0; @@ -750,8 +750,8 @@ t by U and the right by UC> SIDE = 'T' Multiply A on the left by U and t } if (*info != 0) { i__1 = -(*info); - xerbla_("CLAROR", &i__1); - return 0; + xerbla_("CLAROR", &i__1, 6); + return; } if (itype == 1) { @@ -815,8 +815,8 @@ t by U and the right by UC> SIDE = 'T' Multiply A on the left by U and t if (abs(factor) < 1e-20f) { *info = 1; i__2 = -(*info); - xerbla_("CLAROR", &i__2); - return 0; + xerbla_("CLAROR", &i__2, 6); + return; } else { factor = 1.f / factor; } @@ -899,7 +899,7 @@ t by U and the right by UC> SIDE = 'T' Multiply A on the left by U and t /* L90: */ } } - return 0; + return; /* End of CLAROR */ diff 
--git a/lapack-netlib/TESTING/MATGEN/clarot.c b/lapack-netlib/TESTING/MATGEN/clarot.c index 9bc295062d..f1e73ed5be 100644 --- a/lapack-netlib/TESTING/MATGEN/clarot.c +++ b/lapack-netlib/TESTING/MATGEN/clarot.c @@ -739,7 +739,7 @@ static integer c__8 = 8; /* > \ingroup complex_matgen */ /* ===================================================================== */ -/* Subroutine */ int clarot_(logical *lrows, logical *lleft, logical *lright, +/* Subroutine */ void clarot_(logical *lrows, logical *lleft, logical *lright, integer *nl, complex *c__, complex *s, complex *a, integer *lda, complex *xleft, complex *xright) { @@ -752,7 +752,7 @@ static integer c__8 = 8; complex tempx; integer ix, iy, nt; complex xt[2], yt[2]; - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer iyt; @@ -804,12 +804,12 @@ static integer c__8 = 8; /* Check for errors */ if (*nl < nt) { - xerbla_("CLAROT", &c__4); - return 0; + xerbla_("CLAROT", &c__4, 6); + return; } if (*lda <= 0 || ! 
(*lrows) && *lda < *nl - nt) { - xerbla_("CLAROT", &c__8); - return 0; + xerbla_("CLAROT", &c__8, 6); + return; } /* Rotate */ @@ -887,7 +887,7 @@ static integer c__8 = 8; a[i__1].r = yt[i__2].r, a[i__1].i = yt[i__2].i; } - return 0; + return; /* End of CLAROT */ diff --git a/lapack-netlib/TESTING/MATGEN/clatm1.c b/lapack-netlib/TESTING/MATGEN/clatm1.c index a2a62a5f47..665b160571 100644 --- a/lapack-netlib/TESTING/MATGEN/clatm1.c +++ b/lapack-netlib/TESTING/MATGEN/clatm1.c @@ -647,7 +647,7 @@ static integer c__3 = 3; /* > \ingroup complex_matgen */ /* ===================================================================== */ -/* Subroutine */ int clatm1_(integer *mode, real *cond, integer *irsign, +/* Subroutine */ void clatm1_(integer *mode, real *cond, integer *irsign, integer *idist, integer *iseed, complex *d__, integer *n, integer * info) { @@ -664,9 +664,9 @@ static integer c__3 = 3; complex ctemp; //extern /* Complex */ VOID clarnd_(complex *, integer *, integer *); extern complex clarnd_(integer *, integer *); - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern real slaran_(integer *); - extern /* Subroutine */ int clarnv_(integer *, integer *, integer *, + extern /* Subroutine */ void clarnv_(integer *, integer *, integer *, complex *); @@ -691,7 +691,7 @@ static integer c__3 = 3; /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Set INFO if an error */ @@ -711,8 +711,8 @@ static integer c__3 = 3; if (*info != 0) { i__1 = -(*info); - xerbla_("CLATM1", &i__1); - return 0; + xerbla_("CLATM1", &i__1, 6); + return; } /* Compute D according to COND and MODE */ @@ -848,7 +848,7 @@ static integer c__3 = 3; } - return 0; + return; /* End of CLATM1 */ diff --git a/lapack-netlib/TESTING/MATGEN/clatm5.c b/lapack-netlib/TESTING/MATGEN/clatm5.c index dbd1ee642b..c2b81ccf3a 100644 --- a/lapack-netlib/TESTING/MATGEN/clatm5.c +++ b/lapack-netlib/TESTING/MATGEN/clatm5.c @@ 
-778,7 +778,7 @@ static complex c_b5 = {20.f,0.f}; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int clatm5_(integer *prtype, integer *m, integer *n, complex +/* Subroutine */ void clatm5_(integer *prtype, integer *m, integer *n, complex *a, integer *lda, complex *b, integer *ldb, complex *c__, integer * ldc, complex *d__, integer *ldd, complex *e, integer *lde, complex *f, integer *ldf, complex *r__, integer *ldr, complex *l, integer *ldl, @@ -793,7 +793,7 @@ static complex c_b5 = {20.f,0.f}; /* Local variables */ integer i__, j, k; - extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgemm_(char *, char *, integer *, integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, complex *, integer *); complex imeps, reeps; @@ -1277,6 +1277,6 @@ static complex c_b5 = {20.f,0.f}; /* End of CLATM5 */ - return 0; + return; } /* clatm5_ */ diff --git a/lapack-netlib/TESTING/MATGEN/clatm6.c b/lapack-netlib/TESTING/MATGEN/clatm6.c index 54c0af9dab..96a3bd290e 100644 --- a/lapack-netlib/TESTING/MATGEN/clatm6.c +++ b/lapack-netlib/TESTING/MATGEN/clatm6.c @@ -686,7 +686,7 @@ static integer c__24 = 24; /* > \ingroup complex_matgen */ /* ===================================================================== */ -/* Subroutine */ int clatm6_(integer *type__, integer *n, complex *a, integer +/* Subroutine */ void clatm6_(integer *type__, integer *n, complex *a, integer *lda, complex *b, complex *x, integer *ldx, complex *y, integer *ldy, complex *alpha, complex *beta, complex *wx, complex *wy, real *s, real *dif) @@ -702,10 +702,10 @@ static integer c__24 = 24; complex work[26]; integer i__, j; complex z__[64] /* was [8][8] */; - extern /* Subroutine */ int clakf2_(integer *, integer *, complex *, + extern /* Subroutine */ void clakf2_(integer *, integer *, complex *, integer *, complex *, complex *, complex *, complex *, 
integer *); real rwork[50]; - extern /* Subroutine */ int cgesvd_(char *, char *, integer *, integer *, + extern /* Subroutine */ void cgesvd_(char *, char *, integer *, integer *, complex *, integer *, real *, complex *, integer *, complex *, integer *, complex *, integer *, real *, integer *), clacpy_(char *, integer *, integer *, complex *, integer *, complex *, integer *); @@ -931,7 +931,7 @@ static integer c__24 = 24; &c__1, &work[2], &c__24, &rwork[8], &info); dif[5] = rwork[7]; - return 0; + return; /* End of CLATM6 */ diff --git a/lapack-netlib/TESTING/MATGEN/clatme.c b/lapack-netlib/TESTING/MATGEN/clatme.c index 72effbf9a0..a905f5608d 100644 --- a/lapack-netlib/TESTING/MATGEN/clatme.c +++ b/lapack-netlib/TESTING/MATGEN/clatme.c @@ -811,7 +811,7 @@ static integer c__5 = 5; /* > \ingroup complex_matgen */ /* ===================================================================== */ -/* Subroutine */ int clatme_(integer *n, char *dist, integer *iseed, complex * +/* Subroutine */ void clatme_(integer *n, char *dist, integer *iseed, complex * d__, integer *mode, real *cond, complex *dmax__, char *rsign, char * upper, char *sim, real *ds, integer *modes, real *conds, integer *kl, integer *ku, real *anorm, complex *a, integer *lda, complex *work, @@ -827,22 +827,22 @@ static integer c__5 = 5; integer isim; real temp; integer i__, j; - extern /* Subroutine */ int cgerc_(integer *, integer *, complex *, + extern /* Subroutine */ void cgerc_(integer *, integer *, complex *, complex *, integer *, complex *, integer *, complex *, integer *); complex alpha; - extern /* Subroutine */ int cscal_(integer *, complex *, complex *, + extern /* Subroutine */ void cscal_(integer *, complex *, complex *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int cgemv_(char *, integer *, integer *, complex * + extern /* Subroutine */ void cgemv_(char *, integer *, integer *, complex * , complex *, integer *, complex *, integer *, complex *, complex * , 
integer *); integer iinfo; real tempa[1]; integer icols, idist; - extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + extern /* Subroutine */ void ccopy_(integer *, complex *, integer *, complex *, integer *); integer irows; - extern /* Subroutine */ int clatm1_(integer *, real *, integer *, integer + extern /* Subroutine */ void clatm1_(integer *, real *, integer *, integer *, integer *, complex *, integer *, integer *), slatm1_(integer *, real *, integer *, integer *, integer *, real *, integer *, integer *); @@ -850,17 +850,18 @@ static integer c__5 = 5; extern real clange_(char *, integer *, integer *, complex *, integer *, real *); integer ir; - extern /* Subroutine */ int clarge_(integer *, complex *, integer *, + extern /* Subroutine */ void clarge_(integer *, complex *, integer *, integer *, complex *, integer *), clarfg_(integer *, complex *, complex *, integer *, complex *), clacgv_(integer *, complex *, integer *); //extern /* Complex */ VOID clarnd_(complex *, integer *, integer *); extern complex clarnd_(integer *, integer *); real ralpha; - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *), claset_(char *, integer *, integer *, complex *, complex *, - complex *, integer *), xerbla_(char *, integer *), - clarnv_(integer *, integer *, integer *, complex *); + complex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void clarnv_(integer *, integer *, integer *, complex *); integer irsign, iupper; complex xnorms; integer jcr; @@ -894,7 +895,7 @@ static integer c__5 = 5; /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Decode DIST */ @@ -986,8 +987,8 @@ static integer c__5 = 5; if (*info != 0) { i__1 = -(*info); - xerbla_("CLATME", &i__1); - return 0; + xerbla_("CLATME", &i__1, 6); + return; } /* Initialize random number generator */ @@ -1008,7 +1009,7 @@ static integer c__5 = 5; 
clatm1_(mode, cond, &irsign, &idist, &iseed[1], &d__[1], n, &iinfo); if (iinfo != 0) { *info = 1; - return 0; + return; } if (*mode != 0 && abs(*mode) != 6) { @@ -1028,7 +1029,7 @@ static integer c__5 = 5; alpha.r = q__1.r, alpha.i = q__1.i; } else { *info = 2; - return 0; + return; } cscal_(n, &alpha, &d__[1], &c__1); @@ -1065,7 +1066,7 @@ static integer c__5 = 5; slatm1_(modes, conds, &c__0, &c__0, &iseed[1], &ds[1], n, &iinfo); if (iinfo != 0) { *info = 3; - return 0; + return; } /* Multiply by V and V' */ @@ -1073,7 +1074,7 @@ static integer c__5 = 5; clarge_(n, &a[a_offset], lda, &iseed[1], &work[1], &iinfo); if (iinfo != 0) { *info = 4; - return 0; + return; } /* Multiply by S and (1/S) */ @@ -1086,7 +1087,7 @@ static integer c__5 = 5; csscal_(n, &r__1, &a[j * a_dim1 + 1], &c__1); } else { *info = 5; - return 0; + return; } /* L50: */ } @@ -1096,7 +1097,7 @@ static integer c__5 = 5; clarge_(n, &a[a_offset], lda, &iseed[1], &work[1], &iinfo); if (iinfo != 0) { *info = 4; - return 0; + return; } } @@ -1210,7 +1211,7 @@ static integer c__5 = 5; } } - return 0; + return; /* End of CLATME */ diff --git a/lapack-netlib/TESTING/MATGEN/clatmr.c b/lapack-netlib/TESTING/MATGEN/clatmr.c index 1265052ab5..4a117a9d4a 100644 --- a/lapack-netlib/TESTING/MATGEN/clatmr.c +++ b/lapack-netlib/TESTING/MATGEN/clatmr.c @@ -998,7 +998,7 @@ static integer c__1 = 1; /* > \ingroup complex_matgen */ /* ===================================================================== */ -/* Subroutine */ int clatmr_(integer *m, integer *n, char *dist, integer * +/* Subroutine */ void clatmr_(integer *m, integer *n, char *dist, integer * iseed, char *sym, complex *d__, integer *mode, real *cond, complex * dmax__, char *rsign, char *grade, complex *dl, integer *model, real * condl, complex *dr, integer *moder, real *condr, char *pivtng, @@ -1022,7 +1022,7 @@ static integer c__1 = 1; integer mnsub; real onorm; integer mxsub, npvts; - extern /* Subroutine */ int clatm1_(integer *, real *, integer *, 
integer + extern /* Subroutine */ void clatm1_(integer *, real *, integer *, integer *, integer *, complex *, integer *, integer *); extern /* Complex */ VOID clatm2_(complex *, integer *, integer *, integer *, integer *, integer *, integer *, integer *, integer *, @@ -1038,10 +1038,10 @@ static integer c__1 = 1; integer igrade; extern real clansb_(char *, char *, integer *, integer *, complex *, integer *, real *); - extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + extern /* Subroutine */ void csscal_(integer *, real *, complex *, integer *); logical fulbnd; - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical badpvt; extern real clansp_(char *, char *, integer *, complex *, real *), clansy_(char *, char *, integer *, complex *, integer *, real *); @@ -1077,7 +1077,7 @@ static integer c__1 = 1; /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Decode DIST */ @@ -1278,8 +1278,8 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); - xerbla_("CLATMR", &i__1); - return 0; + xerbla_("CLATMR", &i__1, 6); + return; } /* Decide if we can pivot consistently */ @@ -1305,7 +1305,7 @@ static integer c__1 = 1; clatm1_(mode, cond, &irsign, &idist, &iseed[1], &d__[1], &mnmin, info); if (*info != 0) { *info = 1; - return 0; + return; } if (*mode != 0 && *mode != -6 && *mode != 6) { @@ -1321,7 +1321,7 @@ static integer c__1 = 1; } if (temp == 0.f && (dmax__->r != 0.f || dmax__->i != 0.f)) { *info = 2; - return 0; + return; } if (temp != 0.f) { q__1.r = dmax__->r / temp, q__1.i = dmax__->i / temp; @@ -1361,7 +1361,7 @@ static integer c__1 = 1; clatm1_(model, condl, &c__0, &idist, &iseed[1], &dl[1], m, info); if (*info != 0) { *info = 3; - return 0; + return; } } @@ -1371,7 +1371,7 @@ static integer c__1 = 1; clatm1_(moder, condr, &c__0, &idist, &iseed[1], &dr[1], n, info); if (*info != 0) { *info = 4; - return 0; + return; } } @@ 
-2026,7 +2026,7 @@ static integer c__1 = 1; /* Desired scaling impossible */ *info = 5; - return 0; + return; } else if (*anorm > 1.f && onorm < 1.f || *anorm < 1.f && onorm > 1.f) { @@ -2099,6 +2099,6 @@ static integer c__1 = 1; /* End of CLATMR */ - return 0; + return; } /* clatmr_ */ diff --git a/lapack-netlib/TESTING/MATGEN/clatms.c b/lapack-netlib/TESTING/MATGEN/clatms.c index 6f878e400e..3f7011166b 100644 --- a/lapack-netlib/TESTING/MATGEN/clatms.c +++ b/lapack-netlib/TESTING/MATGEN/clatms.c @@ -845,7 +845,7 @@ static logical c_false = FALSE_; /* > \ingroup complex_matgen */ /* ===================================================================== */ -/* Subroutine */ int clatms_(integer *m, integer *n, char *dist, integer * +/* Subroutine */ void clatms_(integer *m, integer *n, char *dist, integer * iseed, char *sym, real *d__, integer *mode, real *cond, real *dmax__, integer *kl, integer *ku, char *pack, complex *a, integer *lda, complex *work, integer *info) @@ -870,14 +870,14 @@ static logical c_false = FALSE_; integer ioffg; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); complex ctemp; integer idist, mnmin, iskew; complex extra, dummy; - extern /* Subroutine */ int slatm1_(integer *, real *, integer *, integer + extern /* Subroutine */ void slatm1_(integer *, real *, integer *, integer *, integer *, real *, integer *, integer *); integer ic, jc, nc; - extern /* Subroutine */ int clagge_(integer *, integer *, integer *, + extern /* Subroutine */ void clagge_(integer *, integer *, integer *, integer *, real *, complex *, integer *, integer *, complex *, integer *), claghe_(integer *, integer *, real *, complex *, integer *, integer *, complex *, integer *); @@ -888,13 +888,14 @@ static logical c_false = FALSE_; extern complex clarnd_(integer *, integer *); integer minlda; complex st; - extern /* 
Subroutine */ int claset_(char *, integer *, integer *, complex + extern /* Subroutine */ void claset_(char *, integer *, integer *, complex *, complex *, complex *, integer *), clartg_(complex *, - complex *, real *, complex *, complex *), xerbla_(char *, integer - *), clagsy_(integer *, integer *, real *, complex *, + complex *, real *, complex *, complex *); + extern int xerbla_(char *, integer *, ftnlen); + extern void clagsy_(integer *, integer *, real *, complex *, integer *, integer *, complex *, integer *); extern real slarnd_(integer *, integer *); - extern /* Subroutine */ int clarot_(logical *, logical *, logical *, + extern /* Subroutine */ void clarot_(logical *, logical *, logical *, integer *, complex *, complex *, complex *, integer *, complex *, complex *); logical iltemp, givens; @@ -929,7 +930,7 @@ static logical c_false = FALSE_; /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Decode DIST */ @@ -1068,8 +1069,8 @@ static logical c_false = FALSE_; if (*info != 0) { i__1 = -(*info); - xerbla_("CLATMS", &i__1); - return 0; + xerbla_("CLATMS", &i__1, 6); + return; } /* Initialize random number generator */ @@ -1090,7 +1091,7 @@ static logical c_false = FALSE_; slatm1_(mode, cond, &irsign, &idist, &iseed[1], &d__[1], &mnmin, &iinfo); if (iinfo != 0) { *info = 1; - return 0; + return; } /* Choose Top-Down if D is (apparently) increasing, */ @@ -1119,7 +1120,7 @@ static logical c_false = FALSE_; alpha = *dmax__ / temp; } else { *info = 2; - return 0; + return; } sscal_(&mnmin, &alpha, &d__[1], &c__1); @@ -2033,7 +2034,7 @@ static logical c_false = FALSE_; if (iinfo != 0) { *info = 3; - return 0; + return; } } @@ -2208,7 +2209,7 @@ static logical c_false = FALSE_; } } - return 0; + return; /* End of CLATMS */ diff --git a/lapack-netlib/TESTING/MATGEN/clatmt.c b/lapack-netlib/TESTING/MATGEN/clatmt.c index 04dc346972..3bb7ca4d94 100644 --- a/lapack-netlib/TESTING/MATGEN/clatmt.c +++ 
b/lapack-netlib/TESTING/MATGEN/clatmt.c @@ -853,7 +853,7 @@ static logical c_false = FALSE_; /* > \ingroup complex_matgen */ /* ===================================================================== */ -/* Subroutine */ int clatmt_(integer *m, integer *n, char *dist, integer * +/* Subroutine */ void clatmt_(integer *m, integer *n, char *dist, integer * iseed, char *sym, real *d__, integer *mode, real *cond, real *dmax__, integer *rank, integer *kl, integer *ku, char *pack, complex *a, integer *lda, complex *work, integer *info) @@ -876,16 +876,16 @@ static logical c_false = FALSE_; integer ipack, ioffg; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); complex ctemp; integer idist, mnmin; complex extra; integer iskew; complex dummy; - extern /* Subroutine */ int slatm7_(integer *, real *, integer *, integer + extern /* Subroutine */ void slatm7_(integer *, real *, integer *, integer *, integer *, real *, integer *, integer *, integer *); integer ic, jc, nc; - extern /* Subroutine */ int clagge_(integer *, integer *, integer *, + extern /* Subroutine */ void clagge_(integer *, integer *, integer *, integer *, real *, complex *, integer *, integer *, complex *, integer *), claghe_(integer *, integer *, real *, complex *, integer *, integer *, complex *, integer *); @@ -896,13 +896,14 @@ static logical c_false = FALSE_; extern complex clarnd_(integer *, integer *); integer minlda; complex st; - extern /* Subroutine */ int claset_(char *, integer *, integer *, complex + extern /* Subroutine */ void claset_(char *, integer *, integer *, complex *, complex *, complex *, integer *), clartg_(complex *, - complex *, real *, complex *, complex *), xerbla_(char *, integer - *), clagsy_(integer *, integer *, real *, complex *, + complex *, real *, complex *, complex *); + extern int xerbla_(char *, integer *, ftnlen); + 
extern void clagsy_(integer *, integer *, real *, complex *, integer *, integer *, complex *, integer *); extern real slarnd_(integer *, integer *); - extern /* Subroutine */ int clarot_(logical *, logical *, logical *, + extern /* Subroutine */ void clarot_(logical *, logical *, logical *, integer *, complex *, complex *, complex *, integer *, complex *, complex *); integer ioffst, irsign; @@ -936,7 +937,7 @@ static logical c_false = FALSE_; /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Decode DIST */ @@ -1075,8 +1076,8 @@ static logical c_false = FALSE_; if (*info != 0) { i__1 = -(*info); - xerbla_("CLATMT", &i__1); - return 0; + xerbla_("CLATMT", &i__1, 6); + return; } /* Initialize random number generator */ @@ -1098,7 +1099,7 @@ static logical c_false = FALSE_; iinfo); if (iinfo != 0) { *info = 1; - return 0; + return; } /* Choose Top-Down if D is (apparently) increasing, */ @@ -1127,7 +1128,7 @@ static logical c_false = FALSE_; alpha = *dmax__ / temp; } else { *info = 2; - return 0; + return; } sscal_(rank, &alpha, &d__[1], &c__1); @@ -2041,7 +2042,7 @@ static logical c_false = FALSE_; if (iinfo != 0) { *info = 3; - return 0; + return; } } @@ -2216,7 +2217,7 @@ static logical c_false = FALSE_; } } - return 0; + return; /* End of CLATMT */ diff --git a/lapack-netlib/TESTING/MATGEN/dlagge.c b/lapack-netlib/TESTING/MATGEN/dlagge.c index 448f70a390..dcca843269 100644 --- a/lapack-netlib/TESTING/MATGEN/dlagge.c +++ b/lapack-netlib/TESTING/MATGEN/dlagge.c @@ -626,7 +626,7 @@ static doublereal c_b13 = 0.; /* > \ingroup double_matgen */ /* ===================================================================== */ -/* Subroutine */ int dlagge_(integer *m, integer *n, integer *kl, integer *ku, +/* Subroutine */ void dlagge_(integer *m, integer *n, integer *kl, integer *ku, doublereal *d__, doublereal *a, integer *lda, integer *iseed, doublereal *work, integer *info) { @@ -635,17 +635,18 @@ static doublereal c_b13 = 0.; doublereal 
d__1; /* Local variables */ - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); extern doublereal dnrm2_(integer *, doublereal *, integer *); integer i__, j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); doublereal wa, wb, wn; - extern /* Subroutine */ int xerbla_(char *, integer *), dlarnv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dlarnv_( integer *, integer *, integer *, doublereal *); doublereal tau; @@ -684,8 +685,8 @@ static doublereal c_b13 = 0.; } if (*info < 0) { i__1 = -(*info); - xerbla_("DLAGGE", &i__1); - return 0; + xerbla_("DLAGGE", &i__1, 6); + return; } /* initialize A to diagonal matrix */ @@ -708,7 +709,7 @@ static doublereal c_b13 = 0.; /* Quick exit if the user wants a diagonal matrix */ if (*kl == 0 && *ku == 0) { - return 0; + return; } /* pre- and post-multiply A by random orthogonal matrices */ @@ -963,7 +964,7 @@ static doublereal c_b13 = 0.; } /* L70: */ } - return 0; + return; /* End of DLAGGE */ diff --git a/lapack-netlib/TESTING/MATGEN/dlagsy.c b/lapack-netlib/TESTING/MATGEN/dlagsy.c index 802ce683c7..2812db86df 100644 --- a/lapack-netlib/TESTING/MATGEN/dlagsy.c +++ b/lapack-netlib/TESTING/MATGEN/dlagsy.c @@ -615,7 +615,7 @@ static doublereal c_b26 = 1.; /* > \ingroup double_matgen */ /* ===================================================================== */ -/* Subroutine */ int dlagsy_(integer *n, integer *k, doublereal *d__, +/* Subroutine */ void dlagsy_(integer *n, integer *k, doublereal *d__, doublereal *a, integer *lda, integer *iseed, doublereal *work, integer *info) 
{ @@ -624,17 +624,17 @@ static doublereal c_b26 = 1.; doublereal d__1; /* Local variables */ - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); extern doublereal ddot_(integer *, doublereal *, integer *, doublereal *, integer *), dnrm2_(integer *, doublereal *, integer *); - extern /* Subroutine */ int dsyr2_(char *, integer *, doublereal *, + extern /* Subroutine */ void dsyr2_(char *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer i__, j; doublereal alpha; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *), daxpy_(integer *, doublereal *, @@ -642,7 +642,8 @@ static doublereal c_b26 = 1.; integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); doublereal wa, wb, wn; - extern /* Subroutine */ int xerbla_(char *, integer *), dlarnv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dlarnv_( integer *, integer *, integer *, doublereal *); doublereal tau; @@ -677,8 +678,8 @@ static doublereal c_b26 = 1.; } if (*info < 0) { i__1 = -(*info); - xerbla_("DLAGSY", &i__1); - return 0; + xerbla_("DLAGSY", &i__1, 6); + return; } /* initialize lower triangle of A to diagonal matrix */ @@ -822,7 +823,7 @@ static doublereal c_b26 = 1.; } /* L80: */ } - return 0; + return; /* End of DLAGSY */ diff --git a/lapack-netlib/TESTING/MATGEN/dlahilb.c b/lapack-netlib/TESTING/MATGEN/dlahilb.c index 0f79547cf7..dccfcd8c77 100644 --- a/lapack-netlib/TESTING/MATGEN/dlahilb.c +++ b/lapack-netlib/TESTING/MATGEN/dlahilb.c @@ 
-635,7 +635,7 @@ static doublereal c_b4 = 0.; /* > \ingroup double_matgen */ /* ===================================================================== */ -/* Subroutine */ int dlahilb_(integer *n, integer *nrhs, doublereal *a, +/* Subroutine */ void dlahilb_(integer *n, integer *nrhs, doublereal *a, integer *lda, doublereal *x, integer *ldx, doublereal *b, integer * ldb, doublereal *work, integer *info) { @@ -645,9 +645,9 @@ static doublereal c_b4 = 0.; /* Local variables */ integer i__, j, m, r__, ti, tm; - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, - doublereal *, doublereal *, doublereal *, integer *), - xerbla_(char *, integer *); + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, + doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); /* -- LAPACK test routine (version 3.8.0) -- */ @@ -691,8 +691,8 @@ static doublereal c_b4 = 0.; } if (*info < 0) { i__1 = -(*info); - xerbla_("DLAHILB", &i__1); - return 0; + xerbla_("DLAHILB", &i__1, 7); + return; } if (*n > 6) { *info = 1; @@ -745,6 +745,6 @@ static doublereal c_b4 = 0.; } } - return 0; + return; } /* dlahilb_ */ diff --git a/lapack-netlib/TESTING/MATGEN/dlakf2.c b/lapack-netlib/TESTING/MATGEN/dlakf2.c index 5d838ef353..0b403540a8 100644 --- a/lapack-netlib/TESTING/MATGEN/dlakf2.c +++ b/lapack-netlib/TESTING/MATGEN/dlakf2.c @@ -615,7 +615,7 @@ static doublereal c_b3 = 0.; /* > \ingroup double_matgen */ /* ===================================================================== */ -/* Subroutine */ int dlakf2_(integer *m, integer *n, doublereal *a, integer * +/* Subroutine */ void dlakf2_(integer *m, integer *n, doublereal *a, integer * lda, doublereal *b, doublereal *d__, doublereal *e, doublereal *z__, integer *ldz) { @@ -625,7 +625,7 @@ static doublereal c_b3 = 0.; /* Local variables */ integer i__, j, l, ik, jk, mn; - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, + extern /* Subroutine */ void 
dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *); integer mn2; @@ -731,7 +731,7 @@ static doublereal c_b3 = 0.; /* L90: */ } - return 0; + return; /* End of DLAKF2 */ diff --git a/lapack-netlib/TESTING/MATGEN/dlarge.c b/lapack-netlib/TESTING/MATGEN/dlarge.c index 580c502345..5d8a81387e 100644 --- a/lapack-netlib/TESTING/MATGEN/dlarge.c +++ b/lapack-netlib/TESTING/MATGEN/dlarge.c @@ -600,7 +600,7 @@ static doublereal c_b10 = 0.; /* > \ingroup double_matgen */ /* ===================================================================== */ -/* Subroutine */ int dlarge_(integer *n, doublereal *a, integer *lda, integer +/* Subroutine */ void dlarge_(integer *n, doublereal *a, integer *lda, integer *iseed, doublereal *work, integer *info) { /* System generated locals */ @@ -608,17 +608,18 @@ static doublereal c_b10 = 0.; doublereal d__1; /* Local variables */ - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); extern doublereal dnrm2_(integer *, doublereal *, integer *); integer i__; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *), dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); doublereal wa, wb, wn; - extern /* Subroutine */ int xerbla_(char *, integer *), dlarnv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dlarnv_( integer *, integer *, integer *, doublereal *); doublereal tau; @@ -650,8 +651,8 @@ static doublereal c_b10 = 0.; } if (*info < 0) { i__1 = -(*info); - xerbla_("DLARGE", &i__1); - return 0; + xerbla_("DLARGE", &i__1, 6); + return; } /* pre- and post-multiply A by random orthogonal matrix */ @@ -697,7 +698,7 @@ 
static doublereal c_b10 = 0.; a_dim1 + 1], lda); /* L10: */ } - return 0; + return; /* End of DLARGE */ diff --git a/lapack-netlib/TESTING/MATGEN/dlaror.c b/lapack-netlib/TESTING/MATGEN/dlaror.c index 8e66e55111..d9e2e46ae0 100644 --- a/lapack-netlib/TESTING/MATGEN/dlaror.c +++ b/lapack-netlib/TESTING/MATGEN/dlaror.c @@ -659,7 +659,7 @@ static integer c__1 = 1; /* > \ingroup double_matgen */ /* ===================================================================== */ -/* Subroutine */ int dlaror_(char *side, char *init, integer *m, integer *n, +/* Subroutine */ void dlaror_(char *side, char *init, integer *m, integer *n, doublereal *a, integer *lda, integer *iseed, doublereal *x, integer * info) { @@ -669,24 +669,24 @@ static integer c__1 = 1; /* Local variables */ integer kbeg; - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer jcol, irow; extern doublereal dnrm2_(integer *, doublereal *, integer *); integer j; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer ixfrm, itype, nxfrm; doublereal xnorm; extern doublereal dlarnd_(integer *, integer *); - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, - doublereal *, doublereal *, doublereal *, integer *), - xerbla_(char *, integer *); + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, + doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); doublereal factor, xnorms; @@ 
-709,7 +709,7 @@ static integer c__1 = 1; /* Function Body */ *info = 0; if (*n == 0 || *m == 0) { - return 0; + return; } itype = 0; @@ -734,8 +734,8 @@ static integer c__1 = 1; } if (*info != 0) { i__1 = -(*info); - xerbla_("DLAROR", &i__1); - return 0; + xerbla_("DLAROR", &i__1, 6); + return; } if (itype == 1) { @@ -782,8 +782,8 @@ static integer c__1 = 1; factor = xnorms * (xnorms + x[kbeg]); if (abs(factor) < 1e-20) { *info = 1; - xerbla_("DLAROR", info); - return 0; + xerbla_("DLAROR", info, 6); + return; } else { factor = 1. / factor; } @@ -837,7 +837,7 @@ static integer c__1 = 1; /* L50: */ } } - return 0; + return; /* End of DLAROR */ diff --git a/lapack-netlib/TESTING/MATGEN/dlarot.c b/lapack-netlib/TESTING/MATGEN/dlarot.c index 16ff2eeb09..1d7a4939d6 100644 --- a/lapack-netlib/TESTING/MATGEN/dlarot.c +++ b/lapack-netlib/TESTING/MATGEN/dlarot.c @@ -737,7 +737,7 @@ static integer c__1 = 1; /* > \ingroup double_matgen */ /* ===================================================================== */ -/* Subroutine */ int dlarot_(logical *lrows, logical *lleft, logical *lright, +/* Subroutine */ void dlarot_(logical *lrows, logical *lleft, logical *lright, integer *nl, doublereal *c__, doublereal *s, doublereal *a, integer * lda, doublereal *xleft, doublereal *xright) { @@ -746,11 +746,11 @@ static integer c__1 = 1; /* Local variables */ integer iinc; - extern /* Subroutine */ int drot_(integer *, doublereal *, integer *, + extern /* Subroutine */ void drot_(integer *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *); integer inext, ix, iy, nt; doublereal xt[2], yt[2]; - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer iyt; @@ -799,12 +799,12 @@ static integer c__1 = 1; /* Check for errors */ if (*nl < nt) { - xerbla_("DLAROT", &c__4); - return 0; + xerbla_("DLAROT", &c__4, 6); + return; } if (*lda <= 0 || ! 
(*lrows) && *lda < *nl - nt) { - xerbla_("DLAROT", &c__8); - return 0; + xerbla_("DLAROT", &c__8, 6); + return; } /* Rotate */ @@ -825,7 +825,7 @@ static integer c__1 = 1; a[iyt] = yt[nt - 1]; } - return 0; + return; /* End of DLAROT */ diff --git a/lapack-netlib/TESTING/MATGEN/dlatm1.c b/lapack-netlib/TESTING/MATGEN/dlatm1.c index 5c2837e223..aa81469e74 100644 --- a/lapack-netlib/TESTING/MATGEN/dlatm1.c +++ b/lapack-netlib/TESTING/MATGEN/dlatm1.c @@ -641,7 +641,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup double_matgen */ /* ===================================================================== */ -/* Subroutine */ int dlatm1_(integer *mode, doublereal *cond, integer *irsign, +/* Subroutine */ void dlatm1_(integer *mode, doublereal *cond, integer *irsign, integer *idist, integer *iseed, doublereal *d__, integer *n, integer *info) { @@ -654,7 +654,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ integer i__; doublereal alpha; extern doublereal dlaran_(integer *); - extern /* Subroutine */ int xerbla_(char *, integer *), dlarnv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dlarnv_( integer *, integer *, integer *, doublereal *); @@ -679,7 +680,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Set INFO if an error */ @@ -699,8 +700,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); - xerbla_("DLATM1", &i__1); - return 0; + xerbla_("DLATM1", &i__1, 6); + return; } /* Compute D according to COND and MODE */ @@ -814,7 +815,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of DLATM1 */ diff --git a/lapack-netlib/TESTING/MATGEN/dlatm5.c b/lapack-netlib/TESTING/MATGEN/dlatm5.c index 5043da3985..94b49d6e35 
100644 --- a/lapack-netlib/TESTING/MATGEN/dlatm5.c +++ b/lapack-netlib/TESTING/MATGEN/dlatm5.c @@ -778,7 +778,7 @@ static doublereal c_b33 = -1.; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int dlatm5_(integer *prtype, integer *m, integer *n, +/* Subroutine */ void dlatm5_(integer *prtype, integer *m, integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, doublereal * c__, integer *ldc, doublereal *d__, integer *ldd, doublereal *e, integer *lde, doublereal *f, integer *ldf, doublereal *r__, integer * @@ -792,7 +792,7 @@ static doublereal c_b33 = -1.; /* Local variables */ integer i__, j, k; - extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void dgemm_(char *, char *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); doublereal imeps, reeps; @@ -1100,6 +1100,6 @@ static doublereal c_b33 = -1.; /* End of DLATM5 */ - return 0; + return; } /* dlatm5_ */ diff --git a/lapack-netlib/TESTING/MATGEN/dlatm6.c b/lapack-netlib/TESTING/MATGEN/dlatm6.c index a1c995d24d..fee10724a8 100644 --- a/lapack-netlib/TESTING/MATGEN/dlatm6.c +++ b/lapack-netlib/TESTING/MATGEN/dlatm6.c @@ -692,7 +692,7 @@ static integer c__60 = 60; /* > \ingroup double_matgen */ /* ===================================================================== */ -/* Subroutine */ int dlatm6_(integer *type__, integer *n, doublereal *a, +/* Subroutine */ void dlatm6_(integer *type__, integer *n, doublereal *a, integer *lda, doublereal *b, doublereal *x, integer *ldx, doublereal * y, integer *ldy, doublereal *alpha, doublereal *beta, doublereal *wx, doublereal *wy, doublereal *s, doublereal *dif) @@ -706,7 +706,7 @@ static integer c__60 = 60; doublereal work[100]; integer i__, j; doublereal z__[144] /* was [12][12] */; - extern /* Subroutine */ int dlakf2_(integer *, integer *, doublereal *, 
+ extern /* Subroutine */ void dlakf2_(integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, doublereal *, doublereal *, integer *), dgesvd_(char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *, @@ -866,7 +866,7 @@ static integer c__60 = 60; } - return 0; + return; /* End of DLATM6 */ diff --git a/lapack-netlib/TESTING/MATGEN/dlatm7.c b/lapack-netlib/TESTING/MATGEN/dlatm7.c index df5eccf845..eb42e2b8ba 100644 --- a/lapack-netlib/TESTING/MATGEN/dlatm7.c +++ b/lapack-netlib/TESTING/MATGEN/dlatm7.c @@ -627,7 +627,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup double_matgen */ /* ===================================================================== */ -/* Subroutine */ int dlatm7_(integer *mode, doublereal *cond, integer *irsign, +/* Subroutine */ void dlatm7_(integer *mode, doublereal *cond, integer *irsign, integer *idist, integer *iseed, doublereal *d__, integer *n, integer *rank, integer *info) { @@ -640,7 +640,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ integer i__; doublereal alpha; extern doublereal dlaran_(integer *); - extern /* Subroutine */ int xerbla_(char *, integer *), dlarnv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void dlarnv_( integer *, integer *, integer *, doublereal *); @@ -665,7 +666,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Set INFO if an error */ @@ -685,8 +686,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); - xerbla_("DLATM7", &i__1); - return 0; + xerbla_("DLATM7", &i__1, 6); + return; } /* Compute D according to COND and MODE */ @@ -815,7 +816,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End 
of DLATM7 */ diff --git a/lapack-netlib/TESTING/MATGEN/dlatme.c b/lapack-netlib/TESTING/MATGEN/dlatme.c index c679007890..a92c70ef28 100644 --- a/lapack-netlib/TESTING/MATGEN/dlatme.c +++ b/lapack-netlib/TESTING/MATGEN/dlatme.c @@ -841,7 +841,7 @@ static doublereal c_b39 = 1.; /* > \ingroup double_matgen */ /* ===================================================================== */ -/* Subroutine */ int dlatme_(integer *n, char *dist, integer *iseed, +/* Subroutine */ void dlatme_(integer *n, char *dist, integer *iseed, doublereal *d__, integer *mode, doublereal *cond, doublereal *dmax__, char *ei, char *rsign, char *upper, char *sim, doublereal *ds, integer *modes, doublereal *conds, integer *kl, integer *ku, @@ -854,7 +854,7 @@ static doublereal c_b39 = 1.; /* Local variables */ logical bads; - extern /* Subroutine */ int dger_(integer *, integer *, doublereal *, + extern /* Subroutine */ void dger_(integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, integer *); integer isim; @@ -862,10 +862,10 @@ static doublereal c_b39 = 1.; logical badei; integer i__, j; doublereal alpha; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); - extern /* Subroutine */ int dgemv_(char *, integer *, integer *, + extern /* Subroutine */ void dgemv_(char *, integer *, integer *, doublereal *, doublereal *, integer *, doublereal *, integer *, doublereal *, doublereal *, integer *); integer iinfo; @@ -873,22 +873,23 @@ static doublereal c_b39 = 1.; integer icols; logical useei; integer idist; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer irows; - extern /* Subroutine */ int dlatm1_(integer *, doublereal *, integer *, + extern /* Subroutine */ void 
dlatm1_(integer *, doublereal *, integer *, integer *, integer *, doublereal *, integer *, integer *); integer ic, jc; extern doublereal dlange_(char *, integer *, integer *, doublereal *, integer *, doublereal *); integer ir, jr; - extern /* Subroutine */ int dlarge_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlarge_(integer *, doublereal *, integer *, integer *, doublereal *, integer *), dlarfg_(integer *, doublereal *, doublereal *, integer *, doublereal *); extern doublereal dlaran_(integer *); - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, - doublereal *, doublereal *, doublereal *, integer *), - xerbla_(char *, integer *), dlarnv_(integer *, integer *, + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, + doublereal *, doublereal *, doublereal *, integer *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dlarnv_(integer *, integer *, integer *, doublereal *); integer irsign, iupper; doublereal xnorms; @@ -924,7 +925,7 @@ static doublereal c_b39 = 1.; /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Decode DIST */ @@ -1042,8 +1043,8 @@ static doublereal c_b39 = 1.; if (*info != 0) { i__1 = -(*info); - xerbla_("DLATME", &i__1); - return 0; + xerbla_("DLATME", &i__1, 6); + return; } /* Initialize random number generator */ @@ -1064,7 +1065,7 @@ static doublereal c_b39 = 1.; dlatm1_(mode, cond, &irsign, &idist, &iseed[1], &d__[1], n, &iinfo); if (iinfo != 0) { *info = 1; - return 0; + return; } if (*mode != 0 && abs(*mode) != 6) { @@ -1083,7 +1084,7 @@ static doublereal c_b39 = 1.; alpha = *dmax__ / temp; } else if (*dmax__ != 0.) 
{ *info = 2; - return 0; + return; } else { alpha = 0.; } @@ -1155,7 +1156,7 @@ static doublereal c_b39 = 1.; dlatm1_(modes, conds, &c__0, &c__0, &iseed[1], &ds[1], n, &iinfo); if (iinfo != 0) { *info = 3; - return 0; + return; } /* Multiply by V and V' */ @@ -1163,7 +1164,7 @@ static doublereal c_b39 = 1.; dlarge_(n, &a[a_offset], lda, &iseed[1], &work[1], &iinfo); if (iinfo != 0) { *info = 4; - return 0; + return; } /* Multiply by S and (1/S) */ @@ -1176,7 +1177,7 @@ static doublereal c_b39 = 1.; dscal_(n, &d__1, &a[j * a_dim1 + 1], &c__1); } else { *info = 5; - return 0; + return; } /* L80: */ } @@ -1186,7 +1187,7 @@ static doublereal c_b39 = 1.; dlarge_(n, &a[a_offset], lda, &iseed[1], &work[1], &iinfo); if (iinfo != 0) { *info = 4; - return 0; + return; } } @@ -1274,7 +1275,7 @@ static doublereal c_b39 = 1.; } } - return 0; + return; /* End of DLATME */ diff --git a/lapack-netlib/TESTING/MATGEN/dlatmr.c b/lapack-netlib/TESTING/MATGEN/dlatmr.c index c5d88ad298..e6ce5456db 100644 --- a/lapack-netlib/TESTING/MATGEN/dlatmr.c +++ b/lapack-netlib/TESTING/MATGEN/dlatmr.c @@ -979,7 +979,7 @@ static integer c__1 = 1; /* > \ingroup double_matgen */ /* ===================================================================== */ -/* Subroutine */ int dlatmr_(integer *m, integer *n, char *dist, integer * +/* Subroutine */ void dlatmr_(integer *m, integer *n, char *dist, integer * iseed, char *sym, doublereal *d__, integer *mode, doublereal *cond, doublereal *dmax__, char *rsign, char *grade, doublereal *dl, integer *model, doublereal *condl, doublereal *dr, integer *moder, doublereal @@ -996,7 +996,7 @@ static integer c__1 = 1; doublereal temp; integer isym, i__, j, k; doublereal alpha; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); integer ipack; extern logical lsame_(char *, char *); @@ -1006,7 +1006,7 @@ static integer c__1 = 1; integer mnsub; doublereal 
onorm; integer mxsub, npvts; - extern /* Subroutine */ int dlatm1_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlatm1_(integer *, doublereal *, integer *, integer *, integer *, doublereal *, integer *, integer *); extern doublereal dlatm2_(integer *, integer *, integer *, integer *, integer *, integer *, integer *, integer *, doublereal *, integer @@ -1021,7 +1021,7 @@ static integer c__1 = 1; extern doublereal dlansb_(char *, char *, integer *, integer *, doublereal *, integer *, doublereal *); logical fulbnd; - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical badpvt; extern doublereal dlansp_(char *, char *, integer *, doublereal *, doublereal *), dlansy_(char *, char *, integer *, @@ -1058,7 +1058,7 @@ static integer c__1 = 1; /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Decode DIST */ @@ -1252,8 +1252,8 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); - xerbla_("DLATMR", &i__1); - return 0; + xerbla_("DLATMR", &i__1, 6); + return; } /* Decide if we can pivot consistently */ @@ -1279,7 +1279,7 @@ static integer c__1 = 1; dlatm1_(mode, cond, &irsign, &idist, &iseed[1], &d__[1], &mnmin, info); if (*info != 0) { *info = 1; - return 0; + return; } if (*mode != 0 && *mode != -6 && *mode != 6) { @@ -1295,7 +1295,7 @@ static integer c__1 = 1; } if (temp == 0. && *dmax__ != 0.) { *info = 2; - return 0; + return; } if (temp != 0.) 
{ alpha = *dmax__ / temp; @@ -1316,7 +1316,7 @@ static integer c__1 = 1; dlatm1_(model, condl, &c__0, &idist, &iseed[1], &dl[1], m, info); if (*info != 0) { *info = 3; - return 0; + return; } } @@ -1326,7 +1326,7 @@ static integer c__1 = 1; dlatm1_(moder, condr, &c__0, &idist, &iseed[1], &dr[1], n, info); if (*info != 0) { *info = 4; - return 0; + return; } } @@ -1815,7 +1815,7 @@ static integer c__1 = 1; /* Desired scaling impossible */ *info = 5; - return 0; + return; } else if (*anorm > 1. && onorm < 1. || *anorm < 1. && onorm > 1.) { @@ -1887,6 +1887,6 @@ static integer c__1 = 1; /* End of DLATMR */ - return 0; + return; } /* dlatmr_ */ diff --git a/lapack-netlib/TESTING/MATGEN/dlatms.c b/lapack-netlib/TESTING/MATGEN/dlatms.c index 330bd279ab..d56d26a501 100644 --- a/lapack-netlib/TESTING/MATGEN/dlatms.c +++ b/lapack-netlib/TESTING/MATGEN/dlatms.c @@ -833,7 +833,7 @@ static logical c_false = FALSE_; /* > \ingroup double_matgen */ /* ===================================================================== */ -/* Subroutine */ int dlatms_(integer *m, integer *n, char *dist, integer * +/* Subroutine */ void dlatms_(integer *m, integer *n, char *dist, integer * iseed, char *sym, doublereal *d__, integer *mode, doublereal *cond, doublereal *dmax__, integer *kl, integer *ku, char *pack, doublereal * a, integer *lda, doublereal *work, integer *info) @@ -851,27 +851,29 @@ static logical c_false = FALSE_; integer i__, j, k; doublereal s, alpha, angle; integer ipack; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); integer ioffg; extern logical lsame_(char *, char *); integer iinfo, idist, mnmin; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer iskew; doublereal extra, dummy; - extern /* Subroutine */ int dlatm1_(integer *, 
doublereal *, integer *, + extern /* Subroutine */ void dlatm1_(integer *, doublereal *, integer *, integer *, integer *, doublereal *, integer *, integer *); integer ic, jc, nc; - extern /* Subroutine */ int dlagge_(integer *, integer *, integer *, + extern /* Subroutine */ void dlagge_(integer *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *); integer il, iendch, ir, jr, ipackg, mr, minlda; extern doublereal dlarnd_(integer *, integer *); - extern /* Subroutine */ int dlaset_(char *, integer *, integer *, + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *), dlartg_(doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *), xerbla_(char *, integer *), dlagsy_( + doublereal *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dlagsy_( integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *), dlarot_(logical *, logical *, logical *, integer *, doublereal *, doublereal *, doublereal *, @@ -908,7 +910,7 @@ static logical c_false = FALSE_; /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Decode DIST */ @@ -1043,8 +1045,8 @@ static logical c_false = FALSE_; if (*info != 0) { i__1 = -(*info); - xerbla_("DLATMS", &i__1); - return 0; + xerbla_("DLATMS", &i__1, 6); + return; } /* Initialize random number generator */ @@ -1065,7 +1067,7 @@ static logical c_false = FALSE_; dlatm1_(mode, cond, &irsign, &idist, &iseed[1], &d__[1], &mnmin, &iinfo); if (iinfo != 0) { *info = 1; - return 0; + return; } /* Choose Top-Down if D is (apparently) increasing, */ @@ -1094,7 +1096,7 @@ static logical c_false = FALSE_; alpha = *dmax__ / temp; } else { *info = 2; - return 0; + return; } dscal_(&mnmin, &alpha, &d__[1], &c__1); @@ -1723,7 +1725,7 @@ static logical c_false = FALSE_; } if (iinfo != 0) { *info = 3; - return 0; + return; } } @@ -1885,7 +1887,7 @@ 
static logical c_false = FALSE_; } } - return 0; + return; /* End of DLATMS */ diff --git a/lapack-netlib/TESTING/MATGEN/dlatmt.c b/lapack-netlib/TESTING/MATGEN/dlatmt.c index 5037405e31..b61e8b779c 100644 --- a/lapack-netlib/TESTING/MATGEN/dlatmt.c +++ b/lapack-netlib/TESTING/MATGEN/dlatmt.c @@ -843,7 +843,7 @@ static logical c_false = FALSE_; /* > \ingroup double_matgen */ /* ===================================================================== */ -/* Subroutine */ int dlatmt_(integer *m, integer *n, char *dist, integer * +/* Subroutine */ void dlatmt_(integer *m, integer *n, char *dist, integer * iseed, char *sym, doublereal *d__, integer *mode, doublereal *cond, doublereal *dmax__, integer *rank, integer *kl, integer *ku, char * pack, doublereal *a, integer *lda, doublereal *work, integer *info) @@ -861,28 +861,30 @@ static logical c_false = FALSE_; integer i__, j, k; doublereal s, alpha, angle; integer ipack; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); integer ioffg; extern logical lsame_(char *, char *); integer iinfo, idist, mnmin; - extern /* Subroutine */ int dcopy_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dcopy_(integer *, doublereal *, integer *, doublereal *, integer *); integer iskew; doublereal extra, dummy; - extern /* Subroutine */ int dlatm7_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlatm7_(integer *, doublereal *, integer *, integer *, integer *, doublereal *, integer *, integer *, integer *); integer ic, jc, nc; - extern /* Subroutine */ int dlagge_(integer *, integer *, integer *, + extern /* Subroutine */ void dlagge_(integer *, integer *, integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *); integer il, iendch, ir, jr, ipackg, mr, minlda; extern doublereal dlarnd_(integer *, integer *); - extern /* Subroutine */ int dlaset_(char 
*, integer *, integer *, + extern /* Subroutine */ void dlaset_(char *, integer *, integer *, doublereal *, doublereal *, doublereal *, integer *), dlartg_(doublereal *, doublereal *, doublereal *, doublereal *, - doublereal *), xerbla_(char *, integer *), dlagsy_( + doublereal *); + extern int xerbla_(char *, integer *, ftnlen); + extern void dlagsy_( integer *, integer *, doublereal *, doublereal *, integer *, integer *, doublereal *, integer *), dlarot_(logical *, logical *, logical *, integer *, doublereal *, doublereal *, doublereal *, @@ -918,7 +920,7 @@ static logical c_false = FALSE_; /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Decode DIST */ @@ -1053,8 +1055,8 @@ static logical c_false = FALSE_; if (*info != 0) { i__1 = -(*info); - xerbla_("DLATMT", &i__1); - return 0; + xerbla_("DLATMT", &i__1, 6); + return; } /* Initialize random number generator */ @@ -1076,7 +1078,7 @@ static logical c_false = FALSE_; iinfo); if (iinfo != 0) { *info = 1; - return 0; + return; } /* Choose Top-Down if D is (apparently) increasing, */ @@ -1105,7 +1107,7 @@ static logical c_false = FALSE_; alpha = *dmax__ / temp; } else { *info = 2; - return 0; + return; } dscal_(rank, &alpha, &d__[1], &c__1); @@ -1734,7 +1736,7 @@ static logical c_false = FALSE_; } if (iinfo != 0) { *info = 3; - return 0; + return; } } @@ -1896,7 +1898,7 @@ static logical c_false = FALSE_; } } - return 0; + return; /* End of DLATMT */ diff --git a/lapack-netlib/TESTING/MATGEN/slagge.c b/lapack-netlib/TESTING/MATGEN/slagge.c index 9e3e47712c..c3ec8f2b8d 100644 --- a/lapack-netlib/TESTING/MATGEN/slagge.c +++ b/lapack-netlib/TESTING/MATGEN/slagge.c @@ -626,7 +626,7 @@ static real c_b13 = 0.f; /* > \ingroup real_matgen */ /* ===================================================================== */ -/* Subroutine */ int slagge_(integer *m, integer *n, integer *kl, integer *ku, +/* Subroutine */ void slagge_(integer *m, integer *n, integer *kl, integer *ku, real *d__, real 
*a, integer *lda, integer *iseed, real *work, integer *info) { @@ -635,15 +635,16 @@ static real c_b13 = 0.f; real r__1; /* Local variables */ - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); extern real snrm2_(integer *, real *, integer *); integer i__, j; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); real wa, wb, wn; - extern /* Subroutine */ int xerbla_(char *, integer *), slarnv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slarnv_( integer *, integer *, integer *, real *); real tau; @@ -682,8 +683,8 @@ static real c_b13 = 0.f; } if (*info < 0) { i__1 = -(*info); - xerbla_("SLAGGE", &i__1); - return 0; + xerbla_("SLAGGE", &i__1, 6); + return; } /* initialize A to diagonal matrix */ @@ -706,7 +707,7 @@ static real c_b13 = 0.f; /* Quick exit if the user wants a diagonal matrix */ if (*kl == 0 && *ku == 0) { - return 0; + return; } /* pre- and post-multiply A by random orthogonal matrices */ @@ -961,7 +962,7 @@ static real c_b13 = 0.f; } /* L70: */ } - return 0; + return; /* End of SLAGGE */ diff --git a/lapack-netlib/TESTING/MATGEN/slagsy.c b/lapack-netlib/TESTING/MATGEN/slagsy.c index 5038e1c653..62614593d1 100644 --- a/lapack-netlib/TESTING/MATGEN/slagsy.c +++ b/lapack-netlib/TESTING/MATGEN/slagsy.c @@ -615,7 +615,7 @@ static real c_b26 = 1.f; /* > \ingroup real_matgen */ /* ===================================================================== */ -/* Subroutine */ int slagsy_(integer *n, integer *k, real *d__, real *a, +/* Subroutine */ void slagsy_(integer *n, integer *k, real *d__, real *a, integer *lda, integer *iseed, real *work, integer *info) { /* System 
generated locals */ @@ -623,22 +623,23 @@ static real c_b26 = 1.f; real r__1; /* Local variables */ - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); extern real sdot_(integer *, real *, integer *, real *, integer *), snrm2_(integer *, real *, integer *); integer i__, j; - extern /* Subroutine */ int ssyr2_(char *, integer *, real *, real *, + extern /* Subroutine */ void ssyr2_(char *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); real alpha; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), saxpy_( integer *, real *, real *, integer *, real *, integer *), ssymv_( char *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); real wa, wb, wn; - extern /* Subroutine */ int xerbla_(char *, integer *), slarnv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slarnv_( integer *, integer *, integer *, real *); real tau; @@ -673,8 +674,8 @@ static real c_b26 = 1.f; } if (*info < 0) { i__1 = -(*info); - xerbla_("SLAGSY", &i__1); - return 0; + xerbla_("SLAGSY", &i__1, 6); + return; } /* initialize lower triangle of A to diagonal matrix */ @@ -818,7 +819,7 @@ static real c_b26 = 1.f; } /* L80: */ } - return 0; + return; /* End of SLAGSY */ diff --git a/lapack-netlib/TESTING/MATGEN/slahilb.c b/lapack-netlib/TESTING/MATGEN/slahilb.c index 786804ca30..ae633d3c97 100644 --- a/lapack-netlib/TESTING/MATGEN/slahilb.c +++ b/lapack-netlib/TESTING/MATGEN/slahilb.c @@ -635,7 +635,7 @@ static real c_b4 = 0.f; /* > \ingroup real_matgen */ /* ===================================================================== */ -/* Subroutine */ int 
slahilb_(integer *n, integer *nrhs, real *a, integer * +/* Subroutine */ void slahilb_(integer *n, integer *nrhs, real *a, integer * lda, real *x, integer *ldx, real *b, integer *ldb, real *work, integer *info) { @@ -645,7 +645,8 @@ static real c_b4 = 0.f; /* Local variables */ integer i__, j, m, r__, ti, tm; - extern /* Subroutine */ int xerbla_(char *, integer *), slaset_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slaset_( char *, integer *, integer *, real *, real *, real *, integer *); @@ -690,8 +691,8 @@ static real c_b4 = 0.f; } if (*info < 0) { i__1 = -(*info); - xerbla_("SLAHILB", &i__1); - return 0; + xerbla_("SLAHILB", &i__1, 7); + return; } if (*n > 6) { *info = 1; @@ -745,6 +746,6 @@ static real c_b4 = 0.f; } } - return 0; + return; } /* slahilb_ */ diff --git a/lapack-netlib/TESTING/MATGEN/slakf2.c b/lapack-netlib/TESTING/MATGEN/slakf2.c index 27efd82a80..67be11662c 100644 --- a/lapack-netlib/TESTING/MATGEN/slakf2.c +++ b/lapack-netlib/TESTING/MATGEN/slakf2.c @@ -615,7 +615,7 @@ static real c_b3 = 0.f; /* > \ingroup real_matgen */ /* ===================================================================== */ -/* Subroutine */ int slakf2_(integer *m, integer *n, real *a, integer *lda, +/* Subroutine */ void slakf2_(integer *m, integer *n, real *a, integer *lda, real *b, real *d__, real *e, real *z__, integer *ldz) { /* System generated locals */ @@ -624,7 +624,7 @@ static real c_b3 = 0.f; /* Local variables */ integer i__, j, l, ik, jk, mn; - extern /* Subroutine */ int slaset_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slaset_(char *, integer *, integer *, real *, real *, real *, integer *); integer mn2; @@ -730,7 +730,7 @@ static real c_b3 = 0.f; /* L90: */ } - return 0; + return; /* End of SLAKF2 */ diff --git a/lapack-netlib/TESTING/MATGEN/slarge.c b/lapack-netlib/TESTING/MATGEN/slarge.c index 20453f08b6..6b37e94003 100644 --- a/lapack-netlib/TESTING/MATGEN/slarge.c +++ 
b/lapack-netlib/TESTING/MATGEN/slarge.c @@ -600,7 +600,7 @@ static real c_b10 = 0.f; /* > \ingroup real_matgen */ /* ===================================================================== */ -/* Subroutine */ int slarge_(integer *n, real *a, integer *lda, integer * +/* Subroutine */ void slarge_(integer *n, real *a, integer *lda, integer * iseed, real *work, integer *info) { /* System generated locals */ @@ -608,15 +608,16 @@ static real c_b10 = 0.f; real r__1; /* Local variables */ - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); extern real snrm2_(integer *, real *, integer *); integer i__; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); real wa, wb, wn; - extern /* Subroutine */ int xerbla_(char *, integer *), slarnv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void slarnv_( integer *, integer *, integer *, real *); real tau; @@ -648,8 +649,8 @@ static real c_b10 = 0.f; } if (*info < 0) { i__1 = -(*info); - xerbla_("SLARGE", &i__1); - return 0; + xerbla_("SLARGE", &i__1, 6); + return; } /* pre- and post-multiply A by random orthogonal matrix */ @@ -695,7 +696,7 @@ static real c_b10 = 0.f; a_dim1 + 1], lda); /* L10: */ } - return 0; + return; /* End of SLARGE */ diff --git a/lapack-netlib/TESTING/MATGEN/slaror.c b/lapack-netlib/TESTING/MATGEN/slaror.c index 26db2e6450..48b532dfd8 100644 --- a/lapack-netlib/TESTING/MATGEN/slaror.c +++ b/lapack-netlib/TESTING/MATGEN/slaror.c @@ -659,7 +659,7 @@ static integer c__1 = 1; /* > \ingroup real_matgen */ /* ===================================================================== */ -/* Subroutine */ int slaror_(char *side, char *init, 
integer *m, integer *n, +/* Subroutine */ void slaror_(char *side, char *init, integer *m, integer *n, real *a, integer *lda, integer *iseed, real *x, integer *info) { /* System generated locals */ @@ -668,21 +668,21 @@ static integer c__1 = 1; /* Local variables */ integer kbeg, jcol; - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); integer irow; extern real snrm2_(integer *, real *, integer *); integer j; extern logical lsame_(char *, char *); - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *), sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); integer ixfrm, itype, nxfrm; real xnorm; - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); real factor; extern real slarnd_(integer *, integer *); - extern /* Subroutine */ int slaset_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slaset_(char *, integer *, integer *, real *, real *, real *, integer *); real xnorms; @@ -706,7 +706,7 @@ static integer c__1 = 1; /* Function Body */ *info = 0; if (*n == 0 || *m == 0) { - return 0; + return; } itype = 0; @@ -731,8 +731,8 @@ static integer c__1 = 1; } if (*info != 0) { i__1 = -(*info); - xerbla_("SLAROR", &i__1); - return 0; + xerbla_("SLAROR", &i__1, 6); + return; } if (itype == 1) { @@ -779,8 +779,8 @@ static integer c__1 = 1; factor = xnorms * (xnorms + x[kbeg]); if (abs(factor) < 1e-20f) { *info = 1; - xerbla_("SLAROR", info); - return 0; + xerbla_("SLAROR", info, 6); + return; } else { factor = 1.f / factor; } @@ -834,7 +834,7 @@ static integer c__1 = 1; /* L50: */ } } - return 0; + return; /* End of SLAROR */ diff --git a/lapack-netlib/TESTING/MATGEN/slarot.c 
b/lapack-netlib/TESTING/MATGEN/slarot.c index 205f7a622e..a23e186e19 100644 --- a/lapack-netlib/TESTING/MATGEN/slarot.c +++ b/lapack-netlib/TESTING/MATGEN/slarot.c @@ -737,7 +737,7 @@ static integer c__1 = 1; /* > \ingroup real_matgen */ /* ===================================================================== */ -/* Subroutine */ int slarot_(logical *lrows, logical *lleft, logical *lright, +/* Subroutine */ void slarot_(logical *lrows, logical *lleft, logical *lright, integer *nl, real *c__, real *s, real *a, integer *lda, real *xleft, real *xright) { @@ -746,11 +746,11 @@ static integer c__1 = 1; /* Local variables */ integer iinc; - extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + extern /* Subroutine */ void srot_(integer *, real *, integer *, real *, integer *, real *, real *); integer inext, ix, iy, nt; real xt[2], yt[2]; - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer iyt; @@ -799,12 +799,12 @@ static integer c__1 = 1; /* Check for errors */ if (*nl < nt) { - xerbla_("SLAROT", &c__4); - return 0; + xerbla_("SLAROT", &c__4, 6); + return; } if (*lda <= 0 || ! 
(*lrows) && *lda < *nl - nt) { - xerbla_("SLAROT", &c__8); - return 0; + xerbla_("SLAROT", &c__8, 6); + return; } /* Rotate */ @@ -825,7 +825,7 @@ static integer c__1 = 1; a[iyt] = yt[nt - 1]; } - return 0; + return; /* End of SLAROT */ diff --git a/lapack-netlib/TESTING/MATGEN/slatm1.c b/lapack-netlib/TESTING/MATGEN/slatm1.c index c5bb56d302..5c4cdc0677 100644 --- a/lapack-netlib/TESTING/MATGEN/slatm1.c +++ b/lapack-netlib/TESTING/MATGEN/slatm1.c @@ -641,7 +641,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup real_matgen */ /* ===================================================================== */ -/* Subroutine */ int slatm1_(integer *mode, real *cond, integer *irsign, +/* Subroutine */ void slatm1_(integer *mode, real *cond, integer *irsign, integer *idist, integer *iseed, real *d__, integer *n, integer *info) { /* System generated locals */ @@ -652,9 +652,9 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ real temp; integer i__; real alpha; - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern real slaran_(integer *); - extern /* Subroutine */ int slarnv_(integer *, integer *, integer *, real + extern /* Subroutine */ void slarnv_(integer *, integer *, integer *, real *); @@ -679,7 +679,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Set INFO if an error */ @@ -699,8 +699,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); - xerbla_("SLATM1", &i__1); - return 0; + xerbla_("SLATM1", &i__1, 6); + return; } /* Compute D according to COND and MODE */ @@ -815,7 +815,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of SLATM1 */ diff --git 
a/lapack-netlib/TESTING/MATGEN/slatm5.c b/lapack-netlib/TESTING/MATGEN/slatm5.c index 238bf7e97e..24ee0915d4 100644 --- a/lapack-netlib/TESTING/MATGEN/slatm5.c +++ b/lapack-netlib/TESTING/MATGEN/slatm5.c @@ -778,7 +778,7 @@ static real c_b33 = -1.f; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int slatm5_(integer *prtype, integer *m, integer *n, real *a, +/* Subroutine */ void slatm5_(integer *prtype, integer *m, integer *n, real *a, integer *lda, real *b, integer *ldb, real *c__, integer *ldc, real * d__, integer *ldd, real *e, integer *lde, real *f, integer *ldf, real *r__, integer *ldr, real *l, integer *ldl, real *alpha, integer * @@ -791,7 +791,7 @@ static real c_b33 = -1.f; /* Local variables */ integer i__, j, k; - extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void sgemm_(char *, char *, integer *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *); real imeps, reeps; @@ -1091,6 +1091,6 @@ static real c_b33 = -1.f; /* End of SLATM5 */ - return 0; + return; } /* slatm5_ */ diff --git a/lapack-netlib/TESTING/MATGEN/slatm6.c b/lapack-netlib/TESTING/MATGEN/slatm6.c index d69028e6cd..c50fcd29f2 100644 --- a/lapack-netlib/TESTING/MATGEN/slatm6.c +++ b/lapack-netlib/TESTING/MATGEN/slatm6.c @@ -692,7 +692,7 @@ static integer c__60 = 60; /* > \ingroup real_matgen */ /* ===================================================================== */ -/* Subroutine */ int slatm6_(integer *type__, integer *n, real *a, integer * +/* Subroutine */ void slatm6_(integer *type__, integer *n, real *a, integer * lda, real *b, real *x, integer *ldx, real *y, integer *ldy, real * alpha, real *beta, real *wx, real *wy, real *s, real *dif) { @@ -705,7 +705,7 @@ static integer c__60 = 60; real work[100]; integer i__, j; real z__[144] /* was [12][12] */; - extern /* Subroutine */ int slakf2_(integer *, integer *, real 
*, integer + extern /* Subroutine */ void slakf2_(integer *, integer *, real *, integer *, real *, real *, real *, real *, integer *), sgesvd_(char *, char *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, real *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, @@ -864,7 +864,7 @@ static integer c__60 = 60; } - return 0; + return; /* End of SLATM6 */ diff --git a/lapack-netlib/TESTING/MATGEN/slatm7.c b/lapack-netlib/TESTING/MATGEN/slatm7.c index 21976e322f..e1b32ac9ff 100644 --- a/lapack-netlib/TESTING/MATGEN/slatm7.c +++ b/lapack-netlib/TESTING/MATGEN/slatm7.c @@ -627,7 +627,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* > \ingroup real_matgen */ /* ===================================================================== */ -/* Subroutine */ int slatm7_(integer *mode, real *cond, integer *irsign, +/* Subroutine */ void slatm7_(integer *mode, real *cond, integer *irsign, integer *idist, integer *iseed, real *d__, integer *n, integer *rank, integer *info) { @@ -639,9 +639,9 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ real temp; integer i__; real alpha; - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern real slaran_(integer *); - extern /* Subroutine */ int slarnv_(integer *, integer *, integer *, real + extern /* Subroutine */ void slarnv_(integer *, integer *, integer *, real *); @@ -666,7 +666,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Set INFO if an error */ @@ -686,8 +686,8 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ if (*info != 0) { i__1 = -(*info); - xerbla_("SLATM7", &i__1); - return 0; + xerbla_("SLATM7", &i__1, 6); + return; } /* Compute D according to COND and MODE */ @@ -817,7 +817,7 
@@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ } - return 0; + return; /* End of SLATM7 */ diff --git a/lapack-netlib/TESTING/MATGEN/slatme.c b/lapack-netlib/TESTING/MATGEN/slatme.c index 96178bee67..a8a6b39a33 100644 --- a/lapack-netlib/TESTING/MATGEN/slatme.c +++ b/lapack-netlib/TESTING/MATGEN/slatme.c @@ -841,7 +841,7 @@ static real c_b39 = 1.f; /* > \ingroup real_matgen */ /* ===================================================================== */ -/* Subroutine */ int slatme_(integer *n, char *dist, integer *iseed, real * +/* Subroutine */ void slatme_(integer *n, char *dist, integer *iseed, real * d__, integer *mode, real *cond, real *dmax__, char *ei, char *rsign, char *upper, char *sim, real *ds, integer *modes, real *conds, integer *kl, integer *ku, real *anorm, real *a, integer *lda, real * @@ -853,7 +853,7 @@ static real c_b39 = 1.f; /* Local variables */ logical bads; - extern /* Subroutine */ int sger_(integer *, integer *, real *, real *, + extern /* Subroutine */ void sger_(integer *, integer *, real *, real *, integer *, real *, integer *, real *, integer *); integer isim; real temp; @@ -862,28 +862,29 @@ static real c_b39 = 1.f; real alpha; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); real tempa[1]; integer icols; logical useei; integer idist; - extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, + extern /* Subroutine */ void sgemv_(char *, integer *, integer *, real *, real *, integer *, real *, integer *, real *, real *, integer *), scopy_(integer *, real *, integer *, real *, integer *); integer irows; - extern /* Subroutine */ int slatm1_(integer *, real *, integer *, integer + extern /* Subroutine */ void slatm1_(integer *, real *, integer *, integer *, integer *, real *, integer *, integer *); integer ic, jc, ir, jr; extern 
real slange_(char *, integer *, integer *, real *, integer *, real *); - extern /* Subroutine */ int slarge_(integer *, real *, integer *, integer + extern /* Subroutine */ void slarge_(integer *, real *, integer *, integer *, real *, integer *), slarfg_(integer *, real *, real *, integer - *, real *), xerbla_(char *, integer *); + *, real *); + extern int xerbla_(char *, integer *, ftnlen); extern real slaran_(integer *); integer irsign; - extern /* Subroutine */ int slaset_(char *, integer *, integer *, real *, + extern /* Subroutine */ void slaset_(char *, integer *, integer *, real *, real *, real *, integer *); integer iupper; - extern /* Subroutine */ int slarnv_(integer *, integer *, integer *, real + extern /* Subroutine */ void slarnv_(integer *, integer *, integer *, real *); real xnorms; integer jcr; @@ -918,7 +919,7 @@ static real c_b39 = 1.f; /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Decode DIST */ @@ -1036,8 +1037,8 @@ static real c_b39 = 1.f; if (*info != 0) { i__1 = -(*info); - xerbla_("SLATME", &i__1); - return 0; + xerbla_("SLATME", &i__1, 6); + return; } /* Initialize random number generator */ @@ -1058,7 +1059,7 @@ static real c_b39 = 1.f; slatm1_(mode, cond, &irsign, &idist, &iseed[1], &d__[1], n, &iinfo); if (iinfo != 0) { *info = 1; - return 0; + return; } if (*mode != 0 && abs(*mode) != 6) { @@ -1077,7 +1078,7 @@ static real c_b39 = 1.f; alpha = *dmax__ / temp; } else if (*dmax__ != 0.f) { *info = 2; - return 0; + return; } else { alpha = 0.f; } @@ -1149,7 +1150,7 @@ static real c_b39 = 1.f; slatm1_(modes, conds, &c__0, &c__0, &iseed[1], &ds[1], n, &iinfo); if (iinfo != 0) { *info = 3; - return 0; + return; } /* Multiply by V and V' */ @@ -1157,7 +1158,7 @@ static real c_b39 = 1.f; slarge_(n, &a[a_offset], lda, &iseed[1], &work[1], &iinfo); if (iinfo != 0) { *info = 4; - return 0; + return; } /* Multiply by S and (1/S) */ @@ -1170,7 +1171,7 @@ static real c_b39 = 1.f; sscal_(n, &r__1, &a[j * a_dim1 + 1], 
&c__1); } else { *info = 5; - return 0; + return; } /* L80: */ } @@ -1180,7 +1181,7 @@ static real c_b39 = 1.f; slarge_(n, &a[a_offset], lda, &iseed[1], &work[1], &iinfo); if (iinfo != 0) { *info = 4; - return 0; + return; } } @@ -1268,7 +1269,7 @@ static real c_b39 = 1.f; } } - return 0; + return; /* End of SLATME */ diff --git a/lapack-netlib/TESTING/MATGEN/slatmr.c b/lapack-netlib/TESTING/MATGEN/slatmr.c index 9d416d23f5..01de66f54d 100644 --- a/lapack-netlib/TESTING/MATGEN/slatmr.c +++ b/lapack-netlib/TESTING/MATGEN/slatmr.c @@ -979,7 +979,7 @@ static integer c__1 = 1; /* > \ingroup real_matgen */ /* ===================================================================== */ -/* Subroutine */ int slatmr_(integer *m, integer *n, char *dist, integer * +/* Subroutine */ void slatmr_(integer *m, integer *n, char *dist, integer * iseed, char *sym, real *d__, integer *mode, real *cond, real *dmax__, char *rsign, char *grade, real *dl, integer *model, real *condl, real *dr, integer *moder, real *condr, char *pivtng, integer *ipivot, @@ -998,13 +998,13 @@ static integer c__1 = 1; integer ipack; extern logical lsame_(char *, char *); real tempa[1]; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer iisub, idist, jjsub, mnmin; logical dzero; integer mnsub; real onorm; integer mxsub, npvts; - extern /* Subroutine */ int slatm1_(integer *, real *, integer *, integer + extern /* Subroutine */ void slatm1_(integer *, real *, integer *, integer *, integer *, real *, integer *, integer *); extern real slatm2_(integer *, integer *, integer *, integer *, integer *, integer *, integer *, integer *, real *, integer *, real *, real @@ -1017,7 +1017,7 @@ static integer c__1 = 1; integer *, real *), slange_(char *, integer *, integer *, real *, integer *, real *); logical fulbnd; - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char 
*, integer *, ftnlen); logical badpvt; extern real slansb_(char *, char *, integer *, integer *, real *, integer *, real *); @@ -1057,7 +1057,7 @@ static integer c__1 = 1; /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Decode DIST */ @@ -1251,8 +1251,8 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); - xerbla_("SLATMR", &i__1); - return 0; + xerbla_("SLATMR", &i__1, 6); + return; } /* Decide if we can pivot consistently */ @@ -1278,7 +1278,7 @@ static integer c__1 = 1; slatm1_(mode, cond, &irsign, &idist, &iseed[1], &d__[1], &mnmin, info); if (*info != 0) { *info = 1; - return 0; + return; } if (*mode != 0 && *mode != -6 && *mode != 6) { @@ -1294,7 +1294,7 @@ static integer c__1 = 1; } if (temp == 0.f && *dmax__ != 0.f) { *info = 2; - return 0; + return; } if (temp != 0.f) { alpha = *dmax__ / temp; @@ -1315,7 +1315,7 @@ static integer c__1 = 1; slatm1_(model, condl, &c__0, &idist, &iseed[1], &dl[1], m, info); if (*info != 0) { *info = 3; - return 0; + return; } } @@ -1325,7 +1325,7 @@ static integer c__1 = 1; slatm1_(moder, condr, &c__0, &idist, &iseed[1], &dr[1], n, info); if (*info != 0) { *info = 4; - return 0; + return; } } @@ -1814,7 +1814,7 @@ static integer c__1 = 1; /* Desired scaling impossible */ *info = 5; - return 0; + return; } else if (*anorm > 1.f && onorm < 1.f || *anorm < 1.f && onorm > 1.f) { @@ -1887,6 +1887,6 @@ static integer c__1 = 1; /* End of SLATMR */ - return 0; + return; } /* slatmr_ */ diff --git a/lapack-netlib/TESTING/MATGEN/slatms.c b/lapack-netlib/TESTING/MATGEN/slatms.c index ade2391cca..8e85f5db8a 100644 --- a/lapack-netlib/TESTING/MATGEN/slatms.c +++ b/lapack-netlib/TESTING/MATGEN/slatms.c @@ -833,7 +833,7 @@ static logical c_false = FALSE_; /* > \ingroup real_matgen */ /* ===================================================================== */ -/* Subroutine */ int slatms_(integer *m, integer *n, char *dist, integer * +/* Subroutine */ void slatms_(integer *m, integer *n, char 
*dist, integer * iseed, char *sym, real *d__, integer *mode, real *cond, real *dmax__, integer *kl, integer *ku, char *pack, real *a, integer *lda, real * work, integer *info) @@ -853,22 +853,22 @@ static logical c_false = FALSE_; integer ipack, ioffg; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer idist, mnmin, iskew; real extra, dummy; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), slatm1_(integer *, real *, integer *, integer *, integer *, real *, integer *, integer *); integer ic, jc, nc, il, iendch, ir, jr, ipackg, mr; - extern /* Subroutine */ int slagge_(integer *, integer *, integer *, + extern /* Subroutine */ void slagge_(integer *, integer *, integer *, integer *, real *, real *, integer *, integer *, real *, integer * ); integer minlda; - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern real slarnd_(integer *, integer *); logical iltemp, givens; integer ioffst, irsign; - extern /* Subroutine */ int slartg_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slartg_(real *, real *, real *, real *, real * ), slaset_(char *, integer *, integer *, real *, real *, real *, integer *), slagsy_(integer *, integer *, real *, real *, integer *, integer *, real *, integer *), slarot_(logical *, @@ -904,7 +904,7 @@ static logical c_false = FALSE_; /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Decode DIST */ @@ -1039,8 +1039,8 @@ static logical c_false = FALSE_; if (*info != 0) { i__1 = -(*info); - xerbla_("SLATMS", &i__1); - return 0; + xerbla_("SLATMS", &i__1, 6); + return; } /* Initialize random number generator */ @@ -1061,7 +1061,7 @@ static logical 
c_false = FALSE_; slatm1_(mode, cond, &irsign, &idist, &iseed[1], &d__[1], &mnmin, &iinfo); if (iinfo != 0) { *info = 1; - return 0; + return; } /* Choose Top-Down if D is (apparently) increasing, */ @@ -1090,7 +1090,7 @@ static logical c_false = FALSE_; alpha = *dmax__ / temp; } else { *info = 2; - return 0; + return; } sscal_(&mnmin, &alpha, &d__[1], &c__1); @@ -1719,7 +1719,7 @@ static logical c_false = FALSE_; } if (iinfo != 0) { *info = 3; - return 0; + return; } } @@ -1881,7 +1881,7 @@ static logical c_false = FALSE_; } } - return 0; + return; /* End of SLATMS */ diff --git a/lapack-netlib/TESTING/MATGEN/slatmt.c b/lapack-netlib/TESTING/MATGEN/slatmt.c index b3185363b4..e34bb8c03e 100644 --- a/lapack-netlib/TESTING/MATGEN/slatmt.c +++ b/lapack-netlib/TESTING/MATGEN/slatmt.c @@ -843,7 +843,7 @@ static logical c_false = FALSE_; /* > \ingroup real_matgen */ /* ===================================================================== */ -/* Subroutine */ int slatmt_(integer *m, integer *n, char *dist, integer * +/* Subroutine */ void slatmt_(integer *m, integer *n, char *dist, integer * iseed, char *sym, real *d__, integer *mode, real *cond, real *dmax__, integer *rank, integer *kl, integer *ku, char *pack, real *a, integer *lda, real *work, integer *info) @@ -863,22 +863,22 @@ static logical c_false = FALSE_; integer ipack, ioffg; extern logical lsame_(char *, char *); integer iinfo; - extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *); + extern /* Subroutine */ void sscal_(integer *, real *, real *, integer *); integer idist, mnmin, iskew; real extra, dummy; - extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + extern /* Subroutine */ void scopy_(integer *, real *, integer *, real *, integer *), slatm7_(integer *, real *, integer *, integer *, integer *, real *, integer *, integer *, integer *); integer ic, jc, nc, il, iendch, ir, jr, ipackg, mr; - extern /* Subroutine */ int slagge_(integer *, integer *, integer *, + 
extern /* Subroutine */ void slagge_(integer *, integer *, integer *, integer *, real *, real *, integer *, integer *, real *, integer * ); integer minlda; - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern real slarnd_(integer *, integer *); integer ioffst, irsign; logical givens, iltemp; - extern /* Subroutine */ int slartg_(real *, real *, real *, real *, real * + extern /* Subroutine */ void slartg_(real *, real *, real *, real *, real * ), slaset_(char *, integer *, integer *, real *, real *, real *, integer *), slagsy_(integer *, integer *, real *, real *, integer *, integer *, real *, integer *), slarot_(logical *, @@ -914,7 +914,7 @@ static logical c_false = FALSE_; /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Decode DIST */ @@ -1049,8 +1049,8 @@ static logical c_false = FALSE_; if (*info != 0) { i__1 = -(*info); - xerbla_("SLATMT", &i__1); - return 0; + xerbla_("SLATMT", &i__1, 6); + return; } /* Initialize random number generator */ @@ -1072,7 +1072,7 @@ static logical c_false = FALSE_; iinfo); if (iinfo != 0) { *info = 1; - return 0; + return; } /* Choose Top-Down if D is (apparently) increasing, */ @@ -1101,7 +1101,7 @@ static logical c_false = FALSE_; alpha = *dmax__ / temp; } else { *info = 2; - return 0; + return; } sscal_(rank, &alpha, &d__[1], &c__1); @@ -1730,7 +1730,7 @@ static logical c_false = FALSE_; } if (iinfo != 0) { *info = 3; - return 0; + return; } } @@ -1892,7 +1892,7 @@ static logical c_false = FALSE_; } } - return 0; + return; /* End of SLATMT */ diff --git a/lapack-netlib/TESTING/MATGEN/zlagge.c b/lapack-netlib/TESTING/MATGEN/zlagge.c index 70651f8502..bfa33cf345 100644 --- a/lapack-netlib/TESTING/MATGEN/zlagge.c +++ b/lapack-netlib/TESTING/MATGEN/zlagge.c @@ -628,7 +628,7 @@ static integer c__1 = 1; /* > \ingroup complex16_matgen */ /* ===================================================================== */ -/* 
Subroutine */ int zlagge_(integer *m, integer *n, integer *kl, integer *ku, +/* Subroutine */ void zlagge_(integer *m, integer *n, integer *kl, integer *ku, doublereal *d__, doublecomplex *a, integer *lda, integer *iseed, doublecomplex *work, integer *info) { @@ -639,7 +639,7 @@ static integer c__1 = 1; /* Local variables */ integer i__, j; - extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgerc_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zgemv_(char *, integer *, integer *, @@ -648,7 +648,8 @@ static integer c__1 = 1; extern doublereal dznrm2_(integer *, doublecomplex *, integer *); doublecomplex wa, wb; doublereal wn; - extern /* Subroutine */ int xerbla_(char *, integer *), zlacgv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlacgv_( integer *, doublecomplex *, integer *), zlarnv_(integer *, integer *, integer *, doublecomplex *); doublecomplex tau; @@ -688,8 +689,8 @@ static integer c__1 = 1; } if (*info < 0) { i__1 = -(*info); - xerbla_("ZLAGGE", &i__1); - return 0; + xerbla_("ZLAGGE", &i__1, 6); + return; } /* initialize A to diagonal matrix */ @@ -715,7 +716,7 @@ static integer c__1 = 1; /* Quick exit if the user wants a diagonal matrix */ if (*kl == 0 && *ku == 0) { - return 0; + return; } /* pre- and post-multiply A by random unitary matrices */ @@ -1026,7 +1027,7 @@ static integer c__1 = 1; } /* L70: */ } - return 0; + return; /* End of ZLAGGE */ diff --git a/lapack-netlib/TESTING/MATGEN/zlaghe.c b/lapack-netlib/TESTING/MATGEN/zlaghe.c index b781dc9949..06a8e31a3a 100644 --- a/lapack-netlib/TESTING/MATGEN/zlaghe.c +++ b/lapack-netlib/TESTING/MATGEN/zlaghe.c @@ -616,7 +616,7 @@ static integer c__1 = 1; /* > \ingroup complex16_matgen */ /* ===================================================================== */ -/* 
Subroutine */ int zlaghe_(integer *n, integer *k, doublereal *d__, +/* Subroutine */ void zlaghe_(integer *n, integer *k, doublereal *d__, doublecomplex *a, integer *lda, integer *iseed, doublecomplex *work, integer *info) { @@ -626,18 +626,18 @@ static integer c__1 = 1; doublecomplex z__1, z__2, z__3, z__4; /* Local variables */ - extern /* Subroutine */ int zher2_(char *, integer *, doublecomplex *, + extern /* Subroutine */ void zher2_(char *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer i__, j; doublecomplex alpha; - extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgerc_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zscal_(integer *, doublecomplex *, doublecomplex *, integer *); extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zhemv_(char *, integer *, doublecomplex *, doublecomplex *, @@ -648,7 +648,8 @@ static integer c__1 = 1; extern doublereal dznrm2_(integer *, doublecomplex *, integer *); doublecomplex wa, wb; doublereal wn; - extern /* Subroutine */ int xerbla_(char *, integer *), zlarnv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlarnv_( integer *, integer *, integer *, doublecomplex *); doublecomplex tau; @@ -683,8 +684,8 @@ static integer c__1 = 1; } if (*info < 0) { i__1 = -(*info); - xerbla_("ZLAGHE", &i__1); - return 0; + xerbla_("ZLAGHE", &i__1, 6); + return; } /* initialize lower triangle of A to diagonal matrix */ @@ -862,7 +863,7 @@ static integer c__1 = 1; } /* L80: */ } - 
return 0; + return; /* End of ZLAGHE */ diff --git a/lapack-netlib/TESTING/MATGEN/zlagsy.c b/lapack-netlib/TESTING/MATGEN/zlagsy.c index 7f29344a92..431522ebec 100644 --- a/lapack-netlib/TESTING/MATGEN/zlagsy.c +++ b/lapack-netlib/TESTING/MATGEN/zlagsy.c @@ -616,7 +616,7 @@ static integer c__1 = 1; /* > \ingroup complex16_matgen */ /* ===================================================================== */ -/* Subroutine */ int zlagsy_(integer *n, integer *k, doublereal *d__, +/* Subroutine */ void zlagsy_(integer *n, integer *k, doublereal *d__, doublecomplex *a, integer *lda, integer *iseed, doublecomplex *work, integer *info) { @@ -629,13 +629,13 @@ static integer c__1 = 1; /* Local variables */ integer i__, j; doublecomplex alpha; - extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgerc_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zscal_(integer *, doublecomplex *, doublecomplex *, integer *); extern /* Double Complex */ VOID zdotc_(doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *), zaxpy_(integer *, doublecomplex *, doublecomplex *, integer *, @@ -646,7 +646,8 @@ static integer c__1 = 1; integer ii, jj; doublecomplex wa, wb; doublereal wn; - extern /* Subroutine */ int xerbla_(char *, integer *), zlacgv_( + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlacgv_( integer *, doublecomplex *, integer *), zlarnv_(integer *, integer *, integer *, doublecomplex *); doublecomplex tau; @@ -682,8 +683,8 @@ static integer c__1 = 1; } if (*info < 0) { i__1 = -(*info); - xerbla_("ZLAGSY", &i__1); - return 0; + 
xerbla_("ZLAGSY", &i__1, 6); + return; } /* initialize lower triangle of A to diagonal matrix */ @@ -913,7 +914,7 @@ static integer c__1 = 1; } /* L120: */ } - return 0; + return; /* End of ZLAGSY */ diff --git a/lapack-netlib/TESTING/MATGEN/zlahilb.c b/lapack-netlib/TESTING/MATGEN/zlahilb.c index c08c751969..0e9cc1c4a4 100644 --- a/lapack-netlib/TESTING/MATGEN/zlahilb.c +++ b/lapack-netlib/TESTING/MATGEN/zlahilb.c @@ -646,7 +646,7 @@ static doublecomplex c_b6 = {0.,0.}; /* > \ingroup complex16_matgen */ /* ===================================================================== */ -/* Subroutine */ int zlahilb_(integer *n, integer *nrhs, doublecomplex *a, +/* Subroutine */ void zlahilb_(integer *n, integer *nrhs, doublecomplex *a, integer *lda, doublecomplex *x, integer *ldx, doublecomplex *b, integer *ldb, doublereal *work, integer *info, char *path) { @@ -671,9 +671,9 @@ static doublecomplex c_b6 = {0.,0.}; integer i__, j, m, r__; char c2[2]; integer ti, tm; - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern logical lsamen_(integer *, char *, char *); - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); doublecomplex tmp; @@ -723,8 +723,8 @@ static doublecomplex c_b6 = {0.,0.}; } if (*info < 0) { i__1 = -(*info); - xerbla_("ZLAHILB", &i__1); - return 0; + xerbla_("ZLAHILB", &i__1, 7); + return; } if (*n > 6) { *info = 1; @@ -831,6 +831,6 @@ static doublecomplex c_b6 = {0.,0.}; } } } - return 0; + return; } /* zlahilb_ */ diff --git a/lapack-netlib/TESTING/MATGEN/zlakf2.c b/lapack-netlib/TESTING/MATGEN/zlakf2.c index ea23907530..f7a90eef7f 100644 --- a/lapack-netlib/TESTING/MATGEN/zlakf2.c +++ b/lapack-netlib/TESTING/MATGEN/zlakf2.c @@ -616,7 +616,7 @@ static doublecomplex c_b1 = {0.,0.}; /* > \ingroup complex16_matgen */ /* 
===================================================================== */ -/* Subroutine */ int zlakf2_(integer *m, integer *n, doublecomplex *a, +/* Subroutine */ void zlakf2_(integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, doublecomplex *d__, doublecomplex *e, doublecomplex *z__, integer *ldz) { @@ -627,7 +627,7 @@ static doublecomplex c_b1 = {0.,0.}; /* Local variables */ integer i__, j, l, ik, jk, mn; - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); integer mn2; @@ -739,7 +739,7 @@ static doublecomplex c_b1 = {0.,0.}; /* L90: */ } - return 0; + return; /* End of ZLAKF2 */ diff --git a/lapack-netlib/TESTING/MATGEN/zlarge.c b/lapack-netlib/TESTING/MATGEN/zlarge.c index 9084df6acf..4b06a818d3 100644 --- a/lapack-netlib/TESTING/MATGEN/zlarge.c +++ b/lapack-netlib/TESTING/MATGEN/zlarge.c @@ -601,7 +601,7 @@ static integer c__1 = 1; /* > \ingroup complex16_matgen */ /* ===================================================================== */ -/* Subroutine */ int zlarge_(integer *n, doublecomplex *a, integer *lda, +/* Subroutine */ void zlarge_(integer *n, doublecomplex *a, integer *lda, integer *iseed, doublecomplex *work, integer *info) { /* System generated locals */ @@ -611,7 +611,7 @@ static integer c__1 = 1; /* Local variables */ integer i__; - extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgerc_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zgemv_(char *, integer *, integer *, @@ -620,7 +620,8 @@ static integer c__1 = 1; extern doublereal dznrm2_(integer *, doublecomplex *, integer *); doublecomplex wa, wb; doublereal wn; - extern /* Subroutine */ int xerbla_(char *, integer *), zlarnv_( + 
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); + extern void zlarnv_( integer *, integer *, integer *, doublecomplex *); doublecomplex tau; @@ -652,8 +653,8 @@ static integer c__1 = 1; } if (*info < 0) { i__1 = -(*info); - xerbla_("ZLARGE", &i__1); - return 0; + xerbla_("ZLARGE", &i__1, 6); + return; } /* pre- and post-multiply A by random unitary matrix */ @@ -704,7 +705,7 @@ static integer c__1 = 1; * a_dim1 + 1], lda); /* L10: */ } - return 0; + return; /* End of ZLARGE */ diff --git a/lapack-netlib/TESTING/MATGEN/zlaror.c b/lapack-netlib/TESTING/MATGEN/zlaror.c index eeb9f4fcfe..6ada57b8a1 100644 --- a/lapack-netlib/TESTING/MATGEN/zlaror.c +++ b/lapack-netlib/TESTING/MATGEN/zlaror.c @@ -673,7 +673,7 @@ t by U and the right by UC> SIDE = 'T' Multiply A on the left by U and t /* > \ingroup complex16_matgen */ /* ===================================================================== */ -/* Subroutine */ int zlaror_(char *side, char *init, integer *m, integer *n, +/* Subroutine */ void zlaror_(char *side, char *init, integer *m, integer *n, doublecomplex *a, integer *lda, integer *iseed, doublecomplex *x, integer *info) { @@ -687,25 +687,25 @@ t by U and the right by UC> SIDE = 'T' Multiply A on the left by U and t integer irow, j; extern logical lsame_(char *, char *); doublecomplex csign; - extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgerc_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *), zscal_(integer *, doublecomplex *, doublecomplex *, integer *); integer ixfrm; - extern /* Subroutine */ int zgemv_(char *, integer *, integer *, + extern /* Subroutine */ void zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer itype, nxfrm; doublereal xnorm; extern doublereal dznrm2_(integer *, doublecomplex *, 
integer *); - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); doublereal factor; - extern /* Subroutine */ int zlacgv_(integer *, doublecomplex *, integer *) + extern /* Subroutine */ void zlacgv_(integer *, doublecomplex *, integer *) ; //extern /* Double Complex */ VOID zlarnd_(doublecomplex *, integer *, extern doublecomplex zlarnd_(integer *, integer *); - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); doublecomplex xnorms; @@ -729,7 +729,7 @@ t by U and the right by UC> SIDE = 'T' Multiply A on the left by U and t /* Function Body */ *info = 0; if (*n == 0 || *m == 0) { - return 0; + return; } itype = 0; @@ -756,8 +756,8 @@ t by U and the right by UC> SIDE = 'T' Multiply A on the left by U and t } if (*info != 0) { i__1 = -(*info); - xerbla_("ZLAROR", &i__1); - return 0; + xerbla_("ZLAROR", &i__1, 6); + return; } if (itype == 1) { @@ -821,8 +821,8 @@ t by U and the right by UC> SIDE = 'T' Multiply A on the left by U and t if (abs(factor) < 1e-20) { *info = 1; i__2 = -(*info); - xerbla_("ZLAROR", &i__2); - return 0; + xerbla_("ZLAROR", &i__2, 6); + return; } else { factor = 1. 
/ factor; } @@ -905,7 +905,7 @@ t by U and the right by UC> SIDE = 'T' Multiply A on the left by U and t /* L60: */ } } - return 0; + return; /* End of ZLAROR */ diff --git a/lapack-netlib/TESTING/MATGEN/zlarot.c b/lapack-netlib/TESTING/MATGEN/zlarot.c index da7207df2c..6b2e88b325 100644 --- a/lapack-netlib/TESTING/MATGEN/zlarot.c +++ b/lapack-netlib/TESTING/MATGEN/zlarot.c @@ -740,7 +740,7 @@ static integer c__8 = 8; /* > \ingroup complex16_matgen */ /* ===================================================================== */ -/* Subroutine */ int zlarot_(logical *lrows, logical *lleft, logical *lright, +/* Subroutine */ void zlarot_(logical *lrows, logical *lleft, logical *lright, integer *nl, doublecomplex *c__, doublecomplex *s, doublecomplex *a, integer *lda, doublecomplex *xleft, doublecomplex *xright) { @@ -753,7 +753,7 @@ static integer c__8 = 8; doublecomplex tempx; integer ix, iy, nt; doublecomplex xt[2], yt[2]; - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); integer iyt; @@ -805,12 +805,12 @@ static integer c__8 = 8; /* Check for errors */ if (*nl < nt) { - xerbla_("ZLAROT", &c__4); - return 0; + xerbla_("ZLAROT", &c__4, 6); + return; } if (*lda <= 0 || ! 
(*lrows) && *lda < *nl - nt) { - xerbla_("ZLAROT", &c__8); - return 0; + xerbla_("ZLAROT", &c__8, 6); + return; } /* Rotate */ @@ -888,7 +888,7 @@ static integer c__8 = 8; a[i__1].r = yt[i__2].r, a[i__1].i = yt[i__2].i; } - return 0; + return; /* End of ZLAROT */ diff --git a/lapack-netlib/TESTING/MATGEN/zlatm1.c b/lapack-netlib/TESTING/MATGEN/zlatm1.c index 4b84fcc595..c75787730d 100644 --- a/lapack-netlib/TESTING/MATGEN/zlatm1.c +++ b/lapack-netlib/TESTING/MATGEN/zlatm1.c @@ -648,7 +648,7 @@ static integer c__3 = 3; /* > \ingroup complex16_matgen */ /* ===================================================================== */ -/* Subroutine */ int zlatm1_(integer *mode, doublereal *cond, integer *irsign, +/* Subroutine */ void zlatm1_(integer *mode, doublereal *cond, integer *irsign, integer *idist, integer *iseed, doublecomplex *d__, integer *n, integer *info) { @@ -663,11 +663,11 @@ static integer c__3 = 3; doublereal alpha; doublecomplex ctemp; extern doublereal dlaran_(integer *); - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); //extern /* Double Complex */ VOID zlarnd_(doublecomplex *, integer *, extern doublecomplex zlarnd_(integer *, integer *); - extern /* Subroutine */ int zlarnv_(integer *, integer *, integer *, + extern /* Subroutine */ void zlarnv_(integer *, integer *, integer *, doublecomplex *); @@ -692,7 +692,7 @@ static integer c__3 = 3; /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Set INFO if an error */ @@ -712,8 +712,8 @@ static integer c__3 = 3; if (*info != 0) { i__1 = -(*info); - xerbla_("ZLATM1", &i__1); - return 0; + xerbla_("ZLATM1", &i__1, 6); + return; } /* Compute D according to COND and MODE */ @@ -848,7 +848,7 @@ static integer c__3 = 3; } - return 0; + return; /* End of ZLATM1 */ diff --git a/lapack-netlib/TESTING/MATGEN/zlatm5.c b/lapack-netlib/TESTING/MATGEN/zlatm5.c index 047a633c44..753ee0ce68 100644 --- 
a/lapack-netlib/TESTING/MATGEN/zlatm5.c +++ b/lapack-netlib/TESTING/MATGEN/zlatm5.c @@ -780,7 +780,7 @@ static doublecomplex c_b5 = {20.,0.}; /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ int zlatm5_(integer *prtype, integer *m, integer *n, +/* Subroutine */ void zlatm5_(integer *prtype, integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, doublecomplex *c__, integer *ldc, doublecomplex *d__, integer *ldd, doublecomplex *e, integer *lde, doublecomplex *f, integer *ldf, @@ -797,7 +797,7 @@ static doublecomplex c_b5 = {20.,0.}; /* Local variables */ integer i__, j, k; doublecomplex imeps, reeps; - extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + extern /* Subroutine */ void zgemm_(char *, char *, integer *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); @@ -1281,6 +1281,6 @@ static doublecomplex c_b5 = {20.,0.}; /* End of ZLATM5 */ - return 0; + return; } /* zlatm5_ */ diff --git a/lapack-netlib/TESTING/MATGEN/zlatm6.c b/lapack-netlib/TESTING/MATGEN/zlatm6.c index 6e25e50231..e20ec7092e 100644 --- a/lapack-netlib/TESTING/MATGEN/zlatm6.c +++ b/lapack-netlib/TESTING/MATGEN/zlatm6.c @@ -687,7 +687,7 @@ static integer c__24 = 24; /* > \ingroup complex16_matgen */ /* ===================================================================== */ -/* Subroutine */ int zlatm6_(integer *type__, integer *n, doublecomplex *a, +/* Subroutine */ void zlatm6_(integer *type__, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, doublecomplex *x, integer *ldx, doublecomplex *y, integer *ldy, doublecomplex *alpha, doublecomplex * beta, doublecomplex *wx, doublecomplex *wy, doublereal *s, doublereal @@ -705,7 +705,7 @@ static integer c__24 = 24; integer i__, j; doublecomplex z__[64] /* was [8][8] */; doublereal rwork[50]; - extern /* Subroutine */ int 
zlakf2_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zlakf2_(integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, integer *), zgesvd_(char *, char *, integer *, integer *, doublecomplex *, integer *, doublereal *, @@ -934,7 +934,7 @@ static integer c__24 = 24; &c__1, &work[2], &c__24, &rwork[8], &info); dif[5] = rwork[7]; - return 0; + return; /* End of ZLATM6 */ diff --git a/lapack-netlib/TESTING/MATGEN/zlatme.c b/lapack-netlib/TESTING/MATGEN/zlatme.c index 3e4c1f3a29..ded6ca2145 100644 --- a/lapack-netlib/TESTING/MATGEN/zlatme.c +++ b/lapack-netlib/TESTING/MATGEN/zlatme.c @@ -812,7 +812,7 @@ static integer c__5 = 5; /* > \ingroup complex16_matgen */ /* ===================================================================== */ -/* Subroutine */ int zlatme_(integer *n, char *dist, integer *iseed, +/* Subroutine */ void zlatme_(integer *n, char *dist, integer *iseed, doublecomplex *d__, integer *mode, doublereal *cond, doublecomplex * dmax__, char *rsign, char *upper, char *sim, doublereal *ds, integer * modes, doublereal *conds, integer *kl, integer *ku, doublereal *anorm, @@ -833,26 +833,26 @@ static integer c__5 = 5; integer iinfo; doublereal tempa[1]; integer icols; - extern /* Subroutine */ int zgerc_(integer *, integer *, doublecomplex *, + extern /* Subroutine */ void zgerc_(integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); integer idist; - extern /* Subroutine */ int zscal_(integer *, doublecomplex *, + extern /* Subroutine */ void zscal_(integer *, doublecomplex *, doublecomplex *, integer *), zgemv_(char *, integer *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *); integer irows; - extern /* Subroutine */ int zcopy_(integer *, doublecomplex *, integer *, + extern /* Subroutine */ void zcopy_(integer *, doublecomplex 
*, integer *, doublecomplex *, integer *), dlatm1_(integer *, doublereal *, integer *, integer *, integer *, doublereal *, integer *, integer *), zlatm1_(integer *, doublereal *, integer *, integer *, integer *, doublecomplex *, integer *, integer *); integer ic, jc, ir; doublereal ralpha; - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); - extern /* Subroutine */ int zdscal_(integer *, doublereal *, + extern /* Subroutine */ void zdscal_(integer *, doublereal *, doublecomplex *, integer *), zlarge_(integer *, doublecomplex *, integer *, integer *, doublecomplex *, integer *), zlarfg_( integer *, doublecomplex *, doublecomplex *, integer *, @@ -861,10 +861,10 @@ static integer c__5 = 5; extern doublecomplex zlarnd_(integer *, integer *); integer irsign; - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *); integer iupper; - extern /* Subroutine */ int zlarnv_(integer *, integer *, integer *, + extern /* Subroutine */ void zlarnv_(integer *, integer *, integer *, doublecomplex *); doublecomplex xnorms; integer jcr; @@ -898,7 +898,7 @@ static integer c__5 = 5; /* Quick return if possible */ if (*n == 0) { - return 0; + return; } /* Decode DIST */ @@ -990,8 +990,8 @@ static integer c__5 = 5; if (*info != 0) { i__1 = -(*info); - xerbla_("ZLATME", &i__1); - return 0; + xerbla_("ZLATME", &i__1, 6); + return; } /* Initialize random number generator */ @@ -1012,7 +1012,7 @@ static integer c__5 = 5; zlatm1_(mode, cond, &irsign, &idist, &iseed[1], &d__[1], n, &iinfo); if (iinfo != 0) { *info = 1; - return 0; + return; } if (*mode != 0 && abs(*mode) != 6) { @@ -1032,7 +1032,7 @@ static integer c__5 = 5; alpha.r = z__1.r, alpha.i = z__1.i; } else { *info = 2; - 
return 0; + return; } zscal_(n, &alpha, &d__[1], &c__1); @@ -1069,7 +1069,7 @@ static integer c__5 = 5; dlatm1_(modes, conds, &c__0, &c__0, &iseed[1], &ds[1], n, &iinfo); if (iinfo != 0) { *info = 3; - return 0; + return; } /* Multiply by V and V' */ @@ -1077,7 +1077,7 @@ static integer c__5 = 5; zlarge_(n, &a[a_offset], lda, &iseed[1], &work[1], &iinfo); if (iinfo != 0) { *info = 4; - return 0; + return; } /* Multiply by S and (1/S) */ @@ -1090,7 +1090,7 @@ static integer c__5 = 5; zdscal_(n, &d__1, &a[j * a_dim1 + 1], &c__1); } else { *info = 5; - return 0; + return; } /* L50: */ } @@ -1100,7 +1100,7 @@ static integer c__5 = 5; zlarge_(n, &a[a_offset], lda, &iseed[1], &work[1], &iinfo); if (iinfo != 0) { *info = 4; - return 0; + return; } } @@ -1214,7 +1214,7 @@ static integer c__5 = 5; } } - return 0; + return; /* End of ZLATME */ diff --git a/lapack-netlib/TESTING/MATGEN/zlatmr.c b/lapack-netlib/TESTING/MATGEN/zlatmr.c index c0c2d0b3b0..c42a25f2ba 100644 --- a/lapack-netlib/TESTING/MATGEN/zlatmr.c +++ b/lapack-netlib/TESTING/MATGEN/zlatmr.c @@ -999,7 +999,7 @@ static integer c__1 = 1; /* > \ingroup complex16_matgen */ /* ===================================================================== */ -/* Subroutine */ int zlatmr_(integer *m, integer *n, char *dist, integer * +/* Subroutine */ void zlatmr_(integer *m, integer *n, char *dist, integer * iseed, char *sym, doublecomplex *d__, integer *mode, doublereal *cond, doublecomplex *dmax__, char *rsign, char *grade, doublecomplex *dl, integer *model, doublereal *condl, doublecomplex *dr, integer *moder, @@ -1024,7 +1024,7 @@ static integer c__1 = 1; integer mnsub; doublereal onorm; integer mxsub, npvts; - extern /* Subroutine */ int zlatm1_(integer *, doublereal *, integer *, + extern /* Subroutine */ void zlatm1_(integer *, doublereal *, integer *, integer *, integer *, doublecomplex *, integer *, integer *); extern /* Double Complex */ VOID zlatm2_(doublecomplex *, integer *, integer *, integer *, integer *, 
integer *, integer *, integer *, @@ -1039,11 +1039,11 @@ static integer c__1 = 1; logical fulbnd; extern doublereal zlangb_(char *, integer *, integer *, integer *, doublecomplex *, integer *, doublereal *); - extern /* Subroutine */ int xerbla_(char *, integer *); + extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen); logical badpvt; extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, integer *, doublereal *); - extern /* Subroutine */ int zdscal_(integer *, doublereal *, + extern /* Subroutine */ void zdscal_(integer *, doublereal *, doublecomplex *, integer *); extern doublereal zlansb_(char *, char *, integer *, integer *, doublecomplex *, integer *, doublereal *); @@ -1083,7 +1083,7 @@ static integer c__1 = 1; /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Decode DIST */ @@ -1284,8 +1284,8 @@ static integer c__1 = 1; if (*info != 0) { i__1 = -(*info); - xerbla_("ZLATMR", &i__1); - return 0; + xerbla_("ZLATMR", &i__1, 6); + return; } /* Decide if we can pivot consistently */ @@ -1311,7 +1311,7 @@ static integer c__1 = 1; zlatm1_(mode, cond, &irsign, &idist, &iseed[1], &d__[1], &mnmin, info); if (*info != 0) { *info = 1; - return 0; + return; } if (*mode != 0 && *mode != -6 && *mode != 6) { @@ -1327,7 +1327,7 @@ static integer c__1 = 1; } if (temp == 0. && (dmax__->r != 0. || dmax__->i != 0.)) { *info = 2; - return 0; + return; } if (temp != 0.) { z__1.r = dmax__->r / temp, z__1.i = dmax__->i / temp; @@ -1367,7 +1367,7 @@ static integer c__1 = 1; zlatm1_(model, condl, &c__0, &idist, &iseed[1], &dl[1], m, info); if (*info != 0) { *info = 3; - return 0; + return; } } @@ -1377,7 +1377,7 @@ static integer c__1 = 1; zlatm1_(moder, condr, &c__0, &idist, &iseed[1], &dr[1], n, info); if (*info != 0) { *info = 4; - return 0; + return; } } @@ -2032,7 +2032,7 @@ static integer c__1 = 1; /* Desired scaling impossible */ *info = 5; - return 0; + return; } else if (*anorm > 1. && onorm < 1. || *anorm < 1. 
&& onorm > 1.) { @@ -2104,6 +2104,6 @@ static integer c__1 = 1; /* End of ZLATMR */ - return 0; + return; } /* zlatmr_ */ diff --git a/lapack-netlib/TESTING/MATGEN/zlatms.c b/lapack-netlib/TESTING/MATGEN/zlatms.c index ca0191a135..f0bb66a441 100644 --- a/lapack-netlib/TESTING/MATGEN/zlatms.c +++ b/lapack-netlib/TESTING/MATGEN/zlatms.c @@ -846,7 +846,7 @@ static logical c_false = FALSE_; /* > \ingroup complex16_matgen */ /* ===================================================================== */ -/* Subroutine */ int zlatms_(integer *m, integer *n, char *dist, integer * +/* Subroutine */ void zlatms_(integer *m, integer *n, char *dist, integer * iseed, char *sym, doublereal *d__, integer *mode, doublereal *cond, doublereal *dmax__, integer *kl, integer *ku, char *pack, doublecomplex *a, integer *lda, doublecomplex *work, integer *info) @@ -869,40 +869,41 @@ static logical c_false = FALSE_; integer ipack; doublereal realc; integer ioffg; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); integer iinfo; doublecomplex ctemp; integer idist, mnmin, iskew; doublecomplex extra, dummy; - extern /* Subroutine */ int dlatm1_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlatm1_(integer *, doublereal *, integer *, integer *, integer *, doublereal *, integer *, integer *); integer ic, jc, nc, il; doublecomplex ct; integer iendch, ir, jr, ipackg, mr, minlda; extern doublereal dlarnd_(integer *, integer *); doublecomplex st; - extern /* Subroutine */ int zlagge_(integer *, integer *, integer *, + extern /* Subroutine */ void zlagge_(integer *, integer *, integer *, integer *, doublereal *, doublecomplex *, integer *, integer *, doublecomplex *, integer *), zlaghe_(integer *, integer *, doublereal *, doublecomplex *, integer *, integer *, - doublecomplex *, integer *), xerbla_(char *, integer *); + 
doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); logical iltemp, givens; integer ioffst, irsign; //extern /* Double Complex */ VOID zlarnd_(doublecomplex *, integer *, extern doublecomplex zlarnd_(integer *, integer *); - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *), zlartg_(doublecomplex *, doublecomplex *, doublereal *, doublecomplex *, doublecomplex *); logical ilextr; - extern /* Subroutine */ int zlagsy_(integer *, integer *, doublereal *, + extern /* Subroutine */ void zlagsy_(integer *, integer *, doublereal *, doublecomplex *, integer *, integer *, doublecomplex *, integer *) ; logical topdwn; integer ir1, ir2, isympk; - extern /* Subroutine */ int zlarot_(logical *, logical *, logical *, + extern /* Subroutine */ void zlarot_(logical *, logical *, logical *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *); integer jch, llb, jkl, jku, uub; @@ -934,7 +935,7 @@ static logical c_false = FALSE_; /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Decode DIST */ @@ -1073,8 +1074,8 @@ static logical c_false = FALSE_; if (*info != 0) { i__1 = -(*info); - xerbla_("ZLATMS", &i__1); - return 0; + xerbla_("ZLATMS", &i__1, 6); + return; } /* Initialize random number generator */ @@ -1095,7 +1096,7 @@ static logical c_false = FALSE_; dlatm1_(mode, cond, &irsign, &idist, &iseed[1], &d__[1], &mnmin, &iinfo); if (iinfo != 0) { *info = 1; - return 0; + return; } /* Choose Top-Down if D is (apparently) increasing, */ @@ -1124,7 +1125,7 @@ static logical c_false = FALSE_; alpha = *dmax__ / temp; } else { *info = 2; - return 0; + return; } dscal_(&mnmin, &alpha, &d__[1], &c__1); @@ -2038,7 +2039,7 @@ static logical c_false = FALSE_; if (iinfo != 0) { *info = 3; - return 0; + return; } } @@ -2213,7 +2214,7 @@ 
static logical c_false = FALSE_; } } - return 0; + return; /* End of ZLATMS */ diff --git a/lapack-netlib/TESTING/MATGEN/zlatmt.c b/lapack-netlib/TESTING/MATGEN/zlatmt.c index 47279858b5..551b17628c 100644 --- a/lapack-netlib/TESTING/MATGEN/zlatmt.c +++ b/lapack-netlib/TESTING/MATGEN/zlatmt.c @@ -854,7 +854,7 @@ static logical c_false = FALSE_; /* > \ingroup complex16_matgen */ /* ===================================================================== */ -/* Subroutine */ int zlatmt_(integer *m, integer *n, char *dist, integer * +/* Subroutine */ void zlatmt_(integer *m, integer *n, char *dist, integer * iseed, char *sym, doublereal *d__, integer *mode, doublereal *cond, doublereal *dmax__, integer *rank, integer *kl, integer *ku, char * pack, doublecomplex *a, integer *lda, doublecomplex *work, integer * @@ -876,14 +876,14 @@ static logical c_false = FALSE_; doublecomplex s; doublereal alpha, angle, realc; integer ipack, ioffg; - extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + extern /* Subroutine */ void dscal_(integer *, doublereal *, doublereal *, integer *); extern logical lsame_(char *, char *); integer iinfo, idist, mnmin; doublecomplex extra; integer iskew; doublecomplex dummy, ztemp; - extern /* Subroutine */ int dlatm7_(integer *, doublereal *, integer *, + extern /* Subroutine */ void dlatm7_(integer *, doublereal *, integer *, integer *, integer *, doublereal *, integer *, integer *, integer *); integer ic, jc, nc, il; @@ -891,26 +891,27 @@ static logical c_false = FALSE_; integer iendch, ir, jr, ipackg, mr, minlda; extern doublereal dlarnd_(integer *, integer *); doublecomplex st; - extern /* Subroutine */ int zlagge_(integer *, integer *, integer *, + extern /* Subroutine */ void zlagge_(integer *, integer *, integer *, integer *, doublereal *, doublecomplex *, integer *, integer *, doublecomplex *, integer *), zlaghe_(integer *, integer *, doublereal *, doublecomplex *, integer *, integer *, - doublecomplex *, integer *), 
xerbla_(char *, integer *); + doublecomplex *, integer *); + extern int xerbla_(char *, integer *, ftnlen); integer ioffst, irsign; logical givens, iltemp; //extern /* Double Complex */ VOID zlarnd_(doublecomplex *, integer *, extern doublecomplex zlarnd_(integer *, integer *); - extern /* Subroutine */ int zlaset_(char *, integer *, integer *, + extern /* Subroutine */ void zlaset_(char *, integer *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *), zlartg_(doublecomplex *, doublecomplex *, doublereal *, doublecomplex *, doublecomplex *); logical ilextr; - extern /* Subroutine */ int zlagsy_(integer *, integer *, doublereal *, + extern /* Subroutine */ void zlagsy_(integer *, integer *, doublereal *, doublecomplex *, integer *, integer *, doublecomplex *, integer *) ; integer ir1, ir2, isympk; logical topdwn; - extern /* Subroutine */ int zlarot_(logical *, logical *, logical *, + extern /* Subroutine */ void zlarot_(logical *, logical *, logical *, integer *, doublecomplex *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, doublecomplex *); integer jch, llb, jkl, jku, uub; @@ -942,7 +943,7 @@ static logical c_false = FALSE_; /* Quick return if possible */ if (*m == 0 || *n == 0) { - return 0; + return; } /* Decode DIST */ @@ -1081,8 +1082,8 @@ static logical c_false = FALSE_; if (*info != 0) { i__1 = -(*info); - xerbla_("ZLATMT", &i__1); - return 0; + xerbla_("ZLATMT", &i__1, 6); + return; } /* Initialize random number generator */ @@ -1104,7 +1105,7 @@ static logical c_false = FALSE_; iinfo); if (iinfo != 0) { *info = 1; - return 0; + return; } /* Choose Top-Down if D is (apparently) increasing, */ @@ -1133,7 +1134,7 @@ static logical c_false = FALSE_; alpha = *dmax__ / temp; } else { *info = 2; - return 0; + return; } dscal_(rank, &alpha, &d__[1], &c__1); @@ -2046,7 +2047,7 @@ static logical c_false = FALSE_; if (iinfo != 0) { *info = 3; - return 0; + return; } } @@ -2221,7 +2222,7 @@ static logical c_false = FALSE_; } } - 
return 0; + return; /* End of ZLATMT */ diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index fd4e570484..1d44e94904 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -39,8 +39,12 @@ set(UNIT_SOURCES2 trti2/trti2_L.c ) +if (NOT RELAPACK_REPLACE) GenerateNamedObjects("${LAPACK_SOURCES}") GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" false "" "" false 3) +else() +GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" false "" "" false 3) +endif() GenerateNamedObjects("laswp/generic/laswp_k_4.c" "" "laswp_plus" false "" "" false 3) GenerateNamedObjects("laswp/generic/laswp_k_4.c" "MINUS" "laswp_minus" false "" "" false 3) @@ -113,4 +117,3 @@ GenerateCombinationObjects("${UNIT_SOURCES}" "UNIT" "N" "" 4) GenerateCombinationObjects("${UNIT_SOURCES2}" "UNIT" "N" "" 0 "" "" 3) add_library(lapack OBJECT ${OPENBLAS_SRC}) - diff --git a/lapack/getf2/getf2_k.c b/lapack/getf2/getf2_k.c index 073a0251dd..d29ed588e9 100644 --- a/lapack/getf2/getf2_k.c +++ b/lapack/getf2/getf2_k.c @@ -37,6 +37,7 @@ /*********************************************************************/ #include +#include #include "common.h" static FLOAT dp1 = 1.; @@ -99,7 +100,8 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, jp--; temp1 = *(b + jp); - if (temp1 != ZERO) { + //if (temp1 != ZERO) { + if (fabs(temp1) >= DBL_MIN ) { temp1 = dp1 / temp1; if (jp != j) { diff --git a/lapack/getf2/zgetf2_k.c b/lapack/getf2/zgetf2_k.c index 512adf8ba3..f3412f52ff 100644 --- a/lapack/getf2/zgetf2_k.c +++ b/lapack/getf2/zgetf2_k.c @@ -37,6 +37,7 @@ /*********************************************************************/ #include +#include #include "common.h" double fabs(double); @@ -105,8 +106,9 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, temp1 = *(b + jp * 2 + 0); temp2 = *(b + jp * 2 + 1); - if ((temp1 != ZERO) || (temp2 != ZERO)) { - + // if ((temp1 != ZERO) || (temp2 != ZERO)) { + if ((fabs(temp1) >= DBL_MIN) 
&& (fabs(temp2) >= DBL_MIN)) { + if (jp != j) { SWAP_K(j + 1, 0, 0, ZERO, ZERO, a + j * 2, lda, a + jp * 2, lda, NULL, 0); diff --git a/openblas.pc.in b/openblas.pc.in index ff849807c5..8ad6e8bee1 100644 --- a/openblas.pc.in +++ b/openblas.pc.in @@ -2,6 +2,6 @@ Name: openblas Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: ${version} URL: /~https://github.com/xianyi/OpenBLAS -Libs: -L${libdir} -lopenblas +Libs: -L${libdir} -lopenblas${libsuffix} Libs.private: ${extralib} Cflags: -I${includedir} diff --git a/param.h b/param.h index dc02147d8e..f1f5cbdad2 100644 --- a/param.h +++ b/param.h @@ -79,6 +79,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SBGEMM_DEFAULT_P 256 #define SBGEMM_DEFAULT_R 256 #define SBGEMM_DEFAULT_Q 256 +#define SBGEMM_ALIGN_K 1 // must be 2^x + #ifdef OPTERON #define SNUMOPT 4 @@ -2945,7 +2947,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500) +#if defined(MIPS64_GENERIC) || defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500) #define SNUMOPT 2 #define DNUMOPT 2 @@ -2953,7 +2955,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN (BLASLONG) 0x03fffUL -#if defined(HAVE_MSA) && !defined(NO_MSA) +#if defined(HAVE_MSA) #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -3365,6 +3367,8 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #elif defined(NEOVERSEV1) +#define SWITCH_RATIO 16 + #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3394,6 +3398,9 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #elif defined(NEOVERSEN2) +#undef SBGEMM_ALIGN_K +#define SBGEMM_ALIGN_K 4 + #undef SBGEMM_DEFAULT_UNROLL_M #undef SBGEMM_DEFAULT_UNROLL_N #define SBGEMM_DEFAULT_UNROLL_M 8 diff --git a/relapack/Makefile b/relapack/Makefile index ddf101bd15..056a0ee48a 100644 --- a/relapack/Makefile +++ b/relapack/Makefile @@ -1,53 +1,61 @@ TOPDIR = .. include $(TOPDIR)/Makefile.system - +ifeq ($(RELAPACK_REPLACE),0) +RELAPREFIX=RELAPACK_ +INCLALL=-DINCLUDE_ALL=0 +else +INCLALL=-DINCLUDE_ALL=1 +endif SRC = $(wildcard src/*.c) SRC1 = \ - src/slauum.c src/clauum.c src/dlauum.c src/zlauum.c \ - src/strtri.c src/dtrtri.c src/ctrtri.c src/ztrtri.c \ - src/spotrf.c src/dpotrf.c src/cpotrf.c src/zpotrf.c \ - src/sgetrf.c src/dgetrf.c src/cgetrf.c src/zgetrf.c + slauum.c clauum.c dlauum.c zlauum.c \ + strtri.c dtrtri.c ctrtri.c ztrtri.c \ + spotrf.c dpotrf.c cpotrf.c zpotrf.c \ + sgetrf.c dgetrf.c cgetrf.c zgetrf.c SRC2 = \ - src/cgbtrf.c src/cpbtrf.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \ - src/cgemmt.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \ - src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \ - src/chegst.c src/csytrf_rec2.c src/dtgsyl.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \ - src/chetrf.c src/csytrf_rook.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \ - 
src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/ztrsyl_rec2.c \ - src/chetrf_rook.c src/ctgsyl.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c \ - src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c \ - src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zsytrf.c + cgbtrf.c cpbtrf.c dsytrf_rec2.c sgbtrf.c ssytrf_rook.c zhegst.c zsytrf_rec2.c \ + cgemmt.c dgbtrf.c dsytrf_rook.c sgemmt.c ssytrf_rook_rec2.c zhetrf.c zsytrf_rook.c \ + csytrf.c dgemmt.c dsytrf_rook_rec2.c stgsyl.c zhetrf_rec2.c zsytrf_rook_rec2.c \ + chegst.c csytrf_rec2.c dtgsyl.c strsyl.c zhetrf_rook.c ztgsyl.c \ + chetrf.c csytrf_rook.c dtrsyl.c spbtrf.c strsyl_rec2.c zhetrf_rook_rec2.c ztrsyl.c \ + chetrf_rec2.c csytrf_rook_rec2.c dpbtrf.c dtrsyl_rec2.c ztrsyl_rec2.c \ + chetrf_rook.c ctgsyl.c ssygst.c zgbtrf.c zpbtrf.c \ + chetrf_rook_rec2.c ctrsyl.c dsygst.c f2c.c ssytrf.c zgemmt.c \ + ctrsyl_rec2.c dsytrf.c lapack_wrappers.c ssytrf_rec2.c zsytrf.c SRCX = \ - src/cgbtrf.c src/cpbtrf.c src/ctrtri.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \ - src/cgemmt.c src/cpotrf.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \ - src/cgetrf.c src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/sgetrf.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \ - src/chegst.c src/csytrf_rec2.c src/dgetrf.c src/dtgsyl.c src/slauum.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \ - src/chetrf.c src/csytrf_rook.c src/dlauum.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \ - src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/spotrf.c src/strtri.c src/zlauum.c src/ztrsyl_rec2.c \ - src/chetrf_rook.c src/ctgsyl.c src/dpotrf.c src/dtrtri.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c src/ztrtri.c \ - src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c src/zpotrf.c \ - 
src/clauum.c src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zgetrf.c src/zsytrf.c - -OBJS1 = $(SRC1:%.c=%.$(SUFFIX)) -OBJS2 = $(SRC2:%.c=%.o) + cgbtrf.c cpbtrf.c ctrtri.c dsytrf_rec2.c sgbtrf.c ssytrf_rook.c zhegst.c zsytrf_rec2.c \ + cgemmt.c cpotrf.c dgbtrf.c dsytrf_rook.c sgemmt.c ssytrf_rook_rec2.c zhetrf.c zsytrf_rook.c \ + cgetrf.c csytrf.c dgemmt.c dsytrf_rook_rec2.c sgetrf.c stgsyl.c zhetrf_rec2.c zsytrf_rook_rec2.c \ + chegst.c csytrf_rec2.c dgetrf.c dtgsyl.c slauum.c strsyl.c zhetrf_rook.c ztgsyl.c \ + chetrf.c csytrf_rook.c dlauum.c dtrsyl.c spbtrf.c strsyl_rec2.c zhetrf_rook_rec2.c ztrsyl.c \ + chetrf_rec2.c csytrf_rook_rec2.c dpbtrf.c dtrsyl_rec2.c spotrf.c strtri.c zlauum.c ztrsyl_rec2.c \ + chetrf_rook.c ctgsyl.c dpotrf.c dtrtri.c ssygst.c zgbtrf.c zpbtrf.c ztrtri.c \ + chetrf_rook_rec2.c ctrsyl.c dsygst.c f2c.c ssytrf.c zgemmt.c zpotrf.c \ + clauum.c ctrsyl_rec2.c dsytrf.c lapack_wrappers.c ssytrf_rec2.c zgetrf.c zsytrf.c + + +OBJS1 = $(SRC1:%.c=src/$(RELAPREFIX)%.$(SUFFIX)) +OBJS2 = $(SRC2:%.c=src/$(RELAPREFIX)%.o) OBJS = $(OBJS1) $(OBJS2) TEST_SUITS = \ - slauum dlauum clauum zlauum \ - spotrf dpotrf cpotrf zpotrf \ - spbtrf dpbtrf cpbtrf zpbtrf \ - ssygst dsygst chegst zhegst \ - ssytrf dsytrf csytrf chetrf zsytrf zhetrf \ - sgetrf dgetrf cgetrf zgetrf \ - sgbtrf dgbtrf cgbtrf zgbtrf \ - strsyl dtrsyl ctrsyl ztrsyl \ - stgsyl dtgsyl ctgsyl ztgsyl \ sgemmt dgemmt cgemmt zgemmt + + # slauum dlauum clauum zlauum \ + # spotrf dpotrf cpotrf zpotrf \ + # spbtrf dpbtrf cpbtrf zpbtrf \ + # ssygst dsygst chegst zhegst \ + # ssytrf dsytrf csytrf chetrf zsytrf zhetrf \ + # sgetrf dgetrf cgetrf zgetrf \ + # sgbtrf dgbtrf cgbtrf zgbtrf \ + # strsyl dtrsyl ctrsyl ztrsyl \ + # stgsyl dtgsyl ctgsyl ztgsyl \ + TESTS = $(TEST_SUITS:%=test/%.pass) # dummies TEST_EXES = $(TEST_SUITS:%=test/%.x) @@ -63,11 +71,11 @@ libs: $(OBJS) $(AR) -r $(TOPDIR)/$(LIBNAME) $(OBJS) $(RANLIB) $(TOPDIR)/$(LIBNAME) -%.$(SUFFIX): %.c config.h - $(CC) 
$(CFLAGS) -c $< -o $@ +src/$(RELAPREFIX)%.$(SUFFIX): src/%.c relapack_config.h + $(CC) -v $(CFLAGS) -I. $(INCLALL) -c $< -o $@ -%.o: %.c config.h - $(CC) $(CFLAGS) -c $< -o $@ +src/$(RELAPREFIX)%.o: src/%.c relapack_config.h + $(CC) -v $(CFLAGS) -I. $(INCLALL) -c $< -o $@ # ReLAPACK testing diff --git a/relapack/config.h b/relapack/relapack_config.h similarity index 99% rename from relapack/config.h rename to relapack/relapack_config.h index 9d6919463e..ba428a61b0 100644 --- a/relapack/config.h +++ b/relapack/relapack_config.h @@ -45,7 +45,7 @@ // The following macros specify which routines are included in the library under // LAPACK's symbol names: 1 included, 0 not included -#define INCLUDE_ALL 1 +// #define INCLUDE_ALL 1 #define INCLUDE_XLAUUM INCLUDE_ALL #define INCLUDE_SLAUUM INCLUDE_XLAUUM @@ -115,7 +115,7 @@ #define INCLUDE_CTGSYL INCLUDE_XTGSYL #define INCLUDE_ZTGSYL INCLUDE_XTGSYL -#define INCLUDE_XGEMMT 1 +#define INCLUDE_XGEMMT INCLUDE_ALL #define INCLUDE_SGEMMT INCLUDE_XGEMMT #define INCLUDE_DGEMMT INCLUDE_XGEMMT #define INCLUDE_CGEMMT INCLUDE_XGEMMT diff --git a/relapack/src/CMakeLists.txt b/relapack/src/CMakeLists.txt index 2d861f54b2..b920894184 100644 --- a/relapack/src/CMakeLists.txt +++ b/relapack/src/CMakeLists.txt @@ -1,85 +1,86 @@ -include_directories(${PROJECT_SOURCE_DIR}) -include_directories(${PROJECT_BINARY_DIR}) - -set(RELAFILES -clauum.c -ctrsyl_rec2.c -dsytrf.c -spbtrf.c -strsyl_rec2.c -zhetrf_rook_rec2.c -ztrsyl.c -cgbtrf.c -cpbtrf.c -ctrtri.c -dsytrf_rec2.c -spotrf.c -strtri.c -zlauum.c -ztrsyl_rec2.c -cgemmt.c -cpotrf.c -dgbtrf.c -dsytrf_rook.c -lapack_wrappers.c -ssygst.c -zgbtrf.c -zpbtrf.c -ztrtri.c -cgetrf.c -csytrf.c -dgemmt.c -dsytrf_rook_rec2.c -ssytrf.c -zgemmt.c -zpotrf.c -chegst.c -csytrf_rec2.c -dgetrf.c -dtgsyl.c -ssytrf_rec2.c -zgetrf.c -zsytrf.c -chetrf.c -csytrf_rook.c -dlauum.c -dtrsyl.c -sgbtrf.c -ssytrf_rook.c -zhegst.c -zsytrf_rec2.c -chetrf_rec2.c -csytrf_rook_rec2.c -dpbtrf.c -dtrsyl_rec2.c -sgemmt.c 
-ssytrf_rook_rec2.c -zhetrf.c -zsytrf_rook.c -chetrf_rook.c -ctgsyl.c -dpotrf.c -dtrtri.c -sgetrf.c -stgsyl.c -zhetrf_rec2.c -zsytrf_rook_rec2.c -chetrf_rook_rec2.c -ctrsyl.c -dsygst.c -f2c.c -slauum.c -strsyl.c -zhetrf_rook.c -ztgsyl.c -) - - - -# add relapack folder to the sources -set(RELA_SOURCES "") -foreach (RELA_FILE ${RELAFILES}) - list(APPEND RELA_SOURCES "${PROJECT_SOURCE_DIR}/relapack/src/${RELA_FILE}") -endforeach () -add_library(relapack_src OBJECT ${RELA_SOURCES}) -set_source_files_properties(${RELA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") +include_directories(${PROJECT_SOURCE_DIR}) +include_directories(${PROJECT_BINARY_DIR}) +include_directories(${PROJECT_SOURCE_DIR}/relapack) + +set(RELAFILES +clauum.c +ctrsyl_rec2.c +dsytrf.c +spbtrf.c +strsyl_rec2.c +zhetrf_rook_rec2.c +ztrsyl.c +cgbtrf.c +cpbtrf.c +ctrtri.c +dsytrf_rec2.c +spotrf.c +strtri.c +zlauum.c +ztrsyl_rec2.c +cgemmt.c +cpotrf.c +dgbtrf.c +dsytrf_rook.c +lapack_wrappers.c +ssygst.c +zgbtrf.c +zpbtrf.c +ztrtri.c +cgetrf.c +csytrf.c +dgemmt.c +dsytrf_rook_rec2.c +ssytrf.c +zgemmt.c +zpotrf.c +chegst.c +csytrf_rec2.c +dgetrf.c +dtgsyl.c +ssytrf_rec2.c +zgetrf.c +zsytrf.c +chetrf.c +csytrf_rook.c +dlauum.c +dtrsyl.c +sgbtrf.c +ssytrf_rook.c +zhegst.c +zsytrf_rec2.c +chetrf_rec2.c +csytrf_rook_rec2.c +dpbtrf.c +dtrsyl_rec2.c +sgemmt.c +ssytrf_rook_rec2.c +zhetrf.c +zsytrf_rook.c +chetrf_rook.c +ctgsyl.c +dpotrf.c +dtrtri.c +sgetrf.c +stgsyl.c +zhetrf_rec2.c +zsytrf_rook_rec2.c +chetrf_rook_rec2.c +ctrsyl.c +dsygst.c +f2c.c +slauum.c +strsyl.c +zhetrf_rook.c +ztgsyl.c +) + + + +# add relapack folder to the sources +set(RELA_SOURCES "") +foreach (RELA_FILE ${RELAFILES}) + list(APPEND RELA_SOURCES "${PROJECT_SOURCE_DIR}/relapack/src/${RELA_FILE}") +endforeach () +add_library(relapack_src OBJECT ${RELA_SOURCES}) +set_source_files_properties(${RELA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") diff --git a/relapack/src/ctrsyl_rec2.c b/relapack/src/ctrsyl_rec2.c index 
556491c7a1..674d737097 100644 --- a/relapack/src/ctrsyl_rec2.c +++ b/relapack/src/ctrsyl_rec2.c @@ -10,7 +10,7 @@ http://www.netlib.org/f2c/libf2c.zip */ -#include "../config.h" +#include "relapack_config.h" #include "f2c.h" #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES diff --git a/relapack/src/relapack.h b/relapack/src/relapack.h index 38c5c30d09..44652a0749 100644 --- a/relapack/src/relapack.h +++ b/relapack/src/relapack.h @@ -1,7 +1,7 @@ #ifndef RELAPACK_INT_H #define RELAPACK_INT_H #include -#include "../../config.h" +#include "config.h" #if defined(OS_WINDOWS) && defined(__64BIT__) typedef long long BLASLONG; typedef unsigned long long BLASULONG; @@ -9,7 +9,7 @@ typedef unsigned long long BLASULONG; typedef long BLASLONG; typedef unsigned long BLASULONG; #endif -#include "../config.h" +#include "relapack_config.h" #include "../inc/relapack.h" diff --git a/relapack/src/ztrsyl_rec2.c b/relapack/src/ztrsyl_rec2.c index edc6ffc6bd..d07a4e8de9 100644 --- a/relapack/src/ztrsyl_rec2.c +++ b/relapack/src/ztrsyl_rec2.c @@ -10,7 +10,7 @@ http://www.netlib.org/f2c/libf2c.zip */ -#include "../config.h" +#include "relapack_config.h" #include "f2c.h" #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index a2c358cfa7..276fecae9d 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -76,9 +76,9 @@ float16to32 (bfloat16_bits f16) int main (int argc, char *argv[]) { - int m, n, k; + blasint m, n, k; int i, j, l; - int x; + blasint x; int ret = 0; int loop = 100; char transA = 'N', transB = 'N'; @@ -112,7 +112,6 @@ main (int argc, char *argv[]) &m, BB, &k, &beta, CC, &m); for (i = 0; i < n; i++) for (j = 0; j < m; j++) - for (l = 0; l < k; l++) if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) ret++; if (transA == 'N' && transB == 'N') @@ -126,7 +125,6 @@ main (int argc, char *argv[]) } for (i = 0; i < n; i++) for (j = 0; j < m; j++) - for (l = 0; l < k; l++) if (CC[i * m + j] != DD[i * m + j]) ret++; }